[scpubgit/stemmatology.git] / lib / Text / Tradition / Language / French.pm

package Text::Tradition::Language::French;

use strict;
use warnings;
use Encode qw/ encode_utf8 decode_utf8 /;
use IPC::Run qw/ run /;
use Lingua::TagSet::Multext;
use Lingua::TagSet::TreeTagger;
use Module::Load;
use Text::Tradition::Collation::Reading::Lexeme;
use Text::Tradition::Collation::Reading::WordForm;
use TryCatch;

my $MORPHDIR = '/Users/tla/Projects/morphology';

=head1 NAME

Text::Tradition::Language::French - language-specific module for French

=head1 DESCRIPTION

Implements morphology lookup for French words in context.  This module
depends on the Flemm module for French lemmatization
(L<http://www.univ-nancy2.fr/pers/namer/Outils.htm#fl3> in conjunction with
the TreeTagger software
(L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is
(for now) expected to be installed in $MORPHDIR/TreeTagger.

=head1 SUBROUTINES

=head2 lemmatize( $text )

Evaluates the string using the Flemm package, and returns the results.

=begin testing

binmode STDOUT, ':utf8';
use Text::Tradition;
use_ok( 'Text::Tradition::Language::French' );

eval "use Flemm";
my $err = $@;

SKIP: {
	skip "Package Flemm not found" if $err;
	my $tf = Text::Tradition->new(
		'input' => 'Self',
		'file' => 't/data/besoin.xml',
		'language' => 'French' );
		
	is( $tf->language, 'French', "Set language okay" );
	$tf->lemmatize();
	# Test the lemmatization. How many readings now have morphological info?
	# Do the lexemes match the reading?
	my $ambig = 0;
	foreach my $r ( $tf->collation->readings ) {
		next if $r->is_meta;
		ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
		my @lex = $r->lexemes;
		my $lexstr = join( '', map { $_->string } @lex );
		my $textstr = $r->text;
		$textstr =~ s/\s+//g;
		is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
		foreach my $l ( @lex ) {
			next if $l->is_disambiguated;
	# 		printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
	# 			join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
			$ambig++;
		}
	}
	is( $ambig, 102, "Found 102 ambiguous forms as expected" );
	
	# Try setting the normal form of a reading and re-analyzing
	my $mr = $tf->collation->reading('99,2');
	is( $mr->text, 'minspire', "Picked correct test reading" );
	is( $mr->language, 'French', "Reading has correct language setting" );
	$mr->normal_form( "m'inspire" );
	$mr->lemmatize;
	is( $mr->lexemes, 2, "Got two lexemes for new m'inspire reading" );
}

=end testing

=cut

sub lemmatize {
	my $tradition = shift;

	# Given a tradition, lemmatize it witness by witness and see what we get.
	my $workdir = File::Temp->newdir();
	my $c = $tradition->collation;
	# First, clear out all existing lexemes from the readings. Save the
	# path as long as we went to the trouble of generating it.
	my %witness_paths;
	foreach my $wit ( $tradition->witnesses ) {
		my @sigla = ( $wit->sigil );
		push( @sigla, $wit->sigil . $c->ac_label ) if $wit->is_layered;
		foreach my $sig ( @sigla ) {
			my @path = grep { !$_->is_meta } 
				$c->reading_sequence( $c->start, $c->end, $sig );
			map { $_->clear_lexemes } @path;
			$witness_paths{$sig} = \@path;
		}
	}
	
	foreach my $sig ( keys %witness_paths ) {
		# Get the text as a sequence of readings and as a string
		print STDERR "Morphologizing witness $sig\n";
		_lemmatize_sequence( undef, @{$witness_paths{$sig}} );
	}
}

sub _update_reading_lexemes {
	my( $replace, $reading, @lexemes ) = @_;
	if( $reading->has_lexemes && !$replace ) {
		# We need to merge what is in @lexemes with what we have already.
		my @oldlex = $reading->lexemes;
		my $cmp1 = join( '||', map { $_->string } @oldlex );
		my $cmp2 = join( '||', map { $_->string } @lexemes );
		if ( @oldlex == @lexemes && $cmp1 eq $cmp2 ) {
			# The lexeme strings are the same, so merge the possible
			# word forms from new to old.
			foreach my $i ( 0 .. $#lexemes ) {
				my $ol = $oldlex[$i];
				my $nl = $lexemes[$i];
				my %ofw;
				map { $ofw{$_->to_string} = 1 } $ol->matching_forms;
				foreach my $form ( $nl->matching_forms ) {
					unless( $ofw{$form->to_string} ) {
						print STDERR "Adding form " . $form->to_string . 
							" to lexeme " . $nl->string . " at $reading\n";
						$ol->add_matching_form( $form );
						$ol->is_disambiguated(0);
					}
				}
			}
		} else {
			warn "Lexeme layout for $reading changed; replacing the lot";
			$reading->clear_lexemes;
			$reading->add_lexeme( @lexemes );
		}
	} else {
		$reading->clear_lexemes if $replace;
		$reading->add_lexeme( @lexemes );
	}
}

=head2 reading_lookup( $rdg[, $rdg, ...] )

Looks up one or more readings using the Flemm package, and returns the
possible results.  This uses the same logic as L<lemmatize> above for the
entire tradition, but can also be used to (re-)analyze individual readings.

=cut

sub reading_lookup {
	return _lemmatize_sequence( 1, @_ );
}

sub _lemmatize_sequence {
	my( $replace, @path ) = @_;
	my $tagresult = _treetag_string( _text_from_path( 1, @path ) );
	if( $tagresult ) {
		# Map the tagged words onto the original readings, splitting 
		# them up into lexemes where necessary.
		# NOTE we can have multiple lexemes in a reading, but not
		# multiple readings to a lexeme.
		my @tags = split( /\n/, $tagresult );
		my @lexemes;
		my $curr_rdg = shift @path;
		my @curr_lexemes;
		my $unused_rdg_part;
		foreach my $tag ( @tags ) {
			# Get the original word
			my( $lexeme, @rest ) = split( /\t/, $tag );
			# Lemmatize the whole
			my @forms = _parse_wordform( _flemm_lookup( $tag ) );
			my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new(
				'string' => $lexeme, 'language' => 'French',
				'wordform_matchlist' => \@forms );
			# Find the next non-meta reading
			while( $curr_rdg && $curr_rdg->is_meta ) {
				$curr_rdg = shift @path;
			}
			unless( $curr_rdg ) {
				warn "Ran out of readings in sequence at $lexeme";
				last;
			}
			my $curr_rdg_text = $curr_rdg->has_normal_form 
				? $curr_rdg->normal_form : $curr_rdg->text;
			if( $unused_rdg_part &&
				$unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
				# Nth part of curr_rdg
				$unused_rdg_part = $2;
				push( @curr_lexemes, $lexobj );
			} elsif( $curr_rdg_text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
				# Flag an error if there is already an unused reading part.
				warn "Skipped over unused text $unused_rdg_part at $curr_rdg"
					if $unused_rdg_part;
				$unused_rdg_part = $2; # will be empty if the whole reading matched
				push( @curr_lexemes, $lexobj );
			} else {
				# We do not cope with the idea of a lexeme being 
				# spread across multiple readings.
				warn "Word sequence changed unexpectedly in text";
				# See if we can find a matching reading
				my @lookahead;
				my $matched;
				while( my $nr = shift @path ) {
					my $nrtext = $nr->has_normal_form ? $nr->normal_form : $nr->text;
					if( $nrtext =~ /^\Q$lexeme\E/ ) {
						$curr_rdg = $lookahead[-1] if @lookahead;
						$matched = 1;
						last;
					} else {
						push( @lookahead, $nr );
					}
				}
				# No match? Restore the state we had
				unless( $matched ) {
					unshift( @path, @lookahead );
				}
				# Trigger a move
				$unused_rdg_part = '';
			}
			
			unless( $unused_rdg_part ) {
				# Record the lexemes for the given reading.
				#print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n",
				#	join( ' ', map { $_->string } @curr_lexemes ),
				#	$curr_rdg->id, $curr_rdg->text );
				_update_reading_lexemes( $replace, $curr_rdg, @curr_lexemes );
				$curr_rdg = shift @path;
				@curr_lexemes = ();
			}
		}
	}
}

# Utility function so that we can cheat and use it when we need both the path
# and its text.
sub _text_from_path {
	my( $normalize, @path ) = @_;
	my $pathtext = '';
	my $last;
	foreach my $r ( @path ) {
		unless ( $r->join_prior || !$last || $last->join_next ) {
			$pathtext .= ' ';
		} 
		$pathtext .= ( $normalize && $r->has_normal_form ) 
			? $r->normal_form : $r->text;
		$last = $r;
	}
	return $pathtext;
}

# Utility function that actually calls the tree tagger.
sub _treetag_string {
	my( $text ) = @_;
	my $wittext = encode_utf8( $text );
	# Then see if we have TreeTagger
	my $taggercmd = "$MORPHDIR/TreeTagger/cmd/tree-tagger-french-utf8";
	unless( -f $taggercmd ) {
		warn "Cannot do French word lemmatization without TreeTagger";
		return;
	}
	# OK, we can run it then.
	my @cmd = ( $taggercmd );
	my( $tagresult, $err ); # Capture the output and error
	run( \@cmd, \$wittext, \$tagresult, \$err );
	# TODO check for error
	return decode_utf8( $tagresult );
}

# Closure and utility function for the package lemmatizer
{
	my $lemmatizer;
	
	sub _flemm_lookup {
		# First try to load Flemm
		unless( $lemmatizer ) {
			try {
				load 'Flemm';
				$lemmatizer = Flemm->new( 'Encoding' => 'utf8', 'Tagger' => 'treetagger' );
			} catch {
				warn "Cannot do French word lemmatization without Flemm: @_";
				return;
			}
		}
		return $lemmatizer->lemmatize( @_ )
	}
	
}

# Utility function to turn a Flemm result into a WordForm
sub _parse_wordform {
	my $flemmobj = shift;
	# For now just parse the string, until we make sense of the documentation.
	my @results = split( / \|\| /, $flemmobj->getResult );
	my @forms;
	foreach ( @results ) {
		my( $orig, $tag, $lemma ) = split( /\t/, $_ );
		my( $pos, $morph ) = split( /:/, $tag );
		my $morphobj;
		if( $morph ) {
			$morphobj = Lingua::TagSet::Multext->tag2structure( $morph );
		} else {
			# Use the TreeTagger info if there is no Flemm morphology.
			$morphobj = Lingua::TagSet::TreeTagger->tag2structure( $pos );
		}
		if( $morphobj ) {
			my $wf = Text::Tradition::Collation::Reading::WordForm->new(
				'language' => 'French',
				'lemma' => $lemma,
				'morphology' => $morphobj,
				);
			push( @forms, $wf );
		} else {
			warn "No morphology found for word: $_";
		}
	}
	return @forms;
}

1;

=head2 TODO

=over

=item * Handle package dependencies more gracefully

=back

=head1 LICENSE

This package is free software and is provided "as is" without express
or implied warranty.  You can redistribute it and/or modify it under
the same terms as Perl itself.

=head1 AUTHOR

Tara L Andrews E<lt>aurum@cpan.orgE<gt>
Commit	Line	Data
cca4f996	1	package Text::Tradition::Language::French;
cca4f996	2
f4b6b4d0	3	use strict;
f4b6b4d0	4	use warnings;
d3e7842a	5	use Encode qw/ encode_utf8 decode_utf8 /;
6ad2ce78	6	use IPC::Run qw/ run /;
	7	use Lingua::TagSet::Multext;
	8	use Lingua::TagSet::TreeTagger;
d3e7842a	9	use Module::Load;
d3e7842a	10	use Text::Tradition::Collation::Reading::Lexeme;
cca4f996	11	use Text::Tradition::Collation::Reading::WordForm;
d3e7842a	12	use TryCatch;
	13
	14	my $MORPHDIR = '/Users/tla/Projects/morphology';
cca4f996	15
	16	=head1 NAME
	17
6ad2ce78	18	Text::Tradition::Language::French - language-specific module for French
cca4f996	19
	20	=head1 DESCRIPTION
	21
6ad2ce78	22	Implements morphology lookup for French words in context. This module
	23	depends on the Flemm module for French lemmatization
	24	(L<http://www.univ-nancy2.fr/pers/namer/Outils.htm#fl3> in conjunction with
	25	the TreeTagger software
	26	(L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is
	27	(for now) expected to be installed in $MORPHDIR/TreeTagger.
cca4f996	28
	29	=head1 SUBROUTINES
	30
	31	=head2 lemmatize( $text )
	32
	33	Evaluates the string using the Flemm package, and returns the results.
	34
6ad2ce78	35	=begin testing
	36
	37	binmode STDOUT, ':utf8';
	38	use Text::Tradition;
	39	use_ok( 'Text::Tradition::Language::French' );
	40
	41	eval "use Flemm";
	42	my $err = $@;
	43
	44	SKIP: {
	45	skip "Package Flemm not found" if $err;
	46	my $tf = Text::Tradition->new(
	47	'input' => 'Self',
	48	'file' => 't/data/besoin.xml',
	49	'language' => 'French' );
	50
	51	is( $tf->language, 'French', "Set language okay" );
	52	$tf->lemmatize();
	53	# Test the lemmatization. How many readings now have morphological info?
	54	# Do the lexemes match the reading?
	55	my $ambig = 0;
	56	foreach my $r ( $tf->collation->readings ) {
	57	next if $r->is_meta;
	58	ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
	59	my @lex = $r->lexemes;
	60	my $lexstr = join( '', map { $_->string } @lex );
	61	my $textstr = $r->text;
	62	$textstr =~ s/\s+//g;
	63	is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
	64	foreach my $l ( @lex ) {
	65	next if $l->is_disambiguated;
	66	# printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
	67	# join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
	68	$ambig++;
	69	}
	70	}
	71	is( $ambig, 102, "Found 102 ambiguous forms as expected" );
	72
	73	# Try setting the normal form of a reading and re-analyzing
	74	my $mr = $tf->collation->reading('99,2');
	75	is( $mr->text, 'minspire', "Picked correct test reading" );
	76	is( $mr->language, 'French', "Reading has correct language setting" );
	77	$mr->normal_form( "m'inspire" );
	78	$mr->lemmatize;
	79	is( $mr->lexemes, 2, "Got two lexemes for new m'inspire reading" );
	80	}
	81
	82	=end testing
	83
cca4f996	84	=cut
	85
	86	sub lemmatize {
d3e7842a	87	my $tradition = shift;
	88
	89	# Given a tradition, lemmatize it witness by witness and see what we get.
	90	my $workdir = File::Temp->newdir();
	91	my $c = $tradition->collation;
	92	# First, clear out all existing lexemes from the readings. Save the
	93	# path as long as we went to the trouble of generating it.
	94	my %witness_paths;
	95	foreach my $wit ( $tradition->witnesses ) {
	96	my @sigla = ( $wit->sigil );
	97	push( @sigla, $wit->sigil . $c->ac_label ) if $wit->is_layered;
	98	foreach my $sig ( @sigla ) {
	99	my @path = grep { !$_->is_meta }
	100	$c->reading_sequence( $c->start, $c->end, $sig );
	101	map { $_->clear_lexemes } @path;
	102	$witness_paths{$sig} = \@path;
	103	}
	104	}
cca4f996	105
d3e7842a	106	foreach my $sig ( keys %witness_paths ) {
	107	# Get the text as a sequence of readings and as a string
	108	print STDERR "Morphologizing witness $sig\n";
6ad2ce78	109	_lemmatize_sequence( undef, @{$witness_paths{$sig}} );
d3e7842a	110	}
	111	}
	112
	113	sub _update_reading_lexemes {
6ad2ce78	114	my( $replace, $reading, @lexemes ) = @_;
6ad2ce78	115	if( $reading->has_lexemes && !$replace ) {
d3e7842a	116	# We need to merge what is in @lexemes with what we have already.
	117	my @oldlex = $reading->lexemes;
	118	my $cmp1 = join( '\|\|', map { $_->string } @oldlex );
	119	my $cmp2 = join( '\|\|', map { $_->string } @lexemes );
6ad2ce78	120	if ( @oldlex == @lexemes && $cmp1 eq $cmp2 ) {
d3e7842a	121	# The lexeme strings are the same, so merge the possible
	122	# word forms from new to old.
	123	foreach my $i ( 0 .. $#lexemes ) {
	124	my $ol = $oldlex[$i];
	125	my $nl = $lexemes[$i];
	126	my %ofw;
6ad2ce78	127	map { $ofw{$_->to_string} = 1 } $ol->matching_forms;
d3e7842a	128	foreach my $form ( $nl->matching_forms ) {
6ad2ce78	129	unless( $ofw{$form->to_string} ) {
6ad2ce78	130	print STDERR "Adding form " . $form->to_string .
d3e7842a	131	" to lexeme " . $nl->string . " at $reading\n";
	132	$ol->add_matching_form( $form );
	133	$ol->is_disambiguated(0);
	134	}
	135	}
	136	}
	137	} else {
d3e7842a	138	warn "Lexeme layout for $reading changed; replacing the lot";
	139	$reading->clear_lexemes;
	140	$reading->add_lexeme( @lexemes );
	141	}
	142	} else {
6ad2ce78	143	$reading->clear_lexemes if $replace;
d3e7842a	144	$reading->add_lexeme( @lexemes );
d3e7842a	145	}
cca4f996	146	}
cca4f996	147
6ad2ce78	148	=head2 reading_lookup( $rdg[, $rdg, ...] )
cca4f996	149
6ad2ce78	150	Looks up one or more readings using the Flemm package, and returns the
	151	possible results. This uses the same logic as L<lemmatize> above for the
	152	entire tradition, but can also be used to (re-)analyze individual readings.
cca4f996	153
	154	=cut
	155
6ad2ce78	156	sub reading_lookup {
	157	return _lemmatize_sequence( 1, @_ );
	158	}
	159
	160	sub _lemmatize_sequence {
	161	my( $replace, @path ) = @_;
6ad2ce78	162	my $tagresult = _treetag_string( _text_from_path( 1, @path ) );
	163	if( $tagresult ) {
	164	# Map the tagged words onto the original readings, splitting
	165	# them up into lexemes where necessary.
	166	# NOTE we can have multiple lexemes in a reading, but not
	167	# multiple readings to a lexeme.
	168	my @tags = split( /\n/, $tagresult );
	169	my @lexemes;
	170	my $curr_rdg = shift @path;
	171	my @curr_lexemes;
	172	my $unused_rdg_part;
	173	foreach my $tag ( @tags ) {
	174	# Get the original word
	175	my( $lexeme, @rest ) = split( /\t/, $tag );
	176	# Lemmatize the whole
	177	my @forms = _parse_wordform( _flemm_lookup( $tag ) );
	178	my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new(
	179	'string' => $lexeme, 'language' => 'French',
	180	'wordform_matchlist' => \@forms );
	181	# Find the next non-meta reading
	182	while( $curr_rdg && $curr_rdg->is_meta ) {
	183	$curr_rdg = shift @path;
	184	}
	185	unless( $curr_rdg ) {
f4b6b4d0	186	warn "Ran out of readings in sequence at $lexeme";
6ad2ce78	187	last;
	188	}
	189	my $curr_rdg_text = $curr_rdg->has_normal_form
	190	? $curr_rdg->normal_form : $curr_rdg->text;
	191	if( $unused_rdg_part &&
	192	$unused_rdg_part =~ /^\Q$lexeme\E(\s)(.)$/ ) {
	193	# Nth part of curr_rdg
	194	$unused_rdg_part = $2;
	195	push( @curr_lexemes, $lexobj );
	196	} elsif( $curr_rdg_text =~ /^\Q$lexeme\E(\s)(.)$/ ) {
	197	# Flag an error if there is already an unused reading part.
	198	warn "Skipped over unused text $unused_rdg_part at $curr_rdg"
	199	if $unused_rdg_part;
	200	$unused_rdg_part = $2; # will be empty if the whole reading matched
	201	push( @curr_lexemes, $lexobj );
	202	} else {
	203	# We do not cope with the idea of a lexeme being
	204	# spread across multiple readings.
	205	warn "Word sequence changed unexpectedly in text";
	206	# See if we can find a matching reading
	207	my @lookahead;
	208	my $matched;
	209	while( my $nr = shift @path ) {
	210	my $nrtext = $nr->has_normal_form ? $nr->normal_form : $nr->text;
	211	if( $nrtext =~ /^\Q$lexeme\E/ ) {
	212	$curr_rdg = $lookahead[-1] if @lookahead;
	213	$matched = 1;
	214	last;
	215	} else {
	216	push( @lookahead, $nr );
	217	}
	218	}
	219	# No match? Restore the state we had
	220	unless( $matched ) {
	221	unshift( @path, @lookahead );
	222	}
	223	# Trigger a move
	224	$unused_rdg_part = '';
	225	}
	226
	227	unless( $unused_rdg_part ) {
	228	# Record the lexemes for the given reading.
	229	#print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n",
	230	# join( ' ', map { $_->string } @curr_lexemes ),
	231	# $curr_rdg->id, $curr_rdg->text );
	232	_update_reading_lexemes( $replace, $curr_rdg, @curr_lexemes );
	233	$curr_rdg = shift @path;
	234	@curr_lexemes = ();
	235	}
	236	}
	237	}
	238	}
	239
	240	# Utility function so that we can cheat and use it when we need both the path
	241	# and its text.
	242	sub _text_from_path {
	243	my( $normalize, @path ) = @_;
	244	my $pathtext = '';
	245	my $last;
	246	foreach my $r ( @path ) {
	247	unless ( $r->join_prior \|\| !$last \|\| $last->join_next ) {
	248	$pathtext .= ' ';
	249	}
	250	$pathtext .= ( $normalize && $r->has_normal_form )
251	? $r->normal_form : $r->text;
252	$last = $r;
d3e7842a	253	}
6ad2ce78	254	return $pathtext;
d3e7842a	255	}
	256
	257	# Utility function that actually calls the tree tagger.
	258	sub _treetag_string {
	259	my( $text ) = @_;
	260	my $wittext = encode_utf8( $text );
	261	# Then see if we have TreeTagger
	262	my $taggercmd = "$MORPHDIR/TreeTagger/cmd/tree-tagger-french-utf8";
	263	unless( -f $taggercmd ) {
	264	warn "Cannot do French word lemmatization without TreeTagger";
	265	return;
	266	}
	267	# OK, we can run it then.
	268	my @cmd = ( $taggercmd );
	269	my( $tagresult, $err ); # Capture the output and error
	270	run( \@cmd, \$wittext, \$tagresult, \$err );
	271	# TODO check for error
	272	return decode_utf8( $tagresult );
	273	}
	274
	275	# Closure and utility function for the package lemmatizer
	276	{
	277	my $lemmatizer;
	278
	279	sub _flemm_lookup {
	280	# First try to load Flemm
	281	unless( $lemmatizer ) {
	282	try {
	283	load 'Flemm';
	284	$lemmatizer = Flemm->new( 'Encoding' => 'utf8', 'Tagger' => 'treetagger' );
	285	} catch {
	286	warn "Cannot do French word lemmatization without Flemm: @_";
	287	return;
	288	}
	289	}
	290	return $lemmatizer->lemmatize( @_ )
	291	}
cca4f996	292
	293	}
	294
d3e7842a	295	# Utility function to turn a Flemm result into a WordForm
	296	sub _parse_wordform {
	297	my $flemmobj = shift;
	298	# For now just parse the string, until we make sense of the documentation.
	299	my @results = split( / \\|\\| /, $flemmobj->getResult );
	300	my @forms;
	301	foreach ( @results ) {
	302	my( $orig, $tag, $lemma ) = split( /\t/, $_ );
	303	my( $pos, $morph ) = split( /:/, $tag );
6ad2ce78	304	my $morphobj;
	305	if( $morph ) {
	306	$morphobj = Lingua::TagSet::Multext->tag2structure( $morph );
	307	} else {
	308	# Use the TreeTagger info if there is no Flemm morphology.
	309	$morphobj = Lingua::TagSet::TreeTagger->tag2structure( $pos );
	310	}
	311	if( $morphobj ) {
	312	my $wf = Text::Tradition::Collation::Reading::WordForm->new(
	313	'language' => 'French',
	314	'lemma' => $lemma,
	315	'morphology' => $morphobj,
	316	);
	317	push( @forms, $wf );
	318	} else {
	319	warn "No morphology found for word: $_";
	320	}
d3e7842a	321	}
	322	return @forms;
	323	}
	324
	325	1;
	326
6ad2ce78	327	=head2 TODO
	328
	329	=over
	330
	331	=item * Handle package dependencies more gracefully
	332
	333	=back
	334
cca4f996	335	=head1 LICENSE
	336
	337	This package is free software and is provided "as is" without express
	338	or implied warranty. You can redistribute it and/or modify it under
	339	the same terms as Perl itself.
	340
	341	=head1 AUTHOR
	342
	343	Tara L Andrews E<lt>aurum@cpan.orgE<gt>