From: Tara L Andrews <tla\@mit.edu>
Date: Wed, 2 May 2012 13:28:03 +0000 (+0200)
Subject: make French morphology use Lingua objects; add tests
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=6ad2ce785dc85432a6227c7b2efdae0364959714;p=scpubgit%2Fstemmatology.git

make French morphology use Lingua objects; add tests
---

diff --git a/Makefile.PL b/Makefile.PL
index ba2ecf5..cf2a5e2 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -30,5 +30,8 @@ requires( 'TryCatch' );
 requires( 'XML::Easy::Syntax' );
 requires( 'XML::LibXML' );
 requires( 'XML::LibXML::XPathContext' );
+# For the morphology stuff
+requires( 'Lingua::TagSet::Multext' );
+requires( 'Lingua::TagSet::TreeTagger' );
 build_requires( 'Test::Warn' );
 &WriteAll;
diff --git a/lib/Text/Tradition/Collation.pm b/lib/Text/Tradition/Collation.pm
index b246c7e..cc26b0e 100644
--- a/lib/Text/Tradition/Collation.pm
+++ b/lib/Text/Tradition/Collation.pm
@@ -284,6 +284,9 @@ sub add_reading {
 	my( $self, $reading ) = @_;
 	unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
 		my %args = %$reading;
+		if( $self->tradition->has_language && !exists $args{'language'} ) {
+			$args{'language'} = $self->tradition->language;
+		}
 		$reading = Text::Tradition::Collation::Reading->new( 
 			'collation' => $self,
 			%args );
@@ -1386,21 +1389,13 @@ sub path_text {
 	$start = $self->start unless $start;
 	$end = $self->end unless $end;
 	my @path = grep { !$_->is_meta } $self->reading_sequence( $start, $end, $wit );
-	return $self->_text_from_path( @path );
-}
-
-# Utility function so that we can cheat and use it when we need both the path
-# and its text.
-sub _text_from_path {
-	my( $self, @path ) = @_;
 	my $pathtext = '';
 	my $last;
 	foreach my $r ( @path ) {
-		if( $r->join_prior || !$last || $last->join_next ) {
-			$pathtext .= $r->text;
-		} else {
-			$pathtext .= ' ' . $r->text;
-		}
+		unless ( $r->join_prior || !$last || $last->join_next ) {
+			$pathtext .= ' ';
+		} 
+		$pathtext .= $r->text;
 		$last = $r;
 	}
 	return $pathtext;
diff --git a/lib/Text/Tradition/Collation/Reading.pm b/lib/Text/Tradition/Collation/Reading.pm
index 11ec36e..4bdaecb 100644
--- a/lib/Text/Tradition/Collation/Reading.pm
+++ b/lib/Text/Tradition/Collation/Reading.pm
@@ -1,6 +1,7 @@
 package Text::Tradition::Collation::Reading;
 
 use Moose;
+use Module::Load;
 use overload '""' => \&_stringify, 'fallback' => 1;
 
 =head1 NAME
@@ -89,7 +90,7 @@ has 'text' => (
 has 'language' => (
 	is => 'ro',
 	isa => 'Str',
-	default => 'Default',
+	predicate => 'has_language',
 	);
 	
 has 'is_start' => (
@@ -277,47 +278,29 @@ sub _stringify {
 
 A few methods to try to tack on morphological information.
 
-=head2 use_lexemes
+=head2 lexemes
 
-TBD
+=head2 has_lexemes
+
+=head2 clear_lexemes
+
+=head2 add_lexeme
+
+=head2 lemmatize 
 
 =cut
 
-# sub use_lexemes {
-# 	my( $self, @lexemes ) = @_;
-# 	# The lexemes need to be the same as $self->text.
-# 	my $cmpstr = $self->has_normal_form ? lc( $self->normal_form ) : lc( $self->text );
-# 	$cmpstr =~ s/[\s-]+//g;
-# 	my $lexstr = lc( join( '', @lexemes ) );
-# 	$lexstr =~ s/[\s-]+//g;
-# 	unless( $lexstr eq $cmpstr ) {
-# 		warn "Cannot split " . $self->text . " into " . join( '.', @lexemes );
-# 		return;
-# 	}
-# 	$self->_clear_morph;
-# 	map { $self->_add_morph( { $_ => [] } ) } @lexemes;
-# }
-# 
-# sub add_morphological_tag {
-# 	my( $self, $lexeme, $opts ) = @_;
-# 	my $struct;
-# 	unless( $opts ) {
-# 		# No lexeme was passed; use reading text.
-# 		$opts = $lexeme;
-# 		$lexeme = $self->text;
-# 		$self->use_lexemes( $lexeme );
-# 	}
-# 	# Get the correct container
-# 	( $struct ) = grep { exists $_->{$lexeme} } $self->lexemes;
-# 	unless( $struct ) {
-# 		warn "No lexeme $lexeme exists in this reading";
-# 		return;
-# 	}
-# 	# Now make the morph object and add it to this lexeme.
-# 	my $morph_obj = Text::Tradition::Collation::Reading::Morphology->new( $opts );
-# 	# TODO Check for existence
-# 	push( @{$struct->{$lexeme}}, $morph_obj );
-# }
+sub lemmatize {
+	my $self = shift;
+	unless( $self->has_language ) {
+		warn "Please set a language to lemmatize a tradition";
+		return;
+	}
+	my $mod = "Text::Tradition::Language::" . $self->language;
+	load( $mod );
+	$mod->can( 'reading_lookup' )->( $self );
+
+}
 
 ## Utility methods
 
@@ -331,87 +314,4 @@ sub TO_JSON {
 no Moose;
 __PACKAGE__->meta->make_immutable;
 
-###################################################
-### Morphology objects, to be attached to readings
-###################################################
-
-package Text::Tradition::Collation::Reading::Morphology;
-
-use Moose;
-
-has 'lemma' => (
-	is => 'ro',
-	isa => 'Str',
-	required => 1,
-	);
-	
-has 'code' => (
-	is => 'ro',
-	isa => 'Str',
-	required => 1,
-	);
-	
-has 'language' => (
-	is => 'ro',
-	isa => 'Str',
-	required => 1,
-	);
-	
-## Transmute codes into comparison arrays for our various languages.
-
-around BUILDARGS => sub {
-	my $orig = shift;
-	my $class = shift;
-	my $args;
-	if( @_ == 1 && ref( $_[0] ) ) {
-		$args = shift;
-	} else {
-		$args = { @_ };
-	}
-	if( exists( $args->{'serial'} ) ) {
-		my( $lemma, $code ) = split( /!!/, delete $args->{'serial'} );
-		$args->{'lemma'} = $lemma;
-		$args->{'code'} = $code;
-	}
-	$class->$orig( $args );
-};
-
-sub serialization {
-	my $self = shift;
-	return join( '!!', $self->lemma, $self->code );
-};
-
-sub comparison_array {
-	my $self = shift;
-	if( $self->language eq 'French' ) {
-		my @array;
-		my @bits = split( /\+/, $self->code );
-		# First push the non k/v parts.
-		while( @bits && $bits[0] !~ /=/ ) {
-			push( @array, shift @bits );
-		}
-		while( @array < 2 ) {
-			push( @array, undef );
-		}
-		# Now push the k/v parts in a known order.
-		my @fields = qw/ Pers Nb Temps Genre Spec Fonc /;
-		my %props;
-		map { my( $k, $v ) = split( /=/, $_ ); $props{$k} = $v; } @bits;
-		foreach my $k ( @fields ) {
-			push( @array, $props{$k} );
-		}
-		# Give the answer.
-		return @array;
-	} elsif( $self->language eq 'English' ) {
-		# Do something as yet undetermined
-	} else {
-		# Latin or Greek or Armenian, just split the chars
-		return split( '', $self->code );
-	}
-};
-
-no Moose;
-__PACKAGE__->meta->make_immutable;
-
 1;
-
diff --git a/lib/Text/Tradition/Collation/Reading/WordForm.pm b/lib/Text/Tradition/Collation/Reading/WordForm.pm
index 8f519f9..4c81929 100644
--- a/lib/Text/Tradition/Collation/Reading/WordForm.pm
+++ b/lib/Text/Tradition/Collation/Reading/WordForm.pm
@@ -57,27 +57,22 @@ has 'lemma' => (
 	
 has 'morphology' => (
 	is => 'ro',
-	isa => 'ArrayRef',
+	isa => 'Lingua::Features::Structure',
 	required => 1,
 	);
 	
-around BUILDARGS => sub {
-	my $orig = shift;
-	my $class = shift;
-	my %args = @_ == 1 ? %{$_[0]} : @_;
-	unless( ref( $args{'morphology'} ) ) {
-		my @morph = split( '', $args{'morphology'} );
-		$args{'morphology'} = \@morph;
-	}
-	$class->$orig( %args );
-};
-
-sub _stringify {
+=head2 to_string
+
+Returns a string combination of language/lemma/morphology that can be used
+in equivalence testing.
+
+=cut
+
+sub to_string {
 	my $self = shift;
-	return sprintf( "%s//%s//%s", $self->language, $self->lemma,
-		join( '|', @{$self->morphology} ) );
+	return join( '++', $self->language, $self->lemma, $self->morphology->to_string );
 }
-
+	
 no Moose;
 __PACKAGE__->meta->make_immutable;
 
diff --git a/lib/Text/Tradition/Language/French.pm b/lib/Text/Tradition/Language/French.pm
index 8028c98..780b9de 100644
--- a/lib/Text/Tradition/Language/French.pm
+++ b/lib/Text/Tradition/Language/French.pm
@@ -1,7 +1,9 @@
 package Text::Tradition::Language::French;
 
 use Encode qw/ encode_utf8 decode_utf8 /;
-use IPC::Run qw/ run binary /;
+use IPC::Run qw/ run /;
+use Lingua::TagSet::Multext;
+use Lingua::TagSet::TreeTagger;
 use Module::Load;
 use Text::Tradition::Collation::Reading::Lexeme;
 use Text::Tradition::Collation::Reading::WordForm;
@@ -11,11 +13,16 @@ my $MORPHDIR = '/Users/tla/Projects/morphology';
 
 =head1 NAME
 
-Text::Tradition::Language::French - language-specific modules for French
+Text::Tradition::Language::French - language-specific module for French
 
 =head1 DESCRIPTION
 
-Implements morphology lookup for French words in context.
+Implements morphology lookup for French words in context.  This module
+depends on the Flemm module for French lemmatization
+(L<http://www.univ-nancy2.fr/pers/namer/Outils.htm#fl3> in conjunction with
+the TreeTagger software
+(L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is
+(for now) expected to be installed in $MORPHDIR/TreeTagger.
 
 =head1 SUBROUTINES
 
@@ -23,6 +30,55 @@ Implements morphology lookup for French words in context.
 
 Evaluates the string using the Flemm package, and returns the results.
 
+=begin testing
+
+binmode STDOUT, ':utf8';
+use Text::Tradition;
+use_ok( 'Text::Tradition::Language::French' );
+
+eval "use Flemm";
+my $err = $@;
+
+SKIP: {
+	skip "Package Flemm not found" if $err;
+	my $tf = Text::Tradition->new(
+		'input' => 'Self',
+		'file' => 't/data/besoin.xml',
+		'language' => 'French' );
+		
+	is( $tf->language, 'French', "Set language okay" );
+	$tf->lemmatize();
+	# Test the lemmatization. How many readings now have morphological info?
+	# Do the lexemes match the reading?
+	my $ambig = 0;
+	foreach my $r ( $tf->collation->readings ) {
+		next if $r->is_meta;
+		ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
+		my @lex = $r->lexemes;
+		my $lexstr = join( '', map { $_->string } @lex );
+		my $textstr = $r->text;
+		$textstr =~ s/\s+//g;
+		is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
+		foreach my $l ( @lex ) {
+			next if $l->is_disambiguated;
+	# 		printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
+	# 			join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
+			$ambig++;
+		}
+	}
+	is( $ambig, 102, "Found 102 ambiguous forms as expected" );
+	
+	# Try setting the normal form of a reading and re-analyzing
+	my $mr = $tf->collation->reading('99,2');
+	is( $mr->text, 'minspire', "Picked correct test reading" );
+	is( $mr->language, 'French', "Reading has correct language setting" );
+	$mr->normal_form( "m'inspire" );
+	$mr->lemmatize;
+	is( $mr->lexemes, 2, "Got two lexemes for new m'inspire reading" );
+}
+
+=end testing
+
 =cut
 
 sub lemmatize {
@@ -48,102 +104,28 @@ sub lemmatize {
 	foreach my $sig ( keys %witness_paths ) {
 		# Get the text as a sequence of readings and as a string
 		print STDERR "Morphologizing witness $sig\n";
-		my @path = @{$witness_paths{$sig}};
-		my $tagresult = _treetag_string( $c->_text_from_path( @path ) );
-		if( $tagresult ) {
-			# Map the tagged words onto the original readings, splitting 
-			# them up into lexemes where necessary.
-			# NOTE we can have multiple lexemes in a reading, but not
-			# multiple readings to a lexeme.
-			my @tags = split( /\n/, $tagresult );
-			my @lexemes;
-			my $curr_rdg = shift @path;
-			my @curr_lexemes;
-			my $unused_rdg_part;
-			foreach my $tag ( @tags ) {
-				# Get the original word
-				my( $lexeme, @rest ) = split( /\t/, $tag );
-				# Lemmatize the whole
-				my @forms = _parse_wordform( _flemm_lookup( $tag ) );
-				my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new(
-					'string' => $lexeme, 'language' => 'French',
-					'wordform_matchlist' => \@forms );
-				# Find the next non-meta reading
-				while( $curr_rdg->is_meta ) {
-					$curr_rdg = shift @path;
-				}
-				unless( $curr_rdg ) {
-					warn "Ran out of readings in sequence for " . $wit->sigil
-						. " at $lexeme";
-					last;
-				}
-				if( $unused_rdg_part &&
-					$unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
-					# Nth part of curr_rdg
-					$unused_rdg_part = $2;
-					push( @curr_lexemes, $lexobj );
-				} elsif( $curr_rdg->text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
-					# Flag an error if there is already an unused reading part.
-					warn "Skipped over unused text $unused_rdg_part at $curr_rdg"
-						if $unused_rdg_part;
-					$unused_rdg_part = $2; # will be empty if the whole reading matched
-					push( @curr_lexemes, $lexobj );
-				} else {
-					# We do not cope with the idea of a lexeme being 
-					# spread across multiple readings.
-					warn "Word sequence changed unexpectedly in text";
-					# See if we can find a matching reading
-					my @lookahead;
-					my $matched;
-					while( my $nr = shift @path ) {
-						if( $nr->text =~ /^\Q$lexeme\E/ ) {
-							$curr_rdg = $lookahead[-1] if @lookahead;
-							$matched = 1;
-							last;
-						} else {
-							push( @lookahead, $nr );
-						}
-					}
-					# No match? Restore the state we had
-					unless( $matched ) {
-						unshift( @path, @lookahead );
-					}
-					# Trigger a move
-					$unused_rdg_part = '';
-				}
-				
-				unless( $unused_rdg_part ) {
-					# Record the lexemes for the given reading.
-					#print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n",
-					#	join( ' ', map { $_->string } @curr_lexemes ),
-					#	$curr_rdg->id, $curr_rdg->text );
-					_update_reading_lexemes( $curr_rdg, @curr_lexemes );
-					$curr_rdg = shift @path;
-					@curr_lexemes = ();
-				}
-			}
-		}
+		_lemmatize_sequence( undef, @{$witness_paths{$sig}} );
 	}
 }
 
 sub _update_reading_lexemes {
-	my( $reading, @lexemes ) = @_;
-	if( $reading->has_lexemes ) {
+	my( $replace, $reading, @lexemes ) = @_;
+	if( $reading->has_lexemes && !$replace ) {
 		# We need to merge what is in @lexemes with what we have already.
 		my @oldlex = $reading->lexemes;
 		my $cmp1 = join( '||', map { $_->string } @oldlex );
 		my $cmp2 = join( '||', map { $_->string } @lexemes );
-		if ( @oldlex == @lexemes && $cmp1 == $cmp2 ) {
+		if ( @oldlex == @lexemes && $cmp1 eq $cmp2 ) {
 			# The lexeme strings are the same, so merge the possible
 			# word forms from new to old.
 			foreach my $i ( 0 .. $#lexemes ) {
 				my $ol = $oldlex[$i];
 				my $nl = $lexemes[$i];
 				my %ofw;
-				map { $ofw{$_->_stringify} = 1 } $ol->matching_forms;
+				map { $ofw{$_->to_string} = 1 } $ol->matching_forms;
 				foreach my $form ( $nl->matching_forms ) {
-					unless( $ofw{$form->_stringify} ) {
-						print STDERR "Adding form " . $form->_stringify . 
+					unless( $ofw{$form->to_string} ) {
+						print STDERR "Adding form " . $form->to_string . 
 							" to lexeme " . $nl->string . " at $reading\n";
 						$ol->add_matching_form( $form );
 						$ol->is_disambiguated(0);
@@ -151,35 +133,125 @@ sub _update_reading_lexemes {
 				}
 			}
 		} else {
-			$DB::single = 1;
 			warn "Lexeme layout for $reading changed; replacing the lot";
 			$reading->clear_lexemes;
 			$reading->add_lexeme( @lexemes );
 		}
 	} else {
+		$reading->clear_lexemes if $replace;
 		$reading->add_lexeme( @lexemes );
 	}
 }
 
-=head2 word_lookup( $word )
+=head2 reading_lookup( $rdg[, $rdg, ...] )
 
-Looks up a word using the Flemm package, and returns the possible results.
-It is better to use L<lemmatize> for context sensitivity.
+Looks up one or more readings using the Flemm package, and returns the
+possible results.  This uses the same logic as L<lemmatize> above for the
+entire tradition, but can also be used to (re-)analyze individual readings.
 
 =cut
 
-sub word_lookup {
-	my $word = shift;
-	my $tagresult = _treetag_string( $word );
-	my $lemmatizer;
-	try {
-		load 'Flemm';
-		$lemmatizer = Flemm->new( 'Encoding' => 'utf8', 'Tagger' => 'treetagger' );
-	} catch {
-		warn "Cannot do French word lemmatization without Flemm: @_";
-		return;
+sub reading_lookup {
+	return _lemmatize_sequence( 1, @_ );
+}
+
+sub _lemmatize_sequence {
+	my( $replace, @path ) = @_;
+	$DB::single = 1 if $replace;
+	my $tagresult = _treetag_string( _text_from_path( 1, @path ) );
+	if( $tagresult ) {
+		# Map the tagged words onto the original readings, splitting 
+		# them up into lexemes where necessary.
+		# NOTE we can have multiple lexemes in a reading, but not
+		# multiple readings to a lexeme.
+		my @tags = split( /\n/, $tagresult );
+		my @lexemes;
+		my $curr_rdg = shift @path;
+		my @curr_lexemes;
+		my $unused_rdg_part;
+		foreach my $tag ( @tags ) {
+			# Get the original word
+			my( $lexeme, @rest ) = split( /\t/, $tag );
+			# Lemmatize the whole
+			my @forms = _parse_wordform( _flemm_lookup( $tag ) );
+			my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new(
+				'string' => $lexeme, 'language' => 'French',
+				'wordform_matchlist' => \@forms );
+			# Find the next non-meta reading
+			while( $curr_rdg && $curr_rdg->is_meta ) {
+				$curr_rdg = shift @path;
+			}
+			unless( $curr_rdg ) {
+				warn "Ran out of readings in sequence for " . $wit->sigil
+					. " at $lexeme";
+				last;
+			}
+			my $curr_rdg_text = $curr_rdg->has_normal_form 
+				? $curr_rdg->normal_form : $curr_rdg->text;
+			if( $unused_rdg_part &&
+				$unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
+				# Nth part of curr_rdg
+				$unused_rdg_part = $2;
+				push( @curr_lexemes, $lexobj );
+			} elsif( $curr_rdg_text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
+				# Flag an error if there is already an unused reading part.
+				warn "Skipped over unused text $unused_rdg_part at $curr_rdg"
+					if $unused_rdg_part;
+				$unused_rdg_part = $2; # will be empty if the whole reading matched
+				push( @curr_lexemes, $lexobj );
+			} else {
+				# We do not cope with the idea of a lexeme being 
+				# spread across multiple readings.
+				warn "Word sequence changed unexpectedly in text";
+				# See if we can find a matching reading
+				my @lookahead;
+				my $matched;
+				while( my $nr = shift @path ) {
+					my $nrtext = $nr->has_normal_form ? $nr->normal_form : $nr->text;
+					if( $nrtext =~ /^\Q$lexeme\E/ ) {
+						$curr_rdg = $lookahead[-1] if @lookahead;
+						$matched = 1;
+						last;
+					} else {
+						push( @lookahead, $nr );
+					}
+				}
+				# No match? Restore the state we had
+				unless( $matched ) {
+					unshift( @path, @lookahead );
+				}
+				# Trigger a move
+				$unused_rdg_part = '';
+			}
+			
+			unless( $unused_rdg_part ) {
+				# Record the lexemes for the given reading.
+				#print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n",
+				#	join( ' ', map { $_->string } @curr_lexemes ),
+				#	$curr_rdg->id, $curr_rdg->text );
+				_update_reading_lexemes( $replace, $curr_rdg, @curr_lexemes );
+				$curr_rdg = shift @path;
+				@curr_lexemes = ();
+			}
+		}
+	}
+}
+
+# Utility function so that we can cheat and use it when we need both the path
+# and its text.
+sub _text_from_path {
+	my( $normalize, @path ) = @_;
+	my $pathtext = '';
+	my $last;
+	foreach my $r ( @path ) {
+		unless ( $r->join_prior || !$last || $last->join_next ) {
+			$pathtext .= ' ';
+		} 
+		$pathtext .= ( $normalize && $r->has_normal_form ) 
+			? $r->normal_form : $r->text;
+		$last = $r;
 	}
-	return _parse_wordform( _flemm_lookup( $tagresult ) );
+	return $pathtext;
 }
 
 # Utility function that actually calls the tree tagger.
@@ -229,18 +301,37 @@ sub _parse_wordform {
 	foreach ( @results ) {
 		my( $orig, $tag, $lemma ) = split( /\t/, $_ );
 		my( $pos, $morph ) = split( /:/, $tag );
-		my $wf = Text::Tradition::Collation::Reading::WordForm->new(
-			'language' => 'French',
-			'lemma' => $lemma,
-			'morphology' => [ split( //, $morph ) ],
-			);
-		push( @forms, $wf );
+		my $morphobj;
+		if( $morph ) {
+			$morphobj = Lingua::TagSet::Multext->tag2structure( $morph );
+		} else {
+			# Use the TreeTagger info if there is no Flemm morphology.
+			$morphobj = Lingua::TagSet::TreeTagger->tag2structure( $pos );
+		}
+		if( $morphobj ) {
+			my $wf = Text::Tradition::Collation::Reading::WordForm->new(
+				'language' => 'French',
+				'lemma' => $lemma,
+				'morphology' => $morphobj,
+				);
+			push( @forms, $wf );
+		} else {
+			warn "No morphology found for word: $_";
+		}
 	}
 	return @forms;
 }
 
 1;
 
+=head2 TODO
+
+=over
+
+=item * Handle package dependencies more gracefully
+
+=back
+
 =head1 LICENSE
 
 This package is free software and is provided "as is" without express
diff --git a/t/text_tradition_language_french.t b/t/text_tradition_language_french.t
new file mode 100644
index 0000000..01b206b
--- /dev/null
+++ b/t/text_tradition_language_french.t
@@ -0,0 +1,60 @@
+#!/usr/bin/perl -w
+
+use strict;
+use Test::More 'no_plan';
+$| = 1;
+
+
+
+# =begin testing
+{
+binmode STDOUT, ':utf8';
+use Text::Tradition;
+use_ok( 'Text::Tradition::Language::French' );
+
+eval "use Flemm";
+my $err = $@;
+
+SKIP: {
+	skip "Package Flemm not found" if $err;
+	my $tf = Text::Tradition->new(
+		'input' => 'Self',
+		'file' => 't/data/besoin.xml',
+		'language' => 'French' );
+		
+	is( $tf->language, 'French', "Set language okay" );
+	$tf->lemmatize();
+	# Test the lemmatization. How many readings now have morphological info?
+	# Do the lexemes match the reading?
+	my $ambig = 0;
+	foreach my $r ( $tf->collation->readings ) {
+		next if $r->is_meta;
+		ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
+		my @lex = $r->lexemes;
+		my $lexstr = join( '', map { $_->string } @lex );
+		my $textstr = $r->text;
+		$textstr =~ s/\s+//g;
+		is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
+		foreach my $l ( @lex ) {
+			next if $l->is_disambiguated;
+	# 		printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
+	# 			join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
+			$ambig++;
+		}
+	}
+	is( $ambig, 102, "Found 102 ambiguous forms as expected" );
+	
+	# Try setting the normal form of a reading and re-analyzing
+	my $mr = $tf->collation->reading('99,2');
+	is( $mr->text, 'minspire', "Picked correct test reading" );
+	is( $mr->language, 'French', "Reading has correct language setting" );
+	$mr->normal_form( "m'inspire" );
+	$mr->lemmatize;
+	is( $mr->lexemes, 2, "Got two lexemes for new m'inspire reading" );
+}
+}
+
+
+
+
+1;