X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2FText%2FTradition%2FLanguage%2FFrench.pm;h=9dbfd01ee0cfc288dd101fe4b8e316cb00d7559c;hb=f8862b584dcc04728d3bff48ea7c19cb9a078772;hp=745304414e799cf289bb05389b74b6706762ee15;hpb=e0f6836abec58dd0dd896a130c7587368880d255;p=scpubgit%2Fstemmatology.git

diff --git a/lib/Text/Tradition/Language/French.pm b/lib/Text/Tradition/Language/French.pm
index 7453044..9dbfd01 100644
--- a/lib/Text/Tradition/Language/French.pm
+++ b/lib/Text/Tradition/Language/French.pm
@@ -2,8 +2,10 @@ package Text::Tradition::Language::French;
 
 use strict;
 use warnings;
+use Lingua::TagSet::Multext;
+use Lingua::TagSet::TreeTagger::French;
 use Module::Load qw/ load /;
-use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger /;
+use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger lfs_morph_tags /;
 use TryCatch;
 
 =head1 NAME
@@ -46,6 +48,7 @@ SKIP: {
 	# Test the lemmatization. How many readings now have morphological info?
 	# Do the lexemes match the reading?
 	my $ambig = 0;
+	my $flemmed = 0;
 	foreach my $r ( $tf->collation->readings ) {
 		next if $r->is_meta;
 		ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
@@ -55,13 +58,16 @@ SKIP: {
 		$textstr =~ s/\s+//g;
 		is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
 		foreach my $l ( @lex ) {
+			# Check to see if Flemm actually ran
+			foreach my $wf ( $l->matching_forms ) {
+				$flemmed++ if $wf->morphology->get_feature('num');
+			}
 			next if $l->is_disambiguated;
-	# 		printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
-	# 			join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
 			$ambig++;
 		}
 	}
 	is( $ambig, 102, "Found 102 ambiguous forms as expected" );
+	ok( $flemmed > 500, "Found enough Flemm info in wordforms" );
 	
 	# Try setting the normal form of a reading and re-analyzing
 	my $mr = $tf->collation->reading('r99.2');
@@ -69,7 +75,11 @@ SKIP: {
 	is( $mr->language, 'French', "Reading has correct language setting" );
 	$mr->normal_form( "m'inspire" );
 	$mr->lemmatize;
-	is( $mr->lexemes, 2, "Got two lexemes for new m'inspire reading" );
+	my @l = $mr->lexemes;
+	is( @l, 2, "Got two lexemes for new m'inspire reading" );
+	is( $l[0]->form->to_string,
+		'"French // se|le|lui // cat@pron type@pers pers@1 num@sing case@acc|dat"',
+		"New reading has correct first lexeme" );
 }
 
 =end testing
@@ -103,6 +113,16 @@ sub reading_lookup {
 	return reading_lookup_treetagger( %opts );
 }
 
+=head2 morphology_tags
+
+Return a data structure describing the available parts of speech and their attributes.
+
+=cut
+
+sub morphology_tags {
+	return lfs_morph_tags();
+}
+
 # Closure and utility function for the package lemmatizer
 {
 	my $lemmatizer;
@@ -137,7 +157,7 @@ sub _parse_wordform {
 			$morphobj = Lingua::TagSet::Multext->tag2structure( $morph );
 		} else {
 			# Use the TreeTagger info if there is no Flemm morphology.
-			$morphobj = Lingua::TagSet::TreeTagger->tag2structure( $pos );
+			$morphobj = Lingua::TagSet::TreeTagger::French->tag2structure( $pos );
 		}
 		if( $morphobj ) {
 			my $wf = Text::Tradition::Collation::Reading::WordForm->new(