From: Tara L Andrews <tla@mit.edu>
Date: Wed, 6 Jun 2012 20:41:33 +0000 (+0200)
Subject: use improvements in Lingua packages throughout our lexeme tagging
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=f8862b584dcc04728d3bff48ea7c19cb9a078772;p=scpubgit%2Fstemmatology.git

use improvements in Lingua packages throughout our lexeme tagging
---

diff --git a/Makefile.PL b/Makefile.PL
index ceb652d..789d19c 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -33,7 +33,8 @@ requires( 'XML::LibXML::XPathContext' );
 requires( 'YAML::XS' );
 # For the morphology stuff
 requires( 'Lingua::TagSet::Multext' );
-requires( 'Lingua::TagSet::TreeTagger' );
+requires( 'Lingua::TagSet::TreeTagger::French' );
+requires( 'Lingua::TagSet::TreeTagger::English' );
 requires( 'Lingua::Features::Structure' );
 build_requires( 'Test::Warn' );
 # Modules needed for morphology but not trivially CPANnable
diff --git a/lib/Text/Tradition/Language/Base.pm b/lib/Text/Tradition/Language/Base.pm
index 954c30c..dab12f2 100644
--- a/lib/Text/Tradition/Language/Base.pm
+++ b/lib/Text/Tradition/Language/Base.pm
@@ -6,15 +6,12 @@ use Encode qw/ encode_utf8 decode_utf8 /;
 use Exporter 'import';
 use vars qw/ @EXPORT_OK /;
 use IPC::Run qw/ run /;
-use Lingua::TagSet::Multext;
-use Lingua::TagSet::TreeTagger;
 use Module::Load;
 use Text::Tradition::Collation::Reading::Lexeme;
 use Text::Tradition::Collation::Reading::WordForm;
 use TryCatch;
 
-@EXPORT_OK = qw/ lemmatize_treetagger reading_lookup_treetagger treetagger_struct
-	multext_struct lfs_morph_tags /;
+@EXPORT_OK = qw/ lemmatize_treetagger reading_lookup_treetagger lfs_morph_tags /;
 
 =head1 NAME
 
@@ -245,19 +242,6 @@ sub _treetag_string {
 	return $tagresult->as_text();
 }
 
-## HACK function to correct for TagSet::TreeTagger brokenness
-sub treetagger_struct {
-	my $pos = shift;
-	$pos =~ s/PREP/PRP/;
-	return Lingua::TagSet::TreeTagger->tag2structure( $pos );
-}
-
-sub multext_struct {
-	my $pos = shift;
-	# No known hacks needed
-	return Lingua::TagSet::Multext->tag2structure( $pos );
-}
-
 =head2 lfs_morph_tags
 
 Return a data structure describing the available parts of speech and their attributes
@@ -269,16 +253,25 @@ sub lfs_morph_tags {
 	load('Lingua::Features::StructureType');
 	my $tagset = { 'structures' => [], 'features' => {} };
 	foreach my $lfs ( sort { _by_structid( $a->id, $b->id ) } Lingua::Features::StructureType->types() ) {
-		my $tsstruct = { 'id' => $lfs->id, 'use_features' => [] };
+		my $tsstruct = { 'id' => $lfs->id, 'desc' => $lfs->desc, 'use_features' => [] };
 		foreach my $ftid ( Lingua::Features::StructureType->type($lfs->id)->features ) {
 			my $ftype = $lfs->feature_type( $ftid );
-			my $tfstruct = { 'id' => $ftid, 'values' => [] };
-			foreach my $fval( $ftype->values ) {
-				push( @{$tfstruct->{'values'}}, 
-					{ 'short' => $fval, 'long' => $ftype->value_name( $fval ) } );
+			if( !$ftype && $lfs->base ) {
+				$ftype = $lfs->base->feature_type( $ftid );
+			}
+			if( $ftype ) {
+				push( @{$tsstruct->{'use_features'}}, $ftid );
+				if( $ftid eq 'type' ) {
+					# Type values change according to category
+					$ftid .= " (" . $lfs->id . ")";
+				}
+				my $tfstruct = { 'id' => $ftid, 'values' => [] };
+				foreach my $fval( $ftype->values ) {
+					push( @{$tfstruct->{'values'}}, 
+						{ 'short' => $fval, 'long' => $ftype->value_name( $fval ) } );
+				}
+				$tagset->{'features'}->{$ftid} = $tfstruct;
 			}
-			push( @{$tsstruct->{'use_features'}}, $ftid );
-			$tagset->{'features'}->{$ftid} = $tfstruct;
 		}
 		push( @{$tagset->{'structures'}}, $tsstruct );
 	}
diff --git a/lib/Text/Tradition/Language/English.pm b/lib/Text/Tradition/Language/English.pm
index ea38a7f..a2dc236 100644
--- a/lib/Text/Tradition/Language/English.pm
+++ b/lib/Text/Tradition/Language/English.pm
@@ -2,6 +2,7 @@ package Text::Tradition::Language::English;
 
 use strict;
 use warnings;
+use Lingua::TagSet::TreeTagger::English;
 use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger
 	lfs_morph_tags /;
 use TryCatch;
@@ -74,15 +75,17 @@ sub morphology_tags {
 sub _parse_wordform {
 	my $tagresult = shift;
 	my( $orig, $tag, $lemma ) = split( /\t/, $tagresult );
-	my $morphobj = Lingua::TagSet::TreeTagger->tag2structure( $tag );
+	return () unless $tag =~ /\w/; # skip punct-only "tags"
+	my $morphobj = Lingua::TagSet::TreeTagger::English->tag2structure( $tag );
 	if( $morphobj ) {
-		return Text::Tradition::Collation::Reading::WordForm->new(
+		return ( Text::Tradition::Collation::Reading::WordForm->new(
 			'language' => 'English',
 			'lemma' => $lemma,
 			'morphology' => $morphobj,
-			);
+			) );
 	} else {
-		warn "No morphology found for word: $_";
+		warn "No morphology found for word: $tagresult";
+		return ();
 	}
 }
 
diff --git a/lib/Text/Tradition/Language/French.pm b/lib/Text/Tradition/Language/French.pm
index fa884ff..9dbfd01 100644
--- a/lib/Text/Tradition/Language/French.pm
+++ b/lib/Text/Tradition/Language/French.pm
@@ -2,8 +2,10 @@ package Text::Tradition::Language::French;
 
 use strict;
 use warnings;
+use Lingua::TagSet::Multext;
+use Lingua::TagSet::TreeTagger::French;
 use Module::Load qw/ load /;
-use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger treetagger_struct multext_struct lfs_morph_tags /;
+use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger lfs_morph_tags /;
 use TryCatch;
 
 =head1 NAME
@@ -46,6 +48,7 @@ SKIP: {
 	# Test the lemmatization. How many readings now have morphological info?
 	# Do the lexemes match the reading?
 	my $ambig = 0;
+	my $flemmed = 0;
 	foreach my $r ( $tf->collation->readings ) {
 		next if $r->is_meta;
 		ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
@@ -55,13 +58,16 @@ SKIP: {
 		$textstr =~ s/\s+//g;
 		is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
 		foreach my $l ( @lex ) {
+			# Check to see if Flemm actually ran
+			foreach my $wf ( $l->matching_forms ) {
+				$flemmed++ if $wf->morphology->get_feature('num');
+			}
 			next if $l->is_disambiguated;
-	# 		printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
-	# 			join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
 			$ambig++;
 		}
 	}
 	is( $ambig, 102, "Found 102 ambiguous forms as expected" );
+	ok( $flemmed > 500, "Found enough Flemm info in wordforms" );
 	
 	# Try setting the normal form of a reading and re-analyzing
 	my $mr = $tf->collation->reading('r99.2');
@@ -69,7 +75,11 @@ SKIP: {
 	is( $mr->language, 'French', "Reading has correct language setting" );
 	$mr->normal_form( "m'inspire" );
 	$mr->lemmatize;
-	is( $mr->lexemes, 2, "Got two lexemes for new m'inspire reading" );
+	my @l = $mr->lexemes;
+	is( @l, 2, "Got two lexemes for new m'inspire reading" );
+	is( $l[0]->form->to_string,
+		'"French // se|le|lui // cat@pron type@pers pers@1 num@sing case@acc|dat"',
+		"New reading has correct first lexeme" );
 }
 
 =end testing
@@ -144,10 +154,10 @@ sub _parse_wordform {
 		my( $pos, $morph ) = split( /:/, $tag );
 		my $morphobj;
 		if( $morph ) {
-			$morphobj = multext_struct( $morph );
+			$morphobj = Lingua::TagSet::Multext->tag2structure( $morph );
 		} else {
 			# Use the TreeTagger info if there is no Flemm morphology.
-			$morphobj = treetagger_struct( $pos );
+			$morphobj = Lingua::TagSet::TreeTagger::French->tag2structure( $pos );
 		}
 		if( $morphobj ) {
 			my $wf = Text::Tradition::Collation::Reading::WordForm->new(
diff --git a/lib/Text/Tradition/Language/Latin.pm b/lib/Text/Tradition/Language/Latin.pm
index 7873c54..1d278b5 100644
--- a/lib/Text/Tradition/Language/Latin.pm
+++ b/lib/Text/Tradition/Language/Latin.pm
@@ -3,8 +3,7 @@ package Text::Tradition::Language::Latin;
 use strict;
 use warnings;
 use Module::Load;
-use Text::Tradition::Language::Base qw/ lemmatize_treetagger treetagger_struct 
-	lfs_morph_tags /;
+use Text::Tradition::Language::Base qw/ lemmatize_treetagger lfs_morph_tags /;
 use TryCatch;
 
 =head1 NAME
@@ -14,10 +13,7 @@ Text::Tradition::Language::Latin - language-specific module for Latin
 =head1 DESCRIPTION
 
 Implements morphology lookup for French words in context.  This module
-depends on the Morph::Perseus module for access to PhiloLogic database data.
-It also depends on the TreeTagger software
-(L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is
-(for now) expected to be installed in $MORPHDIR/TreeTagger.
+depends on the Lingua::Morph::Perseus module for access to PhiloLogic database data.
 
 =head1 SUBROUTINES
 
@@ -30,11 +26,11 @@ Evaluates the string using the Flemm package, and returns the results.
 use Text::Tradition;
 use_ok( 'Text::Tradition::Language::Latin' );
 
-eval "use Morph::Perseus";
+eval "use Lingua::Morph::Perseus";
 my $err = $@;
 
 SKIP: {
-	skip "Package Morph::Perseus not found" if $err;
+	skip "Package Lingua::Morph::Perseus not found" if $err;
 
 	my $trad = Text::Tradition->new(
 		'language' => 'Latin',
@@ -58,7 +54,7 @@ SKIP: {
 			$ambig++;
 		}
 	}
-	is( $ambig, 7, "Found 7 ambiguous forms as expected" );
+	is( $ambig, 4, "Found 4 ambiguous forms as expected" );
 }
 
 =end testing
@@ -94,11 +90,6 @@ Return a data structure describing the available parts of speech and their attri
 =cut
 
 sub morphology_tags {
-	try {
-		load 'Morph::Perseus::Structure';
-	} catch {
-		warn "Not using Perseus Latin tags";
-	}
 	return lfs_morph_tags();
 }
 
@@ -109,11 +100,10 @@ sub morphology_tags {
 	sub _morph_connect {
 		unless( $morph ) {
 			try {
-				load 'Morph::Perseus';
-				load 'Morph::Perseus::Structure';
-				$morph = Morph::Perseus->connect( 'Latin' );
+				load 'Lingua::Morph::Perseus';
+				$morph = Lingua::Morph::Perseus->connect( 'Latin' );
 			} catch {
-				warn "Cannot do Latin word lemmatization without Morph::Perseus: @_";
+				warn "Cannot do Latin word lemmatization without Lingua::Morph::Perseus: @_";
 				return;
 			}
 		}
@@ -127,7 +117,6 @@ sub morphology_tags {
 	#  T carne -> carnis
 	#  T melius -> bonus
 	
-
 	my %excep = (
 		'absens' => 'absum',
 		'aperte' => 'apertus',
@@ -162,49 +151,33 @@ sub morphology_tags {
 	sub _perseus_lookup_tt {
 		my( $orig, $pos, $lemma ) = split( /\t/, $_[0] );
 		_morph_connect();
-		return unlesss $morph;
-		my $result = $morph->lookup( $orig );
+		return unless $morph;
 		# Discard results that don't match the lemma, unless lemma is unknown
-		my @orig = @{$result->{'objects'}};
-		my @ret;
+		my $lookupopts = {};
 		unless( $lemma eq '<unknown>' || $lemma =~ /^\W+$/ ) {
 			# TODO Perseus lemma might have a number on the end, yuck.
 			#  multiple lemmata separated with |
 			$lemma =~ s/[^\w|]//g;
 			$lemma = $excep{$lemma} if exists $excep{$lemma};
 			$lemma =~ s/j/i/g;
-			my %lems;
-			my @forms = 
-			map { $lems{$_} = 1; $lems{lc($_)} = 1 } split( /\|/, $lemma );
-			# Now match the lemmata from Treetagger to the lemmata and alt_ls 
-			# from Perseus.
-			@ret = grep { 
-				my $x = $_->lemma; 
-				$x =~ s/\d+$//;
-				my $y = $_->alt_lex || '';
-				$y =~ s/\d+$//;
-				$lems{$x} || $lems{$y};
-				} @orig;
-			warn "TreeTagger lemma $lemma matched no results from Perseus for $orig"
-				if @orig && !@ret;
+			if( $lemma ) { # if we have anything left...
+				my %lems;
+				map { $lems{$_} = 1; $lems{lc($_)} = 1 } split( /\|/, $lemma );
+				$lookupopts->{'lemma'} = [ keys %lems ];
+			}
 		}
-		@ret = @orig unless @ret;
+		$lookupopts->{'ttpos'} = $pos if $pos;
 		
+		my $result = $morph->lexicon_lookup( $orig, $lookupopts );
+		# unless( !keys( %$lookupopts ) ||  $result->{'filtered'} ) {
+		# 	warn "Filter on $pos / $lemma returned no results; using all results";
+		# }
+		my @ret = @{$result->{'objects'}};
 		my %unique_wordforms;
 		foreach my $obj ( @ret ) {
 			my $wf = _wordform_from_row( $obj );
 			$unique_wordforms{$wf->to_string} = $wf;
 		}
-		## TODO Use TreeTagger info - requires serious hacking of Lingua::TagSet
-		# Discard results that don't match the given TreeTagger POS, unless
-		# that leaves zero results
-# 		my $ttstruct = treetagger_struct( $pos );
-# 		my @ttmatch = grep { $ttstruct->is_compatible( $_->morphology ) } @wordforms;
-# 		unless( @ttmatch ) {
-# 			warn "TreeTagger POS $pos matched no results from Perseus for $orig";
-# 			@ttmatch = @wordforms;
-# 		}
-# 		return @ttmatch;
 		return values( %unique_wordforms );
 	}
 	
@@ -221,20 +194,12 @@ sub morphology_tags {
 
 sub _wordform_from_row {
 	my( $rowobj ) = @_;
-	my $mpstruct;
-	try {
-		# M::P::St will be loaded already if we got here
-		$mpstruct = Morph::Perseus::Structure->from_tag( $rowobj->code );
-	} catch {
-		warn "Could not create morphology structure from "
-			. $rowobj->code . ": $!";
-	}
 	my $lemma = $rowobj->lemma;
 	$lemma =~ s/^(\D+)\d*$/$1/;
 	my $wf = Text::Tradition::Collation::Reading::WordForm->new(
 		'language' => 'Latin',
 		'lemma' => $lemma,
-		'morphology' => $mpstruct,
+		'morphology' => $rowobj->morphology,
 		);
 	return $wf;
 }
diff --git a/t/text_tradition_language_french.t b/t/text_tradition_language_french.t
index 84e9d02..3d1b38a 100644
--- a/t/text_tradition_language_french.t
+++ b/t/text_tradition_language_french.t
@@ -27,6 +27,7 @@ SKIP: {
 	# Test the lemmatization. How many readings now have morphological info?
 	# Do the lexemes match the reading?
 	my $ambig = 0;
+	my $flemmed = 0;
 	foreach my $r ( $tf->collation->readings ) {
 		next if $r->is_meta;
 		ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
@@ -36,13 +37,16 @@ SKIP: {
 		$textstr =~ s/\s+//g;
 		is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
 		foreach my $l ( @lex ) {
+			# Check to see if Flemm actually ran
+			foreach my $wf ( $l->matching_forms ) {
+				$flemmed++ if $wf->morphology->get_feature('num');
+			}
 			next if $l->is_disambiguated;
-	# 		printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
-	# 			join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
 			$ambig++;
 		}
 	}
 	is( $ambig, 102, "Found 102 ambiguous forms as expected" );
+	ok( $flemmed > 500, "Found enough Flemm info in wordforms" );
 	
 	# Try setting the normal form of a reading and re-analyzing
 	my $mr = $tf->collation->reading('r99.2');
@@ -50,7 +54,11 @@ SKIP: {
 	is( $mr->language, 'French', "Reading has correct language setting" );
 	$mr->normal_form( "m'inspire" );
 	$mr->lemmatize;
-	is( $mr->lexemes, 2, "Got two lexemes for new m'inspire reading" );
+	my @l = $mr->lexemes;
+	is( @l, 2, "Got two lexemes for new m'inspire reading" );
+	is( $l[0]->form->to_string,
+		'"French // se|le|lui // cat@pron type@pers pers@1 num@sing case@acc|dat"',
+		"New reading has correct first lexeme" );
 }
 }
 
diff --git a/t/text_tradition_language_latin.t b/t/text_tradition_language_latin.t
index 0fe9405..4b948f6 100644
--- a/t/text_tradition_language_latin.t
+++ b/t/text_tradition_language_latin.t
@@ -11,11 +11,11 @@ $| = 1;
 use Text::Tradition;
 use_ok( 'Text::Tradition::Language::Latin' );
 
-eval "use Morph::Perseus";
+eval "use Lingua::Morph::Perseus";
 my $err = $@;
 
 SKIP: {
-	skip "Package Morph::Perseus not found" if $err;
+	skip "Package Lingua::Morph::Perseus not found" if $err;
 
 	my $trad = Text::Tradition->new(
 		'language' => 'Latin',
@@ -39,7 +39,7 @@ SKIP: {
 			$ambig++;
 		}
 	}
-	is( $ambig, 7, "Found 7 ambiguous forms as expected" );
+	is( $ambig, 4, "Found 4 ambiguous forms as expected" );
 }
 }