From: Tara L Andrews Date: Wed, 6 Jun 2012 20:41:33 +0000 (+0200) Subject: use improvements in Lingua packages throughout our lexeme tagging X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=f8862b584dcc04728d3bff48ea7c19cb9a078772;p=scpubgit%2Fstemmatology.git use improvements in Lingua packages throughout our lexeme tagging --- diff --git a/Makefile.PL b/Makefile.PL index ceb652d..789d19c 100644 --- a/Makefile.PL +++ b/Makefile.PL @@ -33,7 +33,8 @@ requires( 'XML::LibXML::XPathContext' ); requires( 'YAML::XS' ); # For the morphology stuff requires( 'Lingua::TagSet::Multext' ); -requires( 'Lingua::TagSet::TreeTagger' ); +requires( 'Lingua::TagSet::TreeTagger::French' ); +requires( 'Lingua::TagSet::TreeTagger::English' ); requires( 'Lingua::Features::Structure' ); build_requires( 'Test::Warn' ); # Modules needed for morphology but not trivially CPANnable diff --git a/lib/Text/Tradition/Language/Base.pm b/lib/Text/Tradition/Language/Base.pm index 954c30c..dab12f2 100644 --- a/lib/Text/Tradition/Language/Base.pm +++ b/lib/Text/Tradition/Language/Base.pm @@ -6,15 +6,12 @@ use Encode qw/ encode_utf8 decode_utf8 /; use Exporter 'import'; use vars qw/ @EXPORT_OK /; use IPC::Run qw/ run /; -use Lingua::TagSet::Multext; -use Lingua::TagSet::TreeTagger; use Module::Load; use Text::Tradition::Collation::Reading::Lexeme; use Text::Tradition::Collation::Reading::WordForm; use TryCatch; -@EXPORT_OK = qw/ lemmatize_treetagger reading_lookup_treetagger treetagger_struct - multext_struct lfs_morph_tags /; +@EXPORT_OK = qw/ lemmatize_treetagger reading_lookup_treetagger lfs_morph_tags /; =head1 NAME @@ -245,19 +242,6 @@ sub _treetag_string { return $tagresult->as_text(); } -## HACK function to correct for TagSet::TreeTagger brokenness -sub treetagger_struct { - my $pos = shift; - $pos =~ s/PREP/PRP/; - return Lingua::TagSet::TreeTagger->tag2structure( $pos ); -} - -sub multext_struct { - my $pos = shift; - # No known hacks needed - return Lingua::TagSet::Multext->tag2structure( $pos ); -} - =head2 lfs_morph_tags Return a data structure describing the available parts of speech and their attributes @@ -269,16 +253,25 @@ sub lfs_morph_tags { load('Lingua::Features::StructureType'); my $tagset = { 'structures' => [], 'features' => {} }; foreach my $lfs ( sort { _by_structid( $a->id, $b->id ) } Lingua::Features::StructureType->types() ) { - my $tsstruct = { 'id' => $lfs->id, 'use_features' => [] }; + my $tsstruct = { 'id' => $lfs->id, 'desc' => $lfs->desc, 'use_features' => [] }; foreach my $ftid ( Lingua::Features::StructureType->type($lfs->id)->features ) { my $ftype = $lfs->feature_type( $ftid ); - my $tfstruct = { 'id' => $ftid, 'values' => [] }; - foreach my $fval( $ftype->values ) { - push( @{$tfstruct->{'values'}}, - { 'short' => $fval, 'long' => $ftype->value_name( $fval ) } ); + if( !$ftype && $lfs->base ) { + $ftype = $lfs->base->feature_type( $ftid ); + } + if( $ftype ) { + push( @{$tsstruct->{'use_features'}}, $ftid ); + if( $ftid eq 'type' ) { + # Type values change according to category + $ftid .= " (" . $lfs->id . ")"; + } + my $tfstruct = { 'id' => $ftid, 'values' => [] }; + foreach my $fval( $ftype->values ) { + push( @{$tfstruct->{'values'}}, + { 'short' => $fval, 'long' => $ftype->value_name( $fval ) } ); + } + $tagset->{'features'}->{$ftid} = $tfstruct; } - push( @{$tsstruct->{'use_features'}}, $ftid ); - $tagset->{'features'}->{$ftid} = $tfstruct; } push( @{$tagset->{'structures'}}, $tsstruct ); } diff --git a/lib/Text/Tradition/Language/English.pm b/lib/Text/Tradition/Language/English.pm index ea38a7f..a2dc236 100644 --- a/lib/Text/Tradition/Language/English.pm +++ b/lib/Text/Tradition/Language/English.pm @@ -2,6 +2,7 @@ package Text::Tradition::Language::English; use strict; use warnings; +use Lingua::TagSet::TreeTagger::English; use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger lfs_morph_tags /; use TryCatch; @@ -74,15 +75,17 @@ sub morphology_tags { sub _parse_wordform { my $tagresult = shift; my( $orig, $tag, $lemma ) = split( /\t/, $tagresult ); - my $morphobj = Lingua::TagSet::TreeTagger->tag2structure( $tag ); + return () unless $tag =~ /\w/; # skip punct-only "tags" + my $morphobj = Lingua::TagSet::TreeTagger::English->tag2structure( $tag ); if( $morphobj ) { - return Text::Tradition::Collation::Reading::WordForm->new( + return ( Text::Tradition::Collation::Reading::WordForm->new( 'language' => 'English', 'lemma' => $lemma, 'morphology' => $morphobj, - ); + ) ); } else { - warn "No morphology found for word: $_"; + warn "No morphology found for word: $tagresult"; + return (); } } diff --git a/lib/Text/Tradition/Language/French.pm b/lib/Text/Tradition/Language/French.pm index fa884ff..9dbfd01 100644 --- a/lib/Text/Tradition/Language/French.pm +++ b/lib/Text/Tradition/Language/French.pm @@ -2,8 +2,10 @@ package Text::Tradition::Language::French; use strict; use warnings; +use Lingua::TagSet::Multext; +use Lingua::TagSet::TreeTagger::French; use Module::Load qw/ load /; -use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger treetagger_struct multext_struct lfs_morph_tags /; +use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger lfs_morph_tags /; use TryCatch; =head1 NAME @@ -46,6 +48,7 @@ SKIP: { # Test the lemmatization. How many readings now have morphological info? # Do the lexemes match the reading? my $ambig = 0; + my $flemmed = 0; foreach my $r ( $tf->collation->readings ) { next if $r->is_meta; ok( $r->has_lexemes, "Reading $r has one or more lexemes" ); @@ -55,13 +58,16 @@ SKIP: { $textstr =~ s/\s+//g; is( $textstr, $lexstr, "Lexemes for reading $r match the reading" ); foreach my $l ( @lex ) { + # Check to see if Flemm actually ran + foreach my $wf ( $l->matching_forms ) { + $flemmed++ if $wf->morphology->get_feature('num'); + } next if $l->is_disambiguated; - # printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id, - # join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) ); $ambig++; } } is( $ambig, 102, "Found 102 ambiguous forms as expected" ); + ok( $flemmed > 500, "Found enough Flemm info in wordforms" ); # Try setting the normal form of a reading and re-analyzing my $mr = $tf->collation->reading('r99.2'); @@ -69,7 +75,11 @@ SKIP: { is( $mr->language, 'French', "Reading has correct language setting" ); $mr->normal_form( "m'inspire" ); $mr->lemmatize; - is( $mr->lexemes, 2, "Got two lexemes for new m'inspire reading" ); + my @l = $mr->lexemes; + is( @l, 2, "Got two lexemes for new m'inspire reading" ); + is( $l[0]->form->to_string, + '"French // se|le|lui // cat@pron type@pers pers@1 num@sing case@acc|dat"', + "New reading has correct first lexeme" ); } =end testing @@ -144,10 +154,10 @@ sub _parse_wordform { my( $pos, $morph ) = split( /:/, $tag ); my $morphobj; if( $morph ) { - $morphobj = multext_struct( $morph ); + $morphobj = Lingua::TagSet::Multext->tag2structure( $morph ); } else { # Use the TreeTagger info if there is no Flemm morphology. - $morphobj = treetagger_struct( $pos ); + $morphobj = Lingua::TagSet::TreeTagger::French->tag2structure( $pos ); } if( $morphobj ) { my $wf = Text::Tradition::Collation::Reading::WordForm->new( diff --git a/lib/Text/Tradition/Language/Latin.pm b/lib/Text/Tradition/Language/Latin.pm index 7873c54..1d278b5 100644 --- a/lib/Text/Tradition/Language/Latin.pm +++ b/lib/Text/Tradition/Language/Latin.pm @@ -3,8 +3,7 @@ package Text::Tradition::Language::Latin; use strict; use warnings; use Module::Load; -use Text::Tradition::Language::Base qw/ lemmatize_treetagger treetagger_struct - lfs_morph_tags /; +use Text::Tradition::Language::Base qw/ lemmatize_treetagger lfs_morph_tags /; use TryCatch; =head1 NAME @@ -14,10 +13,7 @@ Text::Tradition::Language::Latin - language-specific module for Latin =head1 DESCRIPTION Implements morphology lookup for French words in context. This module -depends on the Morph::Perseus module for access to PhiloLogic database data. -It also depends on the TreeTagger software -(L), which is -(for now) expected to be installed in $MORPHDIR/TreeTagger. +depends on the Lingua::Morph::Perseus module for access to PhiloLogic database data. =head1 SUBROUTINES @@ -30,11 +26,11 @@ Evaluates the string using the Flemm package, and returns the results. use Text::Tradition; use_ok( 'Text::Tradition::Language::Latin' ); -eval "use Morph::Perseus"; +eval "use Lingua::Morph::Perseus"; my $err = $@; SKIP: { - skip "Package Morph::Perseus not found" if $err; + skip "Package Lingua::Morph::Perseus not found" if $err; my $trad = Text::Tradition->new( 'language' => 'Latin', @@ -58,7 +54,7 @@ SKIP: { $ambig++; } } - is( $ambig, 7, "Found 7 ambiguous forms as expected" ); + is( $ambig, 4, "Found 4 ambiguous forms as expected" ); } =end testing @@ -94,11 +90,6 @@ Return a data structure describing the available parts of speech and their attri =cut sub morphology_tags { - try { - load 'Morph::Perseus::Structure'; - } catch { - warn "Not using Perseus Latin tags"; - } return lfs_morph_tags(); } @@ -109,11 +100,10 @@ sub morphology_tags { sub _morph_connect { unless( $morph ) { try { - load 'Morph::Perseus'; - load 'Morph::Perseus::Structure'; - $morph = Morph::Perseus->connect( 'Latin' ); + load 'Lingua::Morph::Perseus'; + $morph = Lingua::Morph::Perseus->connect( 'Latin' ); } catch { - warn "Cannot do Latin word lemmatization without Morph::Perseus: @_"; + warn "Cannot do Latin word lemmatization without Lingua::Morph::Perseus: @_"; return; } } @@ -127,7 +117,6 @@ sub morphology_tags { # T carne -> carnis # T melius -> bonus - my %excep = ( 'absens' => 'absum', 'aperte' => 'apertus', @@ -162,49 +151,33 @@ sub morphology_tags { sub _perseus_lookup_tt { my( $orig, $pos, $lemma ) = split( /\t/, $_[0] ); _morph_connect(); - return unlesss $morph; - my $result = $morph->lookup( $orig ); + return unless $morph; # Discard results that don't match the lemma, unless lemma is unknown - my @orig = @{$result->{'objects'}}; - my @ret; + my $lookupopts = {}; unless( $lemma eq '' || $lemma =~ /^\W+$/ ) { # TODO Perseus lemma might have a number on the end, yuck. # multiple lemmata separated with | $lemma =~ s/[^\w|]//g; $lemma = $excep{$lemma} if exists $excep{$lemma}; $lemma =~ s/j/i/g; - my %lems; - my @forms = - map { $lems{$_} = 1; $lems{lc($_)} = 1 } split( /\|/, $lemma ); - # Now match the lemmata from Treetagger to the lemmata and alt_ls - # from Perseus. - @ret = grep { - my $x = $_->lemma; - $x =~ s/\d+$//; - my $y = $_->alt_lex || ''; - $y =~ s/\d+$//; - $lems{$x} || $lems{$y}; - } @orig; - warn "TreeTagger lemma $lemma matched no results from Perseus for $orig" - if @orig && !@ret; + if( $lemma ) { # if we have anything left... + my %lems; + map { $lems{$_} = 1; $lems{lc($_)} = 1 } split( /\|/, $lemma ); + $lookupopts->{'lemma'} = [ keys %lems ]; + } } - @ret = @orig unless @ret; + $lookupopts->{'ttpos'} = $pos if $pos; + my $result = $morph->lexicon_lookup( $orig, $lookupopts ); + # unless( !keys( %$lookupopts ) || $result->{'filtered'} ) { + # warn "Filter on $pos / $lemma returned no results; using all results"; + # } + my @ret = @{$result->{'objects'}}; my %unique_wordforms; foreach my $obj ( @ret ) { my $wf = _wordform_from_row( $obj ); $unique_wordforms{$wf->to_string} = $wf; } - ## TODO Use TreeTagger info - requires serious hacking of Lingua::TagSet - # Discard results that don't match the given TreeTagger POS, unless - # that leaves zero results -# my $ttstruct = treetagger_struct( $pos ); -# my @ttmatch = grep { $ttstruct->is_compatible( $_->morphology ) } @wordforms; -# unless( @ttmatch ) { -# warn "TreeTagger POS $pos matched no results from Perseus for $orig"; -# @ttmatch = @wordforms; -# } -# return @ttmatch; return values( %unique_wordforms ); } @@ -221,20 +194,12 @@ sub morphology_tags { sub _wordform_from_row { my( $rowobj ) = @_; - my $mpstruct; - try { - # M::P::St will be loaded already if we got here - $mpstruct = Morph::Perseus::Structure->from_tag( $rowobj->code ); - } catch { - warn "Could not create morphology structure from " - . $rowobj->code . ": $!"; - } my $lemma = $rowobj->lemma; $lemma =~ s/^(\D+)\d*$/$1/; my $wf = Text::Tradition::Collation::Reading::WordForm->new( 'language' => 'Latin', 'lemma' => $lemma, - 'morphology' => $mpstruct, + 'morphology' => $rowobj->morphology, ); return $wf; } diff --git a/t/text_tradition_language_french.t b/t/text_tradition_language_french.t index 84e9d02..3d1b38a 100644 --- a/t/text_tradition_language_french.t +++ b/t/text_tradition_language_french.t @@ -27,6 +27,7 @@ SKIP: { # Test the lemmatization. How many readings now have morphological info? # Do the lexemes match the reading? my $ambig = 0; + my $flemmed = 0; foreach my $r ( $tf->collation->readings ) { next if $r->is_meta; ok( $r->has_lexemes, "Reading $r has one or more lexemes" ); @@ -36,13 +37,16 @@ SKIP: { $textstr =~ s/\s+//g; is( $textstr, $lexstr, "Lexemes for reading $r match the reading" ); foreach my $l ( @lex ) { + # Check to see if Flemm actually ran + foreach my $wf ( $l->matching_forms ) { + $flemmed++ if $wf->morphology->get_feature('num'); + } next if $l->is_disambiguated; - # printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id, - # join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) ); $ambig++; } } is( $ambig, 102, "Found 102 ambiguous forms as expected" ); + ok( $flemmed > 500, "Found enough Flemm info in wordforms" ); # Try setting the normal form of a reading and re-analyzing my $mr = $tf->collation->reading('r99.2'); @@ -50,7 +54,11 @@ SKIP: { is( $mr->language, 'French', "Reading has correct language setting" ); $mr->normal_form( "m'inspire" ); $mr->lemmatize; - is( $mr->lexemes, 2, "Got two lexemes for new m'inspire reading" ); + my @l = $mr->lexemes; + is( @l, 2, "Got two lexemes for new m'inspire reading" ); + is( $l[0]->form->to_string, + '"French // se|le|lui // cat@pron type@pers pers@1 num@sing case@acc|dat"', + "New reading has correct first lexeme" ); } } diff --git a/t/text_tradition_language_latin.t b/t/text_tradition_language_latin.t index 0fe9405..4b948f6 100644 --- a/t/text_tradition_language_latin.t +++ b/t/text_tradition_language_latin.t @@ -11,11 +11,11 @@ $| = 1; use Text::Tradition; use_ok( 'Text::Tradition::Language::Latin' ); -eval "use Morph::Perseus"; +eval "use Lingua::Morph::Perseus"; my $err = $@; SKIP: { - skip "Package Morph::Perseus not found" if $err; + skip "Package Lingua::Morph::Perseus not found" if $err; my $trad = Text::Tradition->new( 'language' => 'Latin', @@ -39,7 +39,7 @@ SKIP: { $ambig++; } } - is( $ambig, 7, "Found 7 ambiguous forms as expected" ); + is( $ambig, 4, "Found 4 ambiguous forms as expected" ); } }