From: Tara L Andrews Date: Thu, 17 May 2012 23:08:05 +0000 (+0200) Subject: add morphology support for Latin, dependent on Perseus morphology X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=5271a01157263fd46b8d504e0ddb0b349a3fb9c4;p=scpubgit%2Fstemmatology.git add morphology support for Latin, dependent on Perseus morphology --- diff --git a/lib/Text/Tradition/Language/Base.pm b/lib/Text/Tradition/Language/Base.pm index 98d2625..05f81c0 100644 --- a/lib/Text/Tradition/Language/Base.pm +++ b/lib/Text/Tradition/Language/Base.pm @@ -13,7 +13,8 @@ use Text::Tradition::Collation::Reading::Lexeme; use Text::Tradition::Collation::Reading::WordForm; use TryCatch; -@EXPORT_OK = qw/ lemmatize_treetagger reading_lookup_treetagger /; +@EXPORT_OK = qw/ lemmatize_treetagger reading_lookup_treetagger treetagger_struct + multext_struct /; =head1 NAME @@ -244,6 +245,18 @@ sub _treetag_string { return $tagresult->as_text(); } +## HACK function to correct for TagSet::TreeTagger brokenness +sub treetagger_struct { + my $pos = shift; + $pos =~ s/PREP/PRP/; + return Lingua::TagSet::TreeTagger->tag2structure( $pos ); +} + +sub multext_struct { + my $pos = shift; + # No known hacks needed + return Lingua::TagSet::Multext->tag2structure( $pos ); +} 1; diff --git a/lib/Text/Tradition/Language/English.pm b/lib/Text/Tradition/Language/English.pm index 9d25c55..f477759 100644 --- a/lib/Text/Tradition/Language/English.pm +++ b/lib/Text/Tradition/Language/English.pm @@ -35,7 +35,7 @@ use_ok( 'Text::Tradition::Language::English' ); sub lemmatize { my $tradition = shift; my %opts = ( - 'language' => 'French', + 'language' => 'English', 'callback' => sub { _parse_wordform( @_ ) } ); return lemmatize_treetagger( $tradition, %opts ); diff --git a/lib/Text/Tradition/Language/French.pm b/lib/Text/Tradition/Language/French.pm index 7453044..f820586 100644 --- a/lib/Text/Tradition/Language/French.pm +++ b/lib/Text/Tradition/Language/French.pm @@ -3,7 +3,7 @@ package Text::Tradition::Language::French; use strict; use warnings; use Module::Load qw/ load /; -use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger /; +use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger treetagger_struct multext_struct /; use TryCatch; =head1 NAME @@ -134,10 +134,10 @@ sub _parse_wordform { my( $pos, $morph ) = split( /:/, $tag ); my $morphobj; if( $morph ) { - $morphobj = Lingua::TagSet::Multext->tag2structure( $morph ); + $morphobj = multext_struct( $morph ); } else { # Use the TreeTagger info if there is no Flemm morphology. - $morphobj = Lingua::TagSet::TreeTagger->tag2structure( $pos ); + $morphobj = treetagger_struct( $pos ); } if( $morphobj ) { my $wf = Text::Tradition::Collation::Reading::WordForm->new( diff --git a/lib/Text/Tradition/Language/Latin.pm b/lib/Text/Tradition/Language/Latin.pm new file mode 100644 index 0000000..07dc611 --- /dev/null +++ b/lib/Text/Tradition/Language/Latin.pm @@ -0,0 +1,158 @@ +package Text::Tradition::Language::Latin; + +use strict; +use warnings; +use Module::Load; +use Text::Tradition::Language::Base qw/ lemmatize_treetagger treetagger_struct /; +use TryCatch; + +=head1 NAME + +Text::Tradition::Language::Latin - language-specific module for Latin + +=head1 DESCRIPTION + +Implements morphology lookup for French words in context. This module +depends on the Morph::Perseus module for access to PhiloLogic database data. +It also depends on the TreeTagger software +(L), which is +(for now) expected to be installed in $MORPHDIR/TreeTagger. + +=head1 SUBROUTINES + +=head2 lemmatize( $text ) + +Evaluates the string using the Flemm package, and returns the results. + +=begin testing + +use Text::Tradition; +use_ok( 'Text::Tradition::Language::Latin' ); + +eval "use Morph::Perseus"; +my $err = $@; + +SKIP: { + skip "Package Morph::Perseus not found" if $err; + + my $trad = Text::Tradition->new( + 'language' => 'Latin', + 'file' => 't/data/legendfrag.xml', + 'input' => 'Self' ); + $trad->lemmatize(); + my $ambig = 0; + foreach my $r ( $trad->collation->readings ) { + next if $r->is_meta; + ok( $r->has_lexemes, "Reading $r has one or more lexemes" ); + my @lex = $r->lexemes; + my $lexstr = join( '', map { $_->string } @lex ); + my $textstr = $r->text; + $textstr =~ s/\s+//g; + is( $textstr, $lexstr, "Lexemes for reading $r match the reading" ); + foreach my $l ( @lex ) { + next if $l->is_disambiguated; + printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id, + join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) ); + $ambig++; + } + } + is( $ambig, 19, "Found 19 ambiguous forms as expected" ); +} + +=end testing + +=cut + +sub lemmatize { + my $tradition = shift; + my %opts = ( + 'language' => 'Latin', + 'callback' => sub { _perseus_lookup_tt( @_ ) } + ); + return lemmatize_treetagger( $tradition, %opts ); +} + +=head2 reading_lookup( $rdg[, $rdg, ...] ) + +Looks up one or more readings using the Perseus package, and returns the +possible results. This skips the tree tagger / tokenizer, returning any +match for the word string(s) in the morphology DB. + +=cut + +sub reading_lookup { + my @words = @_; + return map { _perseus_lookup_str( $_ ) } @words; +} + +{ + my $morph; + + sub _morph_connect { + unless( $morph ) { + try { + load 'Morph::Perseus'; + load 'Morph::Perseus::Structure'; + $morph = Morph::Perseus->connect( 'Latin' ); + } catch { + warn "Cannot do Latin word lemmatization without Morph::Perseus: @_"; + return; + } + } + } + + sub _perseus_lookup_tt { + my( $orig, $pos, $lemma ) = split( /\t/, $_[0] ); + _morph_connect(); + my $result = $morph->lookup( $orig ); + # Discard results that don't match the lemma, unless lemma is unknown + my @ret; + unless( $lemma eq '' ) { + # TODO Perseus lemma might have a number on the end, yuck. + @ret = grep { $_->lemma =~ /^$lemma(\d*)$/ } @{$result->{'objects'}}; + } + unless( @ret ) { + @ret = @{$result->{'objects'}}; + warn "TreeTagger lemma $lemma matched no results from Perseus for $orig" + if @ret; + } + + # Discard results that don't match the given TreeTagger POS, unless + # that leaves zero results + my @wordforms; + foreach my $obj ( @ret ) { + push( @wordforms, _wordform_from_row( $obj ) ); + } + ## TODO Use TreeTagger info - requires serious hacking of Lingua::TagSet +# my $ttstruct = treetagger_struct( $pos ); +# my @ttmatch = grep { $ttstruct->is_compatible( $_->morphology ) } @wordforms; +# unless( @ttmatch ) { +# warn "TreeTagger POS $pos matched no results from Perseus for $orig"; +# @ttmatch = @wordforms; +# } +# return @ttmatch; + return @wordforms; + } + + sub _perseus_lookup_str { + my( $orig ) = @_; + _morph_connect(); + # Simple morph DB lookup, and return the results. + my $result = $morph->lookup( $orig ); + return map { _wordform_from_row( $_ ) } @{$result->{'objects'}}; + } + + sub _wordform_from_row { + my( $rowobj ) = @_; + my $mpstruct = Morph::Perseus::Structure->from_tag( $rowobj->code ); + my $wf = Text::Tradition::Collation::Reading::WordForm->new( + 'language' => 'Latin', + 'lemma' => $rowobj->lemma, + 'morphology' => $mpstruct, + ); + return $wf; + } + +} + +1; \ No newline at end of file diff --git a/t/text_tradition_language_latin.t b/t/text_tradition_language_latin.t new file mode 100644 index 0000000..4e9d602 --- /dev/null +++ b/t/text_tradition_language_latin.t @@ -0,0 +1,48 @@ +#!/usr/bin/perl -w + +use strict; +use Test::More 'no_plan'; +$| = 1; + + + +# =begin testing +{ +use Text::Tradition; +use_ok( 'Text::Tradition::Language::Latin' ); + +eval "use Morph::Perseus"; +my $err = $@; + +SKIP: { + skip "Package Morph::Perseus not found" if $err; + + my $trad = Text::Tradition->new( + 'language' => 'Latin', + 'file' => 't/data/legendfrag.xml', + 'input' => 'Self' ); + $trad->lemmatize(); + my $ambig = 0; + foreach my $r ( $trad->collation->readings ) { + next if $r->is_meta; + ok( $r->has_lexemes, "Reading $r has one or more lexemes" ); + my @lex = $r->lexemes; + my $lexstr = join( '', map { $_->string } @lex ); + my $textstr = $r->text; + $textstr =~ s/\s+//g; + is( $textstr, $lexstr, "Lexemes for reading $r match the reading" ); + foreach my $l ( @lex ) { + next if $l->is_disambiguated; + printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id, + join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) ); + $ambig++; + } + } + is( $ambig, 19, "Found 19 ambiguous forms as expected" ); +} +} + + + + +1;