X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2FText%2FTradition%2FLanguage%2FLatin.pm;h=1d278b5ca87f124b87a416c196b52ff4bbcf8af0;hb=f8862b584dcc04728d3bff48ea7c19cb9a078772;hp=9f117675a32ea0730c6c7a05600d139befc2f179;hpb=fe77efe0d84ff0d31dc2ce020b73bf57783d464e;p=scpubgit%2Fstemmatology.git diff --git a/lib/Text/Tradition/Language/Latin.pm b/lib/Text/Tradition/Language/Latin.pm index 9f11767..1d278b5 100644 --- a/lib/Text/Tradition/Language/Latin.pm +++ b/lib/Text/Tradition/Language/Latin.pm @@ -3,7 +3,7 @@ package Text::Tradition::Language::Latin; use strict; use warnings; use Module::Load; -use Text::Tradition::Language::Base qw/ lemmatize_treetagger treetagger_struct /; +use Text::Tradition::Language::Base qw/ lemmatize_treetagger lfs_morph_tags /; use TryCatch; =head1 NAME @@ -13,10 +13,7 @@ Text::Tradition::Language::Latin - language-specific module for Latin =head1 DESCRIPTION Implements morphology lookup for French words in context. This module -depends on the Morph::Perseus module for access to PhiloLogic database data. -It also depends on the TreeTagger software -(L), which is -(for now) expected to be installed in $MORPHDIR/TreeTagger. +depends on the Lingua::Morph::Perseus module for access to PhiloLogic database data. =head1 SUBROUTINES @@ -29,11 +26,11 @@ Evaluates the string using the Flemm package, and returns the results. use Text::Tradition; use_ok( 'Text::Tradition::Language::Latin' ); -eval "use Morph::Perseus"; +eval "use Lingua::Morph::Perseus"; my $err = $@; SKIP: { - skip "Package Morph::Perseus not found" if $err; + skip "Package Lingua::Morph::Perseus not found" if $err; my $trad = Text::Tradition->new( 'language' => 'Latin', @@ -57,7 +54,7 @@ SKIP: { $ambig++; } } - is( $ambig, 7, "Found 7 ambiguous forms as expected" ); + is( $ambig, 4, "Found 4 ambiguous forms as expected" ); } =end testing @@ -86,17 +83,27 @@ sub reading_lookup { return map { _perseus_lookup_str( $_ ) } @words; } +=head2 morphology_tags + +Return a data structure describing the available parts of speech and their attributes. + +=cut + +sub morphology_tags { + return lfs_morph_tags(); +} + + { my $morph; sub _morph_connect { unless( $morph ) { try { - load 'Morph::Perseus'; - load 'Morph::Perseus::Structure'; - $morph = Morph::Perseus->connect( 'Latin' ); + load 'Lingua::Morph::Perseus'; + $morph = Lingua::Morph::Perseus->connect( 'Latin' ); } catch { - warn "Cannot do Latin word lemmatization without Morph::Perseus: @_"; + warn "Cannot do Latin word lemmatization without Lingua::Morph::Perseus: @_"; return; } } @@ -110,7 +117,6 @@ sub reading_lookup { # T carne -> carnis # T melius -> bonus - my %excep = ( 'absens' => 'absum', 'aperte' => 'apertus', @@ -145,53 +151,40 @@ sub reading_lookup { sub _perseus_lookup_tt { my( $orig, $pos, $lemma ) = split( /\t/, $_[0] ); _morph_connect(); - my $result = $morph->lookup( $orig ); + return unless $morph; # Discard results that don't match the lemma, unless lemma is unknown - my @orig = @{$result->{'objects'}}; - my @ret; + my $lookupopts = {}; unless( $lemma eq '' || $lemma =~ /^\W+$/ ) { # TODO Perseus lemma might have a number on the end, yuck. # multiple lemmata separated with | $lemma =~ s/[^\w|]//g; $lemma = $excep{$lemma} if exists $excep{$lemma}; $lemma =~ s/j/i/g; - my %lems; - my @forms = - map { $lems{$_} = 1; $lems{lc($_)} = 1 } split( /\|/, $lemma ); - # Now match the lemmata from Treetagger to the lemmata and alt_ls - # from Perseus. - @ret = grep { - my $x = $_->lemma; - $x =~ s/\d+$//; - my $y = $_->alt_lex || ''; - $y =~ s/\d+$//; - $lems{$x} || $lems{$y}; - } @orig; - warn "TreeTagger lemma $lemma matched no results from Perseus for $orig" - if @orig && !@ret; + if( $lemma ) { # if we have anything left... + my %lems; + map { $lems{$_} = 1; $lems{lc($_)} = 1 } split( /\|/, $lemma ); + $lookupopts->{'lemma'} = [ keys %lems ]; + } } - @ret = @orig unless @ret; + $lookupopts->{'ttpos'} = $pos if $pos; - my @wordforms; + my $result = $morph->lexicon_lookup( $orig, $lookupopts ); + # unless( !keys( %$lookupopts ) || $result->{'filtered'} ) { + # warn "Filter on $pos / $lemma returned no results; using all results"; + # } + my @ret = @{$result->{'objects'}}; + my %unique_wordforms; foreach my $obj ( @ret ) { - push( @wordforms, _wordform_from_row( $obj ) ); + my $wf = _wordform_from_row( $obj ); + $unique_wordforms{$wf->to_string} = $wf; } - ## TODO Use TreeTagger info - requires serious hacking of Lingua::TagSet - # Discard results that don't match the given TreeTagger POS, unless - # that leaves zero results -# my $ttstruct = treetagger_struct( $pos ); -# my @ttmatch = grep { $ttstruct->is_compatible( $_->morphology ) } @wordforms; -# unless( @ttmatch ) { -# warn "TreeTagger POS $pos matched no results from Perseus for $orig"; -# @ttmatch = @wordforms; -# } -# return @ttmatch; - return @wordforms; + return values( %unique_wordforms ); } sub _perseus_lookup_str { my( $orig ) = @_; _morph_connect(); + return unless $morph; # Simple morph DB lookup, and return the results. my $result = $morph->lookup( $orig ); return map { _wordform_from_row( $_ ) } @{$result->{'objects'}}; @@ -201,20 +194,12 @@ sub reading_lookup { sub _wordform_from_row { my( $rowobj ) = @_; - my $mpstruct; - try { - $mpstruct = Morph::Perseus::Structure->from_tag( $rowobj->code ); - } catch { - warn "Could not create morphology structure from " - . $rowobj->code . ": $!"; - } - $DB::single = 1 unless $mpstruct; my $lemma = $rowobj->lemma; $lemma =~ s/^(\D+)\d*$/$1/; my $wf = Text::Tradition::Collation::Reading::WordForm->new( 'language' => 'Latin', 'lemma' => $lemma, - 'morphology' => $mpstruct, + 'morphology' => $rowobj->morphology, ); return $wf; }