From: Tara L Andrews Date: Wed, 20 Jun 2012 20:01:55 +0000 (+0200) Subject: generalize Latin module to Latin/Greek/Armenian on Perseus X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=0ce8c0cf4cff6dc07fc49c4c46928ad428c6e1d1;p=scpubgit%2Fstemmatology.git generalize Latin module to Latin/Greek/Armenian on Perseus --- diff --git a/lib/Text/Tradition/Language/Armenian.pm b/lib/Text/Tradition/Language/Armenian.pm new file mode 100644 index 0000000..8d09353 --- /dev/null +++ b/lib/Text/Tradition/Language/Armenian.pm @@ -0,0 +1,73 @@ +package Text::Tradition::Language::Armenian; + +use strict; +use warnings; +use Module::Load; +use parent qw/ Text::Tradition::Language::Perseus /; + +=head1 NAME + +Text::Tradition::Language::Armenian - language-specific module for Armenian + +=head1 DESCRIPTION + +Implements morphology lookup for Armenian (Grabar) words in context. This module +depends on the Lingua::Morph::Perseus module for access to PhiloLogic database data. + +=head1 SUBROUTINES + +=head2 lemmatize( $text ) + +Evaluates the string using Treetagger and Perseus, and returns the results. + +=begin testing + +use Text::Tradition; +use_ok( 'Text::Tradition::Language::Armenian' ); + +eval "use Lingua::Morph::Perseus"; +my $err = $@; + +SKIP: { + skip "No Armenian test data yet"; + + my $trad = Text::Tradition->new( + 'language' => 'Armenian', + 'file' => 't/data/legendfrag.xml', + 'input' => 'Self' ); + $trad->lemmatize(); + my $ambig = 0; + foreach my $r ( $trad->collation->readings ) { + next if $r->is_meta; + ok( $r->has_lexemes, "Reading $r has one or more lexemes" ); + my @lex = $r->lexemes; + my $lexstr = join( '', map { $_->string } @lex ); + my $textstr = $r->text; + $textstr =~ s/\s+//g; + is( $textstr, $lexstr, "Lexemes for reading $r match the reading" ); + foreach my $l ( @lex ) { + next unless $l->matches; + next if $l->is_disambiguated; + printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id, + join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) ); + $ambig++; + } + } + is( $ambig, 4, "Found 4 ambiguous forms as expected" ); +} + +=end testing + +=cut + +our $dbhandle; + +sub lemmatize { + return __PACKAGE__->perseus_lemmatize( @_ ); +} + +sub reading_lookup { + return __PACKAGE__->perseus_reading_lookup( @_ ); +} + +1; diff --git a/lib/Text/Tradition/Language/Greek.pm b/lib/Text/Tradition/Language/Greek.pm new file mode 100644 index 0000000..28d69cf --- /dev/null +++ b/lib/Text/Tradition/Language/Greek.pm @@ -0,0 +1,73 @@ +package Text::Tradition::Language::Greek; + +use strict; +use warnings; +use Module::Load; +use parent qw/ Text::Tradition::Language::Perseus /; + +=head1 NAME + +Text::Tradition::Language::Greek - language-specific module for Greek + +=head1 DESCRIPTION + +Implements morphology lookup for Greek words in context. This module +depends on the Lingua::Morph::Perseus module for access to PhiloLogic database data. + +=head1 SUBROUTINES + +=head2 lemmatize( $text ) + +Evaluates the string using Treetagger and Perseus, and returns the results. + +=begin testing + +use Text::Tradition; +use_ok( 'Text::Tradition::Language::Greek' ); + +eval "use Lingua::Morph::Perseus"; +my $err = $@; + +SKIP: { + skip "Greek linguistic data not read yet"; + + my $trad = Text::Tradition->new( + 'language' => 'Greek', + 'file' => 't/data/florilegium_graphml.xml', + 'input' => 'Self' ); + $trad->lemmatize(); + my $ambig = 0; + foreach my $r ( $trad->collation->readings ) { + next if $r->is_meta; + ok( $r->has_lexemes, "Reading $r has one or more lexemes" ); + my @lex = $r->lexemes; + my $lexstr = join( '', map { $_->string } @lex ); + my $textstr = $r->text; + $textstr =~ s/\s+//g; + is( $textstr, $lexstr, "Lexemes for reading $r match the reading" ); + foreach my $l ( @lex ) { + next unless $l->matches; + next if $l->is_disambiguated; + printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id, + join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) ); + $ambig++; + } + } + is( $ambig, 4, "Found 4 ambiguous forms as expected" ); +} + +=end testing + +=cut + +our $dbhandle; + +sub lemmatize { + return __PACKAGE__->perseus_lemmatize( @_ ); +} + +sub reading_lookup { + return __PACKAGE__->perseus_reading_lookup( @_ ); +} + +1; diff --git a/lib/Text/Tradition/Language/Latin.pm b/lib/Text/Tradition/Language/Latin.pm index df87505..2f4a42a 100644 --- a/lib/Text/Tradition/Language/Latin.pm +++ b/lib/Text/Tradition/Language/Latin.pm @@ -3,8 +3,7 @@ package Text::Tradition::Language::Latin; use strict; use warnings; use Module::Load; -use Text::Tradition::Language::Base qw/ lemmatize_treetagger lfs_morph_tags /; -use TryCatch; +use parent qw/ Text::Tradition::Language::Perseus /; =head1 NAME @@ -12,14 +11,14 @@ Text::Tradition::Language::Latin - language-specific module for Latin =head1 DESCRIPTION -Implements morphology lookup for French words in context. This module +Implements morphology lookup for Latin words in context. This module depends on the Lingua::Morph::Perseus module for access to PhiloLogic database data. =head1 SUBROUTINES =head2 lemmatize( $text ) -Evaluates the string using the Flemm package, and returns the results. +Evaluates the string using Treetagger and Perseus, and returns the results. =begin testing @@ -61,101 +60,14 @@ SKIP: { =cut +our $dbhandle; + sub lemmatize { - my $tradition = shift; - my %opts = ( - 'language' => 'Latin', - 'callback' => sub { _perseus_lookup_tt( @_ ) } - ); - return lemmatize_treetagger( $tradition, %opts ); + return __PACKAGE__->perseus_lemmatize( @_ ); } -=head2 reading_lookup( $rdg[, $rdg, ...] ) - -Looks up one or more readings using the Perseus package, and returns the -possible results. This skips the tree tagger / tokenizer, returning any -match for the word string(s) in the morphology DB. - -=cut - sub reading_lookup { - my @words = @_; - return map { _perseus_lookup_str( $_ ) } @words; -} - -=head2 morphology_tags - -Return a data structure describing the available parts of speech and their attributes. - -=cut - -sub morphology_tags { - return lfs_morph_tags(); + return __PACKAGE__->perseus_reading_lookup( @_ ); } - -{ - my $morph; - - sub _morph_connect { - unless( $morph ) { - try { - load 'Lingua::Morph::Perseus'; - $morph = Lingua::Morph::Perseus->connect( 'Latin' ); - } catch { - warn "Cannot do Latin word lemmatization without Lingua::Morph::Perseus: @_"; - return; - } - } - } - - sub _perseus_lookup_tt { - my( $orig, $pos, $lemma ) = split( /\t/, $_[0] ); - _morph_connect(); - return unless $morph; - # Discard results that don't match the lemma, unless lemma is unknown - my $lookupopts = {}; - unless( $lemma eq '' || $lemma =~ /^\W+$/ ) { - my %lems; - map { $lems{$_} = 1; $lems{lc($_)} = 1 } split( /\|/, $lemma ); - $lookupopts->{'lemma'} = [ keys %lems ]; - } - $lookupopts->{'ttpos'} = $pos if $pos; - - my $result = $morph->lexicon_lookup( $orig, $lookupopts ); - # unless( !keys( %$lookupopts ) || $result->{'filtered'} ) { - # warn "Filter on $pos / $lemma returned no results; using all results"; - # } - my @ret = @{$result->{'objects'}}; - my %unique_wordforms; - foreach my $obj ( @ret ) { - my $wf = _wordform_from_row( $obj ); - $unique_wordforms{$wf->to_string} = $wf; - } - return values( %unique_wordforms ); - } - - sub _perseus_lookup_str { - my( $orig ) = @_; - _morph_connect(); - return unless $morph; - # Simple morph DB lookup, and return the results. - my $result = $morph->lexicon_lookup( $orig ); - return map { _wordform_from_row( $_ ) } @{$result->{'objects'}}; - } - -} - -sub _wordform_from_row { - my( $rowobj ) = @_; - my $lemma = $rowobj->lemma; - $lemma =~ s/^(\D+)\d*$/$1/; - my $wf = Text::Tradition::Collation::Reading::WordForm->new( - 'language' => 'Latin', - 'lemma' => $lemma, - 'morphology' => $rowobj->morphology, - ); - return $wf; -} - 1; diff --git a/lib/Text/Tradition/Language/Perseus.pm b/lib/Text/Tradition/Language/Perseus.pm new file mode 100644 index 0000000..9bf71d8 --- /dev/null +++ b/lib/Text/Tradition/Language/Perseus.pm @@ -0,0 +1,129 @@ +package Text::Tradition::Language::Perseus; + +use strict; +use warnings; +use Module::Load; +use Text::Tradition::Language::Base qw/ lemmatize_treetagger lfs_morph_tags /; +use TryCatch; + +=head1 NAME + +Text::Tradition::Language::Perseus - base module for those languages that rely +on a Lingua::Morph::Perseus database. + +=head1 DESCRIPTION + +Implements morphology lookup for words in context. This module depends on the Lingua::Morph::Perseus module for access to PhiloLogic database data. + +=head1 SUBROUTINES + +=head2 lemmatize( $text ) + +Evaluates the string using Treetagger and Perseus, and returns the results. + +=cut + +# tested in child language modules + +sub perseus_lemmatize { + my $self = shift; + my $tradition = shift; + my %opts = ( + 'language' => $tradition->language, + 'callback' => sub { _perseus_lookup_tt( $self, @_ ) } + ); + return lemmatize_treetagger( $tradition, %opts ); +} + +=head2 reading_lookup( $rdg[, $rdg, ...] ) + +Looks up one or more readings using the Perseus package, and returns the +possible results. This skips the tree tagger / tokenizer, returning any +match for the word string(s) in the morphology DB. + +=cut + +sub perseus_reading_lookup { + my( $self, @words ) = @_; + return map { $self->_perseus_lookup_str( $_ ) } @words; +} + +=head2 morphology_tags + +Return a data structure describing the available parts of speech and their attributes. + +=cut + +sub morphology_tags { + return lfs_morph_tags(); +} + +sub _get_lang { + my $self = shift; + my @parts = split( /::/, $self ); + return $parts[-1]; +} + +sub _morph_connect { + my $self = shift; + unless( $self::dbhandle ) { + my $lang = $self->_get_lang(); + try { + load 'Lingua::Morph::Perseus'; + $self::dbhandle = Lingua::Morph::Perseus->connect( $lang ); + } catch { + warn "Cannot do $lang word lemmatization without Lingua::Morph::Perseus: @_"; + return; + } + } +} + +sub _perseus_lookup_tt { + my $self = shift; + my( $orig, $pos, $lemma ) = split( /\t/, $_[0] ); + $self->_morph_connect(); + return unless $self::dbhandle; + # Discard results that don't match the lemma, unless lemma is unknown + my $lookupopts = {}; + unless( $lemma eq '' || $lemma =~ /^\W+$/ ) { + my %lems; + map { $lems{$_} = 1; $lems{lc($_)} = 1 } split( /\|/, $lemma ); + $lookupopts->{'lemma'} = [ keys %lems ]; + } + $lookupopts->{'ttpos'} = $pos if $pos; + + my $result = $self::dbhandle->lexicon_lookup( $orig, $lookupopts ); + # unless( !keys( %$lookupopts ) || $result->{'filtered'} ) { + # warn "Filter on $pos / $lemma returned no results; using all results"; + # } + my @ret = @{$result->{'objects'}}; + my %unique_wordforms; + foreach my $obj ( @ret ) { + my $wf = $self->_wordform_from_row( $obj ); + $unique_wordforms{$wf->to_string} = $wf; + } + return values( %unique_wordforms ); +} + +sub _perseus_lookup_str { + my( $self, $orig ) = @_; + $self->_morph_connect(); + return unless $self::dbhandle; + # Simple morph DB lookup, and return the results. + my $result = $self::dbhandle->lexicon_lookup( $orig ); + return map { $self->_wordform_from_row( $_ ) } @{$result->{'objects'}}; +} + +sub _wordform_from_row { + my( $self, $rowobj ) = @_; + my $lemma = $rowobj->lemma; + $lemma =~ s/^(\D+)\d*$/$1/; + my $wf = Text::Tradition::Collation::Reading::WordForm->new( + 'language' => $self->_get_lang(), + 'lemma' => $lemma, + 'morphology' => $rowobj->morphology, + ); + return $wf; +} + +1; diff --git a/t/00dependencies.t b/t/00dependencies.t index 9b46598..e70a616 100644 --- a/t/00dependencies.t +++ b/t/00dependencies.t @@ -76,7 +76,7 @@ my %required; for ( sort keys %used ) { my $first_in = Module::CoreList->first_release($_); next if defined $first_in and $first_in <= 5.00803; - next if /^(Text::Tradition|inc|t|feature)(::|$)/; + next if /^(Text::Tradition|inc|t|feature|parent)(::|$)/; #warn $_; ok( exists $required{$_}, "$_ in Makefile.PL" ) diff --git a/t/text_tradition_language_armenian.t b/t/text_tradition_language_armenian.t new file mode 100644 index 0000000..409f7df --- /dev/null +++ b/t/text_tradition_language_armenian.t @@ -0,0 +1,49 @@ +#!/usr/bin/perl -w + +use strict; +use Test::More 'no_plan'; +$| = 1; + + + +# =begin testing +{ +use Text::Tradition; +use_ok( 'Text::Tradition::Language::Armenian' ); + +eval "use Lingua::Morph::Perseus"; +my $err = $@; + +SKIP: { + skip "No Armenian test data yet"; + + my $trad = Text::Tradition->new( + 'language' => 'Armenian', + 'file' => 't/data/legendfrag.xml', + 'input' => 'Self' ); + $trad->lemmatize(); + my $ambig = 0; + foreach my $r ( $trad->collation->readings ) { + next if $r->is_meta; + ok( $r->has_lexemes, "Reading $r has one or more lexemes" ); + my @lex = $r->lexemes; + my $lexstr = join( '', map { $_->string } @lex ); + my $textstr = $r->text; + $textstr =~ s/\s+//g; + is( $textstr, $lexstr, "Lexemes for reading $r match the reading" ); + foreach my $l ( @lex ) { + next unless $l->matches; + next if $l->is_disambiguated; + printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id, + join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) ); + $ambig++; + } + } + is( $ambig, 4, "Found 4 ambiguous forms as expected" ); +} +} + + + + +1; diff --git a/t/text_tradition_language_greek.t b/t/text_tradition_language_greek.t new file mode 100644 index 0000000..edfa653 --- /dev/null +++ b/t/text_tradition_language_greek.t @@ -0,0 +1,49 @@ +#!/usr/bin/perl -w + +use strict; +use Test::More 'no_plan'; +$| = 1; + + + +# =begin testing +{ +use Text::Tradition; +use_ok( 'Text::Tradition::Language::Greek' ); + +eval "use Lingua::Morph::Perseus"; +my $err = $@; + +SKIP: { + skip "Greek linguistic data not read yet"; + + my $trad = Text::Tradition->new( + 'language' => 'Greek', + 'file' => 't/data/florilegium_graphml.xml', + 'input' => 'Self' ); + $trad->lemmatize(); + my $ambig = 0; + foreach my $r ( $trad->collation->readings ) { + next if $r->is_meta; + ok( $r->has_lexemes, "Reading $r has one or more lexemes" ); + my @lex = $r->lexemes; + my $lexstr = join( '', map { $_->string } @lex ); + my $textstr = $r->text; + $textstr =~ s/\s+//g; + is( $textstr, $lexstr, "Lexemes for reading $r match the reading" ); + foreach my $l ( @lex ) { + next unless $l->matches; + next if $l->is_disambiguated; + printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id, + join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) ); + $ambig++; + } + } + is( $ambig, 4, "Found 4 ambiguous forms as expected" ); +} +} + + + + +1;