--- /dev/null
+package Text::Tradition::Language::Armenian;
+
+use strict;
+use warnings;
+use Module::Load;
+use parent qw/ Text::Tradition::Language::Perseus /;
+
+=head1 NAME
+
+Text::Tradition::Language::Armenian - language-specific module for Armenian
+
+=head1 DESCRIPTION
+
+Implements morphology lookup for Armenian (Grabar) words in context. This module
+depends on the Lingua::Morph::Perseus module for access to PhiloLogic database data.
+
+=head1 SUBROUTINES
+
+=head2 lemmatize( $text )
+
+Evaluates the string using Treetagger and Perseus, and returns the results.
+
+=begin testing
+
+use Text::Tradition;
+use_ok( 'Text::Tradition::Language::Armenian' );
+
+eval "use Lingua::Morph::Perseus";
+my $err = $@;
+
+SKIP: {
+ skip "No Armenian test data yet";
+
+ my $trad = Text::Tradition->new(
+ 'language' => 'Armenian',
+ 'file' => 't/data/legendfrag.xml',
+ 'input' => 'Self' );
+ $trad->lemmatize();
+ my $ambig = 0;
+ foreach my $r ( $trad->collation->readings ) {
+ next if $r->is_meta;
+ ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
+ my @lex = $r->lexemes;
+ my $lexstr = join( '', map { $_->string } @lex );
+ my $textstr = $r->text;
+ $textstr =~ s/\s+//g;
+ is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
+ foreach my $l ( @lex ) {
+ next unless $l->matches;
+ next if $l->is_disambiguated;
+ printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
+ join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
+ $ambig++;
+ }
+ }
+ is( $ambig, 4, "Found 4 ambiguous forms as expected" );
+}
+
+=end testing
+
+=cut
+
+our $dbhandle;
+
+sub lemmatize {
+ return __PACKAGE__->perseus_lemmatize( @_ );
+}
+
+sub reading_lookup {
+ return __PACKAGE__->perseus_reading_lookup( @_ );
+}
+
+1;
--- /dev/null
+package Text::Tradition::Language::Greek;
+
+use strict;
+use warnings;
+use Module::Load;
+use parent qw/ Text::Tradition::Language::Perseus /;
+
+=head1 NAME
+
+Text::Tradition::Language::Greek - language-specific module for Greek
+
+=head1 DESCRIPTION
+
+Implements morphology lookup for Greek words in context. This module
+depends on the Lingua::Morph::Perseus module for access to PhiloLogic database data.
+
+=head1 SUBROUTINES
+
+=head2 lemmatize( $text )
+
+Evaluates the string using Treetagger and Perseus, and returns the results.
+
+=begin testing
+
+use Text::Tradition;
+use_ok( 'Text::Tradition::Language::Greek' );
+
+eval "use Lingua::Morph::Perseus";
+my $err = $@;
+
+SKIP: {
+ skip "Greek linguistic data not read yet";
+
+ my $trad = Text::Tradition->new(
+ 'language' => 'Greek',
+ 'file' => 't/data/florilegium_graphml.xml',
+ 'input' => 'Self' );
+ $trad->lemmatize();
+ my $ambig = 0;
+ foreach my $r ( $trad->collation->readings ) {
+ next if $r->is_meta;
+ ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
+ my @lex = $r->lexemes;
+ my $lexstr = join( '', map { $_->string } @lex );
+ my $textstr = $r->text;
+ $textstr =~ s/\s+//g;
+ is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
+ foreach my $l ( @lex ) {
+ next unless $l->matches;
+ next if $l->is_disambiguated;
+ printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
+ join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
+ $ambig++;
+ }
+ }
+ is( $ambig, 4, "Found 4 ambiguous forms as expected" );
+}
+
+=end testing
+
+=cut
+
+our $dbhandle;
+
+sub lemmatize {
+ return __PACKAGE__->perseus_lemmatize( @_ );
+}
+
+sub reading_lookup {
+ return __PACKAGE__->perseus_reading_lookup( @_ );
+}
+
+1;
use strict;
use warnings;
use Module::Load;
-use Text::Tradition::Language::Base qw/ lemmatize_treetagger lfs_morph_tags /;
-use TryCatch;
+use parent qw/ Text::Tradition::Language::Perseus /;
=head1 NAME
=head1 DESCRIPTION
-Implements morphology lookup for French words in context. This module
+Implements morphology lookup for Latin words in context. This module
depends on the Lingua::Morph::Perseus module for access to PhiloLogic database data.
=head1 SUBROUTINES
=head2 lemmatize( $text )
-Evaluates the string using the Flemm package, and returns the results.
+Evaluates the string using Treetagger and Perseus, and returns the results.
=begin testing
=cut
+our $dbhandle;
+
sub lemmatize {
- my $tradition = shift;
- my %opts = (
- 'language' => 'Latin',
- 'callback' => sub { _perseus_lookup_tt( @_ ) }
- );
- return lemmatize_treetagger( $tradition, %opts );
+ return __PACKAGE__->perseus_lemmatize( @_ );
}
-=head2 reading_lookup( $rdg[, $rdg, ...] )
-
-Looks up one or more readings using the Perseus package, and returns the
-possible results. This skips the tree tagger / tokenizer, returning any
-match for the word string(s) in the morphology DB.
-
-=cut
-
sub reading_lookup {
- my @words = @_;
- return map { _perseus_lookup_str( $_ ) } @words;
-}
-
-=head2 morphology_tags
-
-Return a data structure describing the available parts of speech and their attributes.
-
-=cut
-
-sub morphology_tags {
- return lfs_morph_tags();
+ return __PACKAGE__->perseus_reading_lookup( @_ );
}
-
-{
- my $morph;
-
- sub _morph_connect {
- unless( $morph ) {
- try {
- load 'Lingua::Morph::Perseus';
- $morph = Lingua::Morph::Perseus->connect( 'Latin' );
- } catch {
- warn "Cannot do Latin word lemmatization without Lingua::Morph::Perseus: @_";
- return;
- }
- }
- }
-
- sub _perseus_lookup_tt {
- my( $orig, $pos, $lemma ) = split( /\t/, $_[0] );
- _morph_connect();
- return unless $morph;
- # Discard results that don't match the lemma, unless lemma is unknown
- my $lookupopts = {};
- unless( $lemma eq '<unknown>' || $lemma =~ /^\W+$/ ) {
- my %lems;
- map { $lems{$_} = 1; $lems{lc($_)} = 1 } split( /\|/, $lemma );
- $lookupopts->{'lemma'} = [ keys %lems ];
- }
- $lookupopts->{'ttpos'} = $pos if $pos;
-
- my $result = $morph->lexicon_lookup( $orig, $lookupopts );
- # unless( !keys( %$lookupopts ) || $result->{'filtered'} ) {
- # warn "Filter on $pos / $lemma returned no results; using all results";
- # }
- my @ret = @{$result->{'objects'}};
- my %unique_wordforms;
- foreach my $obj ( @ret ) {
- my $wf = _wordform_from_row( $obj );
- $unique_wordforms{$wf->to_string} = $wf;
- }
- return values( %unique_wordforms );
- }
-
- sub _perseus_lookup_str {
- my( $orig ) = @_;
- _morph_connect();
- return unless $morph;
- # Simple morph DB lookup, and return the results.
- my $result = $morph->lexicon_lookup( $orig );
- return map { _wordform_from_row( $_ ) } @{$result->{'objects'}};
- }
-
-}
-
-sub _wordform_from_row {
- my( $rowobj ) = @_;
- my $lemma = $rowobj->lemma;
- $lemma =~ s/^(\D+)\d*$/$1/;
- my $wf = Text::Tradition::Collation::Reading::WordForm->new(
- 'language' => 'Latin',
- 'lemma' => $lemma,
- 'morphology' => $rowobj->morphology,
- );
- return $wf;
-}
-
1;
--- /dev/null
+package Text::Tradition::Language::Perseus;
+
+use strict;
+use warnings;
+use Module::Load;
+use Text::Tradition::Language::Base qw/ lemmatize_treetagger lfs_morph_tags /;
+use TryCatch;
+
+=head1 NAME
+
+Text::Tradition::Language::Perseus - base module for those languages that rely
+on a Lingua::Morph::Perseus database.
+
+=head1 DESCRIPTION
+
+Implements morphology lookup for words in context. This module depends on the Lingua::Morph::Perseus module for access to PhiloLogic database data.
+
+=head1 SUBROUTINES
+
+=head2 lemmatize( $text )
+
+Evaluates the string using Treetagger and Perseus, and returns the results.
+
+=cut
+
+# tested in child language modules
+
+sub perseus_lemmatize {
+ my $self = shift;
+ my $tradition = shift;
+ my %opts = (
+ 'language' => $tradition->language,
+ 'callback' => sub { _perseus_lookup_tt( $self, @_ ) }
+ );
+ return lemmatize_treetagger( $tradition, %opts );
+}
+
+=head2 reading_lookup( $rdg[, $rdg, ...] )
+
+Looks up one or more readings using the Perseus package, and returns the
+possible results. This skips the tree tagger / tokenizer, returning any
+match for the word string(s) in the morphology DB.
+
+=cut
+
+sub perseus_reading_lookup {
+ my( $self, @words ) = @_;
+ return map { $self->_perseus_lookup_str( $_ ) } @words;
+}
+
+=head2 morphology_tags
+
+Return a data structure describing the available parts of speech and their attributes.
+
+=cut
+
+sub morphology_tags {
+ return lfs_morph_tags();
+}
+
+sub _get_lang {
+ my $self = shift;
+ my @parts = split( /::/, $self );
+ return $parts[-1];
+}
+
+sub _morph_connect {
+ my $self = shift;
+ unless( $self::dbhandle ) {
+ my $lang = $self->_get_lang();
+ try {
+ load 'Lingua::Morph::Perseus';
+ $self::dbhandle = Lingua::Morph::Perseus->connect( $lang );
+ } catch {
+ warn "Cannot do $lang word lemmatization without Lingua::Morph::Perseus: @_";
+ return;
+ }
+ }
+}
+
+sub _perseus_lookup_tt {
+ my $self = shift;
+ my( $orig, $pos, $lemma ) = split( /\t/, $_[0] );
+ $self->_morph_connect();
+ return unless $self::dbhandle;
+ # Discard results that don't match the lemma, unless lemma is unknown
+ my $lookupopts = {};
+ unless( $lemma eq '<unknown>' || $lemma =~ /^\W+$/ ) {
+ my %lems;
+ map { $lems{$_} = 1; $lems{lc($_)} = 1 } split( /\|/, $lemma );
+ $lookupopts->{'lemma'} = [ keys %lems ];
+ }
+ $lookupopts->{'ttpos'} = $pos if $pos;
+
+ my $result = $self::dbhandle->lexicon_lookup( $orig, $lookupopts );
+ # unless( !keys( %$lookupopts ) || $result->{'filtered'} ) {
+ # warn "Filter on $pos / $lemma returned no results; using all results";
+ # }
+ my @ret = @{$result->{'objects'}};
+ my %unique_wordforms;
+ foreach my $obj ( @ret ) {
+ my $wf = $self->_wordform_from_row( $obj );
+ $unique_wordforms{$wf->to_string} = $wf;
+ }
+ return values( %unique_wordforms );
+}
+
+sub _perseus_lookup_str {
+ my( $self, $orig ) = @_;
+ $self->_morph_connect();
+ return unless $self::dbhandle;
+ # Simple morph DB lookup, and return the results.
+ my $result = $self::dbhandle->lexicon_lookup( $orig );
+ return map { $self->_wordform_from_row( $_ ) } @{$result->{'objects'}};
+}
+
+sub _wordform_from_row {
+ my( $self, $rowobj ) = @_;
+ my $lemma = $rowobj->lemma;
+ $lemma =~ s/^(\D+)\d*$/$1/;
+ my $wf = Text::Tradition::Collation::Reading::WordForm->new(
+ 'language' => $self->_get_lang(),
+ 'lemma' => $lemma,
+ 'morphology' => $rowobj->morphology,
+ );
+ return $wf;
+}
+
+1;
for ( sort keys %used ) {
my $first_in = Module::CoreList->first_release($_);
next if defined $first_in and $first_in <= 5.00803;
- next if /^(Text::Tradition|inc|t|feature)(::|$)/;
+ next if /^(Text::Tradition|inc|t|feature|parent)(::|$)/;
#warn $_;
ok( exists $required{$_}, "$_ in Makefile.PL" )
--- /dev/null
+#!/usr/bin/perl -w
+
+use strict;
+use Test::More 'no_plan';
+$| = 1;
+
+
+
+# =begin testing
+{
+use Text::Tradition;
+use_ok( 'Text::Tradition::Language::Armenian' );
+
+eval "use Lingua::Morph::Perseus";
+my $err = $@;
+
+SKIP: {
+ skip "No Armenian test data yet";
+
+ my $trad = Text::Tradition->new(
+ 'language' => 'Armenian',
+ 'file' => 't/data/legendfrag.xml',
+ 'input' => 'Self' );
+ $trad->lemmatize();
+ my $ambig = 0;
+ foreach my $r ( $trad->collation->readings ) {
+ next if $r->is_meta;
+ ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
+ my @lex = $r->lexemes;
+ my $lexstr = join( '', map { $_->string } @lex );
+ my $textstr = $r->text;
+ $textstr =~ s/\s+//g;
+ is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
+ foreach my $l ( @lex ) {
+ next unless $l->matches;
+ next if $l->is_disambiguated;
+ printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
+ join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
+ $ambig++;
+ }
+ }
+ is( $ambig, 4, "Found 4 ambiguous forms as expected" );
+}
+}
+
+
+
+
+1;
--- /dev/null
+#!/usr/bin/perl -w
+
+use strict;
+use Test::More 'no_plan';
+$| = 1;
+
+
+
+# =begin testing
+{
+use Text::Tradition;
+use_ok( 'Text::Tradition::Language::Greek' );
+
+eval "use Lingua::Morph::Perseus";
+my $err = $@;
+
+SKIP: {
+ skip "Greek linguistic data not read yet";
+
+ my $trad = Text::Tradition->new(
+ 'language' => 'Greek',
+ 'file' => 't/data/florilegium_graphml.xml',
+ 'input' => 'Self' );
+ $trad->lemmatize();
+ my $ambig = 0;
+ foreach my $r ( $trad->collation->readings ) {
+ next if $r->is_meta;
+ ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
+ my @lex = $r->lexemes;
+ my $lexstr = join( '', map { $_->string } @lex );
+ my $textstr = $r->text;
+ $textstr =~ s/\s+//g;
+ is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
+ foreach my $l ( @lex ) {
+ next unless $l->matches;
+ next if $l->is_disambiguated;
+ printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
+ join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
+ $ambig++;
+ }
+ }
+ is( $ambig, 4, "Found 4 ambiguous forms as expected" );
+}
+}
+
+
+
+
+1;