From: Tara L Andrews <tla@mit.edu>
Date: Wed, 20 Jun 2012 20:01:55 +0000 (+0200)
Subject: generalize Latin module to Latin/Greek/Armenian on Perseus
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=0ce8c0cf4cff6dc07fc49c4c46928ad428c6e1d1;p=scpubgit%2Fstemmatology.git

generalize Latin module to Latin/Greek/Armenian on Perseus
---

diff --git a/lib/Text/Tradition/Language/Armenian.pm b/lib/Text/Tradition/Language/Armenian.pm
new file mode 100644
index 0000000..8d09353
--- /dev/null
+++ b/lib/Text/Tradition/Language/Armenian.pm
@@ -0,0 +1,73 @@
+package Text::Tradition::Language::Armenian;
+
+use strict;
+use warnings;
+use Module::Load;
+use parent qw/ Text::Tradition::Language::Perseus /;
+
+=head1 NAME
+
+Text::Tradition::Language::Armenian - language-specific module for Armenian
+
+=head1 DESCRIPTION
+
+Implements morphology lookup for Armenian (Grabar) words in context.  This module
+depends on the Lingua::Morph::Perseus module for access to PhiloLogic database data.
+
+=head1 SUBROUTINES
+
+=head2 lemmatize( $text )
+
+Evaluates the string using Treetagger and Perseus, and returns the results.
+
+=begin testing
+
+use Text::Tradition;
+use_ok( 'Text::Tradition::Language::Armenian' );
+
+eval "use Lingua::Morph::Perseus";
+my $err = $@;
+
+SKIP: {
+	skip "No Armenian test data yet";
+
+	my $trad = Text::Tradition->new(
+		'language' => 'Armenian',
+		'file' => 't/data/legendfrag.xml',
+		'input' => 'Self' );
+	$trad->lemmatize();
+	my $ambig = 0;
+	foreach my $r ( $trad->collation->readings ) {
+		next if $r->is_meta;
+		ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
+		my @lex = $r->lexemes;
+		my $lexstr = join( '', map { $_->string } @lex );
+		my $textstr = $r->text;
+		$textstr =~ s/\s+//g;
+		is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
+		foreach my $l ( @lex ) {
+			next unless $l->matches;
+			next if $l->is_disambiguated;
+	 		printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
+	 			join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
+			$ambig++;
+		}
+	}
+	is( $ambig, 4, "Found 4 ambiguous forms as expected" );
+}
+
+=end testing
+
+=cut
+
+our $dbhandle;
+
+sub lemmatize {
+	return __PACKAGE__->perseus_lemmatize( @_ );
+}
+
+sub reading_lookup {
+	return __PACKAGE__->perseus_reading_lookup( @_ );
+}
+
+1;
diff --git a/lib/Text/Tradition/Language/Greek.pm b/lib/Text/Tradition/Language/Greek.pm
new file mode 100644
index 0000000..28d69cf
--- /dev/null
+++ b/lib/Text/Tradition/Language/Greek.pm
@@ -0,0 +1,73 @@
+package Text::Tradition::Language::Greek;
+
+use strict;
+use warnings;
+use Module::Load;
+use parent qw/ Text::Tradition::Language::Perseus /;
+
+=head1 NAME
+
+Text::Tradition::Language::Greek - language-specific module for Greek
+
+=head1 DESCRIPTION
+
+Implements morphology lookup for Greek words in context.  This module
+depends on the Lingua::Morph::Perseus module for access to PhiloLogic database data.
+
+=head1 SUBROUTINES
+
+=head2 lemmatize( $text )
+
+Evaluates the string using Treetagger and Perseus, and returns the results.
+
+=begin testing
+
+use Text::Tradition;
+use_ok( 'Text::Tradition::Language::Greek' );
+
+eval "use Lingua::Morph::Perseus";
+my $err = $@;
+
+SKIP: {
+	skip "Greek linguistic data not read yet";
+
+	my $trad = Text::Tradition->new(
+		'language' => 'Greek',
+		'file' => 't/data/florilegium_graphml.xml',
+		'input' => 'Self' );
+	$trad->lemmatize();
+	my $ambig = 0;
+	foreach my $r ( $trad->collation->readings ) {
+		next if $r->is_meta;
+		ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
+		my @lex = $r->lexemes;
+		my $lexstr = join( '', map { $_->string } @lex );
+		my $textstr = $r->text;
+		$textstr =~ s/\s+//g;
+		is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
+		foreach my $l ( @lex ) {
+			next unless $l->matches;
+			next if $l->is_disambiguated;
+	 		printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
+	 			join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
+			$ambig++;
+		}
+	}
+	is( $ambig, 4, "Found 4 ambiguous forms as expected" );
+}
+
+=end testing
+
+=cut
+
+our $dbhandle;
+
+sub lemmatize {
+	return __PACKAGE__->perseus_lemmatize( @_ );
+}
+
+sub reading_lookup {
+	return __PACKAGE__->perseus_reading_lookup( @_ );
+}
+
+1;
diff --git a/lib/Text/Tradition/Language/Latin.pm b/lib/Text/Tradition/Language/Latin.pm
index df87505..2f4a42a 100644
--- a/lib/Text/Tradition/Language/Latin.pm
+++ b/lib/Text/Tradition/Language/Latin.pm
@@ -3,8 +3,7 @@ package Text::Tradition::Language::Latin;
 use strict;
 use warnings;
 use Module::Load;
-use Text::Tradition::Language::Base qw/ lemmatize_treetagger lfs_morph_tags /;
-use TryCatch;
+use parent qw/ Text::Tradition::Language::Perseus /;
 
 =head1 NAME
 
@@ -12,14 +11,14 @@ Text::Tradition::Language::Latin - language-specific module for Latin
 
 =head1 DESCRIPTION
 
-Implements morphology lookup for French words in context.  This module
+Implements morphology lookup for Latin words in context.  This module
 depends on the Lingua::Morph::Perseus module for access to PhiloLogic database data.
 
 =head1 SUBROUTINES
 
 =head2 lemmatize( $text )
 
-Evaluates the string using the Flemm package, and returns the results.
+Evaluates the string using Treetagger and Perseus, and returns the results.
 
 =begin testing
 
@@ -61,101 +60,14 @@ SKIP: {
 
 =cut
 
+our $dbhandle;
+
 sub lemmatize {
-	my $tradition = shift;
-	my %opts = ( 
-		'language' => 'Latin', 
-		'callback' => sub { _perseus_lookup_tt( @_ ) } 
-		);
-	return lemmatize_treetagger( $tradition, %opts );
+	return __PACKAGE__->perseus_lemmatize( @_ );
 }
 
-=head2 reading_lookup( $rdg[, $rdg, ...] )
-
-Looks up one or more readings using the Perseus package, and returns the
-possible results.  This skips the tree tagger / tokenizer, returning any
-match for the word string(s) in the morphology DB.
-
-=cut
-
 sub reading_lookup {
-	my @words = @_;
-	return map { _perseus_lookup_str( $_ ) } @words;
-}
-
-=head2 morphology_tags
-
-Return a data structure describing the available parts of speech and their attributes.
-
-=cut
-
-sub morphology_tags {
-	return lfs_morph_tags();
+	return __PACKAGE__->perseus_reading_lookup( @_ );
 }
 
-
-{
-	my $morph;
-	
-	sub _morph_connect {
-		unless( $morph ) {
-			try {
-				load 'Lingua::Morph::Perseus';
-				$morph = Lingua::Morph::Perseus->connect( 'Latin' );
-			} catch {
-				warn "Cannot do Latin word lemmatization without Lingua::Morph::Perseus: @_";
-				return;
-			}
-		}
-	}
-				
-	sub _perseus_lookup_tt {
-		my( $orig, $pos, $lemma ) = split( /\t/, $_[0] );
-		_morph_connect();
-		return unless $morph;
-		# Discard results that don't match the lemma, unless lemma is unknown
-		my $lookupopts = {};
-		unless( $lemma eq '<unknown>' || $lemma =~ /^\W+$/ ) {
-			my %lems;
-			map { $lems{$_} = 1; $lems{lc($_)} = 1 } split( /\|/, $lemma );
-			$lookupopts->{'lemma'} = [ keys %lems ];
-		}
-		$lookupopts->{'ttpos'} = $pos if $pos;
-		
-		my $result = $morph->lexicon_lookup( $orig, $lookupopts );
-		# unless( !keys( %$lookupopts ) ||  $result->{'filtered'} ) {
-		# 	warn "Filter on $pos / $lemma returned no results; using all results";
-		# }
-		my @ret = @{$result->{'objects'}};
-		my %unique_wordforms;
-		foreach my $obj ( @ret ) {
-			my $wf = _wordform_from_row( $obj );
-			$unique_wordforms{$wf->to_string} = $wf;
-		}
-		return values( %unique_wordforms );
-	}
-	
-	sub _perseus_lookup_str {
-		my( $orig ) = @_;
-		_morph_connect();
-		return unless $morph;
-		# Simple morph DB lookup, and return the results.
-		my $result = $morph->lexicon_lookup( $orig );
-		return map { _wordform_from_row( $_ ) } @{$result->{'objects'}};
-	}
-	
-}
-
-sub _wordform_from_row {
-	my( $rowobj ) = @_;
-	my $lemma = $rowobj->lemma;
-	$lemma =~ s/^(\D+)\d*$/$1/;
-	my $wf = Text::Tradition::Collation::Reading::WordForm->new(
-		'language' => 'Latin',
-		'lemma' => $lemma,
-		'morphology' => $rowobj->morphology,
-		);
-	return $wf;
-}
-	
 1;
diff --git a/lib/Text/Tradition/Language/Perseus.pm b/lib/Text/Tradition/Language/Perseus.pm
new file mode 100644
index 0000000..9bf71d8
--- /dev/null
+++ b/lib/Text/Tradition/Language/Perseus.pm
@@ -0,0 +1,129 @@
+package Text::Tradition::Language::Perseus;
+
+use strict;
+use warnings;
+use Module::Load;
+use Text::Tradition::Language::Base qw/ lemmatize_treetagger lfs_morph_tags /;
+use TryCatch;
+
+=head1 NAME
+
+Text::Tradition::Language::Perseus - base module for those languages that rely
+on a Lingua::Morph::Perseus database.
+
+=head1 DESCRIPTION
+
+Implements morphology lookup for words in context.  This module depends on the Lingua::Morph::Perseus module for access to PhiloLogic database data.
+
+=head1 SUBROUTINES
+
+=head2 lemmatize( $text )
+
+Evaluates the string using Treetagger and Perseus, and returns the results.
+
+=cut
+
+# tested in child language modules
+
+sub perseus_lemmatize {
+	my $self = shift;
+	my $tradition = shift;
+	my %opts = ( 
+		'language' => $tradition->language, 
+		'callback' => sub { _perseus_lookup_tt( $self, @_ ) } 
+		);
+	return lemmatize_treetagger( $tradition, %opts );
+}
+
+=head2 reading_lookup( $rdg[, $rdg, ...] )
+
+Looks up one or more readings using the Perseus package, and returns the
+possible results.  This skips the tree tagger / tokenizer, returning any
+match for the word string(s) in the morphology DB.
+
+=cut
+
+sub perseus_reading_lookup {
+	my( $self, @words ) = @_;
+	return map { $self->_perseus_lookup_str( $_ ) } @words;
+}
+
+=head2 morphology_tags
+
+Return a data structure describing the available parts of speech and their attributes.
+
+=cut
+
+sub morphology_tags {
+	return lfs_morph_tags();
+}
+
+sub _get_lang {
+	my $self = shift;
+	my @parts = split( /::/, $self );
+	return $parts[-1];
+}
+
+sub _morph_connect {
+	my $self = shift;
+	unless( $self::dbhandle ) {
+		my $lang = $self->_get_lang();
+		try {
+			load 'Lingua::Morph::Perseus';
+			$self::dbhandle = Lingua::Morph::Perseus->connect( $lang );
+		} catch {
+			warn "Cannot do $lang word lemmatization without Lingua::Morph::Perseus: @_";
+			return;
+		}
+	}
+}
+			
+sub _perseus_lookup_tt {
+	my $self = shift;
+	my( $orig, $pos, $lemma ) = split( /\t/, $_[0] );
+	$self->_morph_connect();
+	return unless $self::dbhandle;
+	# Discard results that don't match the lemma, unless lemma is unknown
+	my $lookupopts = {};
+	unless( $lemma eq '<unknown>' || $lemma =~ /^\W+$/ ) {
+		my %lems;
+		map { $lems{$_} = 1; $lems{lc($_)} = 1 } split( /\|/, $lemma );
+		$lookupopts->{'lemma'} = [ keys %lems ];
+	}
+	$lookupopts->{'ttpos'} = $pos if $pos;
+	
+	my $result = $self::dbhandle->lexicon_lookup( $orig, $lookupopts );
+	# unless( !keys( %$lookupopts ) ||  $result->{'filtered'} ) {
+	# 	warn "Filter on $pos / $lemma returned no results; using all results";
+	# }
+	my @ret = @{$result->{'objects'}};
+	my %unique_wordforms;
+	foreach my $obj ( @ret ) {
+		my $wf = $self->_wordform_from_row( $obj );
+		$unique_wordforms{$wf->to_string} = $wf;
+	}
+	return values( %unique_wordforms );
+}
+
+sub _perseus_lookup_str {
+	my( $self, $orig ) = @_;
+	$self->_morph_connect();
+	return unless $self::dbhandle;
+	# Simple morph DB lookup, and return the results.
+	my $result = $self::dbhandle->lexicon_lookup( $orig );
+	return map { $self->_wordform_from_row( $_ ) } @{$result->{'objects'}};
+}
+	
+sub _wordform_from_row {
+	my( $self, $rowobj ) = @_;
+	my $lemma = $rowobj->lemma;
+	$lemma =~ s/^(\D+)\d*$/$1/;
+	my $wf = Text::Tradition::Collation::Reading::WordForm->new(
+		'language' => $self->_get_lang(),
+		'lemma' => $lemma,
+		'morphology' => $rowobj->morphology,
+		);
+	return $wf;
+}
+	
+1;
diff --git a/t/00dependencies.t b/t/00dependencies.t
index 9b46598..e70a616 100644
--- a/t/00dependencies.t
+++ b/t/00dependencies.t
@@ -76,7 +76,7 @@ my %required;
 for ( sort keys %used ) {
     my $first_in = Module::CoreList->first_release($_);
     next if defined $first_in and $first_in <= 5.00803;
-    next if /^(Text::Tradition|inc|t|feature)(::|$)/;
+    next if /^(Text::Tradition|inc|t|feature|parent)(::|$)/;
 
     #warn $_;
     ok( exists $required{$_}, "$_ in Makefile.PL" )
diff --git a/t/text_tradition_language_armenian.t b/t/text_tradition_language_armenian.t
new file mode 100644
index 0000000..409f7df
--- /dev/null
+++ b/t/text_tradition_language_armenian.t
@@ -0,0 +1,49 @@
+#!/usr/bin/perl -w
+
+use strict;
+use Test::More 'no_plan';
+$| = 1;
+
+
+
+# =begin testing
+{
+use Text::Tradition;
+use_ok( 'Text::Tradition::Language::Armenian' );
+
+eval "use Lingua::Morph::Perseus";
+my $err = $@;
+
+SKIP: {
+	skip "No Armenian test data yet";
+
+	my $trad = Text::Tradition->new(
+		'language' => 'Armenian',
+		'file' => 't/data/legendfrag.xml',
+		'input' => 'Self' );
+	$trad->lemmatize();
+	my $ambig = 0;
+	foreach my $r ( $trad->collation->readings ) {
+		next if $r->is_meta;
+		ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
+		my @lex = $r->lexemes;
+		my $lexstr = join( '', map { $_->string } @lex );
+		my $textstr = $r->text;
+		$textstr =~ s/\s+//g;
+		is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
+		foreach my $l ( @lex ) {
+			next unless $l->matches;
+			next if $l->is_disambiguated;
+	 		printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
+	 			join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
+			$ambig++;
+		}
+	}
+	is( $ambig, 4, "Found 4 ambiguous forms as expected" );
+}
+}
+
+
+
+
+1;
diff --git a/t/text_tradition_language_greek.t b/t/text_tradition_language_greek.t
new file mode 100644
index 0000000..edfa653
--- /dev/null
+++ b/t/text_tradition_language_greek.t
@@ -0,0 +1,49 @@
+#!/usr/bin/perl -w
+
+use strict;
+use Test::More 'no_plan';
+$| = 1;
+
+
+
+# =begin testing
+{
+use Text::Tradition;
+use_ok( 'Text::Tradition::Language::Greek' );
+
+eval "use Lingua::Morph::Perseus";
+my $err = $@;
+
+SKIP: {
+	skip "Greek linguistic data not read yet";
+
+	my $trad = Text::Tradition->new(
+		'language' => 'Greek',
+		'file' => 't/data/florilegium_graphml.xml',
+		'input' => 'Self' );
+	$trad->lemmatize();
+	my $ambig = 0;
+	foreach my $r ( $trad->collation->readings ) {
+		next if $r->is_meta;
+		ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
+		my @lex = $r->lexemes;
+		my $lexstr = join( '', map { $_->string } @lex );
+		my $textstr = $r->text;
+		$textstr =~ s/\s+//g;
+		is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
+		foreach my $l ( @lex ) {
+			next unless $l->matches;
+			next if $l->is_disambiguated;
+	 		printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
+	 			join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
+			$ambig++;
+		}
+	}
+	is( $ambig, 4, "Found 4 ambiguous forms as expected" );
+}
+}
+
+
+
+
+1;