X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2FText%2FTradition%2FLanguage%2FLatin.pm;h=1d278b5ca87f124b87a416c196b52ff4bbcf8af0;hb=f8862b584dcc04728d3bff48ea7c19cb9a078772;hp=9f117675a32ea0730c6c7a05600d139befc2f179;hpb=fe77efe0d84ff0d31dc2ce020b73bf57783d464e;p=scpubgit%2Fstemmatology.git

diff --git a/lib/Text/Tradition/Language/Latin.pm b/lib/Text/Tradition/Language/Latin.pm
index 9f11767..1d278b5 100644
--- a/lib/Text/Tradition/Language/Latin.pm
+++ b/lib/Text/Tradition/Language/Latin.pm
@@ -3,7 +3,7 @@ package Text::Tradition::Language::Latin;
 use strict;
 use warnings;
 use Module::Load;
-use Text::Tradition::Language::Base qw/ lemmatize_treetagger treetagger_struct /;
+use Text::Tradition::Language::Base qw/ lemmatize_treetagger lfs_morph_tags /;
 use TryCatch;
 
 =head1 NAME
@@ -13,10 +13,7 @@ Text::Tradition::Language::Latin - language-specific module for Latin
 =head1 DESCRIPTION
 
 Implements morphology lookup for French words in context.  This module
-depends on the Morph::Perseus module for access to PhiloLogic database data.
-It also depends on the TreeTagger software
-(L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is
-(for now) expected to be installed in $MORPHDIR/TreeTagger.
+depends on the Lingua::Morph::Perseus module for access to PhiloLogic database data.
 
 =head1 SUBROUTINES
 
@@ -29,11 +26,11 @@ Evaluates the string using the Flemm package, and returns the results.
 use Text::Tradition;
 use_ok( 'Text::Tradition::Language::Latin' );
 
-eval "use Morph::Perseus";
+eval "use Lingua::Morph::Perseus";
 my $err = $@;
 
 SKIP: {
-	skip "Package Morph::Perseus not found" if $err;
+	skip "Package Lingua::Morph::Perseus not found" if $err;
 
 	my $trad = Text::Tradition->new(
 		'language' => 'Latin',
@@ -57,7 +54,7 @@ SKIP: {
 			$ambig++;
 		}
 	}
-	is( $ambig, 7, "Found 7 ambiguous forms as expected" );
+	is( $ambig, 4, "Found 4 ambiguous forms as expected" );
 }
 
 =end testing
@@ -86,17 +83,27 @@ sub reading_lookup {
 	return map { _perseus_lookup_str( $_ ) } @words;
 }
 
+=head2 morphology_tags
+
+Return a data structure describing the available parts of speech and their attributes.
+
+=cut
+
+sub morphology_tags {
+	return lfs_morph_tags();
+}
+
+
 {
 	my $morph;
 	
 	sub _morph_connect {
 		unless( $morph ) {
 			try {
-				load 'Morph::Perseus';
-				load 'Morph::Perseus::Structure';
-				$morph = Morph::Perseus->connect( 'Latin' );
+				load 'Lingua::Morph::Perseus';
+				$morph = Lingua::Morph::Perseus->connect( 'Latin' );
 			} catch {
-				warn "Cannot do Latin word lemmatization without Morph::Perseus: @_";
+				warn "Cannot do Latin word lemmatization without Lingua::Morph::Perseus: @_";
 				return;
 			}
 		}
@@ -110,7 +117,6 @@ sub reading_lookup {
 	#  T carne -> carnis
 	#  T melius -> bonus
 	
-
 	my %excep = (
 		'absens' => 'absum',
 		'aperte' => 'apertus',
@@ -145,53 +151,40 @@ sub reading_lookup {
 	sub _perseus_lookup_tt {
 		my( $orig, $pos, $lemma ) = split( /\t/, $_[0] );
 		_morph_connect();
-		my $result = $morph->lookup( $orig );
+		return unless $morph;
 		# Discard results that don't match the lemma, unless lemma is unknown
-		my @orig = @{$result->{'objects'}};
-		my @ret;
+		my $lookupopts = {};
 		unless( $lemma eq '<unknown>' || $lemma =~ /^\W+$/ ) {
 			# TODO Perseus lemma might have a number on the end, yuck.
 			#  multiple lemmata separated with |
 			$lemma =~ s/[^\w|]//g;
 			$lemma = $excep{$lemma} if exists $excep{$lemma};
 			$lemma =~ s/j/i/g;
-			my %lems;
-			my @forms = 
-			map { $lems{$_} = 1; $lems{lc($_)} = 1 } split( /\|/, $lemma );
-			# Now match the lemmata from Treetagger to the lemmata and alt_ls 
-			# from Perseus.
-			@ret = grep { 
-				my $x = $_->lemma; 
-				$x =~ s/\d+$//;
-				my $y = $_->alt_lex || '';
-				$y =~ s/\d+$//;
-				$lems{$x} || $lems{$y};
-				} @orig;
-			warn "TreeTagger lemma $lemma matched no results from Perseus for $orig"
-				if @orig && !@ret;
+			if( $lemma ) { # if we have anything left...
+				my %lems;
+				map { $lems{$_} = 1; $lems{lc($_)} = 1 } split( /\|/, $lemma );
+				$lookupopts->{'lemma'} = [ keys %lems ];
+			}
 		}
-		@ret = @orig unless @ret;
+		$lookupopts->{'ttpos'} = $pos if $pos;
 		
-		my @wordforms;
+		my $result = $morph->lexicon_lookup( $orig, $lookupopts );
+		# unless( !keys( %$lookupopts ) ||  $result->{'filtered'} ) {
+		# 	warn "Filter on $pos / $lemma returned no results; using all results";
+		# }
+		my @ret = @{$result->{'objects'}};
+		my %unique_wordforms;
 		foreach my $obj ( @ret ) {
-			push( @wordforms, _wordform_from_row( $obj ) );
+			my $wf = _wordform_from_row( $obj );
+			$unique_wordforms{$wf->to_string} = $wf;
 		}
-		## TODO Use TreeTagger info - requires serious hacking of Lingua::TagSet
-		# Discard results that don't match the given TreeTagger POS, unless
-		# that leaves zero results
-# 		my $ttstruct = treetagger_struct( $pos );
-# 		my @ttmatch = grep { $ttstruct->is_compatible( $_->morphology ) } @wordforms;
-# 		unless( @ttmatch ) {
-# 			warn "TreeTagger POS $pos matched no results from Perseus for $orig";
-# 			@ttmatch = @wordforms;
-# 		}
-# 		return @ttmatch;
-		return @wordforms;
+		return values( %unique_wordforms );
 	}
 	
 	sub _perseus_lookup_str {
 		my( $orig ) = @_;
 		_morph_connect();
+		return unless $morph;
 		# Simple morph DB lookup, and return the results.
 		my $result = $morph->lookup( $orig );
 		return map { _wordform_from_row( $_ ) } @{$result->{'objects'}};
@@ -201,20 +194,12 @@ sub reading_lookup {
 
 sub _wordform_from_row {
 	my( $rowobj ) = @_;
-	my $mpstruct;
-	try {
-		$mpstruct = Morph::Perseus::Structure->from_tag( $rowobj->code );
-	} catch {
-		warn "Could not create morphology structure from "
-			. $rowobj->code . ": $!";
-	}
-	$DB::single = 1 unless $mpstruct;
 	my $lemma = $rowobj->lemma;
 	$lemma =~ s/^(\D+)\d*$/$1/;
 	my $wf = Text::Tradition::Collation::Reading::WordForm->new(
 		'language' => 'Latin',
 		'lemma' => $lemma,
-		'morphology' => $mpstruct,
+		'morphology' => $rowobj->morphology,
 		);
 	return $wf;
 }