[scpubgit/stemmatology.git] / morphology / lib / Text / Tradition / Language / Perseus.pm

package Text::Tradition::Language::Perseus;

use strict;
use warnings;
use Module::Load;
use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger
	 lfs_morph_tags /;
use TryCatch;

=head1 NAME

Text::Tradition::Language::Perseus - base module for those languages that rely
on a Lingua::Morph::Perseus database.

=head1 DESCRIPTION

Implements morphology lookup for words in context.  This module depends on the Lingua::Morph::Perseus module for access to PhiloLogic database data.

=head1 SUBROUTINES

=head2 lemmatize( $text )

Evaluates the string using Treetagger and Perseus, and returns the results.

=cut

# tested in child language modules

sub perseus_lemmatize {
	my $self = shift;
	my $tradition = shift;
	my %opts = ( 
		'language' => $tradition->language, 
		'callback' => sub { _perseus_lookup_tt( $self, @_ ) } 
		);
	return lemmatize_treetagger( $tradition, %opts );
}

=head2 reading_lookup( $rdg[, $rdg, ...] )

Looks up one or more readings using the Perseus package, and returns the
possible results.  This skips the tree tagger / tokenizer, returning any
match for the word string(s) in the morphology DB.

=cut

sub perseus_reading_lookup {
	my( $self, @words ) = @_;
	my %opts = ( 
		'language' => $self->_get_lang(),
		'callback' => sub { _perseus_lookup_str( $self, @_ ) },
		'path' => \@words,
		);
	return reading_lookup_treetagger( %opts );
}

=head2 morphology_tags

Return a data structure describing the available parts of speech and their attributes.

=cut

sub morphology_tags {
	return lfs_morph_tags();
}

sub _get_lang {
	my $self = shift;
	my @parts = split( /::/, $self );
	return $parts[-1];
}

sub _morph_connect {
	my $self = shift;
	unless( $self::dbhandle ) {
		my $lang = $self->_get_lang();
		try {
			load 'Lingua::Morph::Perseus';
			$self::dbhandle = Lingua::Morph::Perseus->connect( $lang );
		} catch {
			warn "Cannot do $lang word lemmatization without Lingua::Morph::Perseus: @_";
			return;
		}
	}
}
			
sub _perseus_lookup_tt {
	my $self = shift;
	my( $orig, $pos, $lemma ) = split( /\t/, $_[0] );
	$self->_morph_connect();
	return unless $self::dbhandle;
	# Discard results that don't match the lemma, unless lemma is unknown
	my $lookupopts = {};
	unless( $lemma eq '<unknown>' || $lemma =~ /^\W+$/ ) {
		my %lems;
		map { $lems{$_} = 1; $lems{lc($_)} = 1 } split( /\|/, $lemma );
		$lookupopts->{'lemma'} = [ keys %lems ];
	}
	$lookupopts->{'ttpos'} = $pos if $pos;
	
	my $result = $self::dbhandle->lexicon_lookup( $orig, $lookupopts );
	# unless( !keys( %$lookupopts ) ||  $result->{'filtered'} ) {
	# 	warn "Filter on $pos / $lemma returned no results; using all results";
	# }
	my @ret = @{$result->{'objects'}};
	my %unique_wordforms;
	foreach my $obj ( @ret ) {
		my $wf = $self->_wordform_from_row( $obj );
		$unique_wordforms{$wf->to_string} = $wf;
	}
	return values( %unique_wordforms );
}

sub _perseus_lookup_str {
	my $self = shift;
	my ( $orig, $pos, $lemma ) = split( /\t/, $_[0] );
	$self->_morph_connect();
	return unless $self::dbhandle;
	# Simple morph DB lookup, and return the results. Disregard the treetagger.
	my $result = $self::dbhandle->lexicon_lookup( $orig );
	return map { $self->_wordform_from_row( $_ ) } @{$result->{'objects'}};
}
	
sub _wordform_from_row {
	my( $self, $rowobj ) = @_;
	my $lemma = $rowobj->lemma;
	$lemma =~ s/^(\D+)\d*$/$1/;
	my $wf = Text::Tradition::Collation::Reading::WordForm->new(
		'language' => $self->_get_lang(),
		'lemma' => $lemma,
		'morphology' => $rowobj->morphology,
		);
	return $wf;
}
	
1;

=head1 ACKNOWLEDGMENTS

The Perseus model and its dependents (currently Latin and Greek) draws its
data from the ARTFL project at the University of Chicago. The module author
wishes to thank Helma Dik in particular for her kind assistance.
Commit	Line	Data
0ce8c0cf	1	package Text::Tradition::Language::Perseus;
	2
	3	use strict;
	4	use warnings;
	5	use Module::Load;
b0f883e0	6	use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger
b0f883e0	7	lfs_morph_tags /;
0ce8c0cf	8	use TryCatch;
	9
	10	=head1 NAME
	11
	12	Text::Tradition::Language::Perseus - base module for those languages that rely
	13	on a Lingua::Morph::Perseus database.
	14
	15	=head1 DESCRIPTION
	16
	17	Implements morphology lookup for words in context. This module depends on the Lingua::Morph::Perseus module for access to PhiloLogic database data.
	18
	19	=head1 SUBROUTINES
	20
	21	=head2 lemmatize( $text )
	22
	23	Evaluates the string using Treetagger and Perseus, and returns the results.
	24
	25	=cut
	26
	27	# tested in child language modules
	28
	29	sub perseus_lemmatize {
	30	my $self = shift;
	31	my $tradition = shift;
	32	my %opts = (
	33	'language' => $tradition->language,
	34	'callback' => sub { _perseus_lookup_tt( $self, @_ ) }
	35	);
	36	return lemmatize_treetagger( $tradition, %opts );
	37	}
	38
	39	=head2 reading_lookup( $rdg[, $rdg, ...] )
	40
	41	Looks up one or more readings using the Perseus package, and returns the
	42	possible results. This skips the tree tagger / tokenizer, returning any
	43	match for the word string(s) in the morphology DB.
	44
	45	=cut
	46
	47	sub perseus_reading_lookup {
	48	my( $self, @words ) = @_;
b0f883e0	49	my %opts = (
	50	'language' => $self->_get_lang(),
	51	'callback' => sub { _perseus_lookup_str( $self, @_ ) },
	52	'path' => \@words,
	53	);
	54	return reading_lookup_treetagger( %opts );
0ce8c0cf	55	}
	56
	57	=head2 morphology_tags
	58
	59	Return a data structure describing the available parts of speech and their attributes.
	60
	61	=cut
	62
	63	sub morphology_tags {
	64	return lfs_morph_tags();
	65	}
	66
	67	sub _get_lang {
	68	my $self = shift;
	69	my @parts = split( /::/, $self );
	70	return $parts[-1];
	71	}
	72
	73	sub _morph_connect {
	74	my $self = shift;
	75	unless( $self::dbhandle ) {
	76	my $lang = $self->_get_lang();
	77	try {
	78	load 'Lingua::Morph::Perseus';
	79	$self::dbhandle = Lingua::Morph::Perseus->connect( $lang );
	80	} catch {
	81	warn "Cannot do $lang word lemmatization without Lingua::Morph::Perseus: @_";
	82	return;
	83	}
	84	}
	85	}
	86
	87	sub _perseus_lookup_tt {
	88	my $self = shift;
	89	my( $orig, $pos, $lemma ) = split( /\t/, $_[0] );
	90	$self->_morph_connect();
	91	return unless $self::dbhandle;
	92	# Discard results that don't match the lemma, unless lemma is unknown
	93	my $lookupopts = {};
	94	unless( $lemma eq '<unknown>' \|\| $lemma =~ /^\W+$/ ) {
	95	my %lems;
	96	map { $lems{$_} = 1; $lems{lc($_)} = 1 } split( /\\|/, $lemma );
	97	$lookupopts->{'lemma'} = [ keys %lems ];
	98	}
	99	$lookupopts->{'ttpos'} = $pos if $pos;
	100
	101	my $result = $self::dbhandle->lexicon_lookup( $orig, $lookupopts );
	102	# unless( !keys( %$lookupopts ) \|\| $result->{'filtered'} ) {
	103	# warn "Filter on $pos / $lemma returned no results; using all results";
	104	# }
	105	my @ret = @{$result->{'objects'}};
	106	my %unique_wordforms;
	107	foreach my $obj ( @ret ) {
	108	my $wf = $self->_wordform_from_row( $obj );
	109	$unique_wordforms{$wf->to_string} = $wf;
	110	}
	111	return values( %unique_wordforms );
	112	}
	113
	114	sub _perseus_lookup_str {
b0f883e0	115	my $self = shift;
b0f883e0	116	my ( $orig, $pos, $lemma ) = split( /\t/, $_[0] );
0ce8c0cf	117	$self->_morph_connect();
0ce8c0cf	118	return unless $self::dbhandle;
b0f883e0	119	# Simple morph DB lookup, and return the results. Disregard the treetagger.
0ce8c0cf	120	my $result = $self::dbhandle->lexicon_lookup( $orig );
	121	return map { $self->_wordform_from_row( $_ ) } @{$result->{'objects'}};
	122	}
	123
	124	sub _wordform_from_row {
	125	my( $self, $rowobj ) = @_;
	126	my $lemma = $rowobj->lemma;
	127	$lemma =~ s/^(\D+)\d*$/$1/;
	128	my $wf = Text::Tradition::Collation::Reading::WordForm->new(
	129	'language' => $self->_get_lang(),
	130	'lemma' => $lemma,
	131	'morphology' => $rowobj->morphology,
	132	);
	133	return $wf;
	134	}
	135
	136	1;
e92d4229	137
8943ff68	138	=head1 ACKNOWLEDGMENTS
e92d4229	139
8943ff68	140	The Perseus model and its dependents (currently Latin and Greek) draws its
	141	data from the ARTFL project at the University of Chicago. The module author
	142	wishes to thank Helma Dik in particular for her kind assistance.