generalize Latin module to Latin/Greek/Armenian on Perseus
[scpubgit/stemmatology.git] / lib / Text / Tradition / Language / Perseus.pm
CommitLineData
0ce8c0cf 1package Text::Tradition::Language::Perseus;
2
3use strict;
4use warnings;
5use Module::Load;
6use Text::Tradition::Language::Base qw/ lemmatize_treetagger lfs_morph_tags /;
7use TryCatch;
8
9=head1 NAME
10
11Text::Tradition::Language::Perseus - base module for those languages that rely
12on a Lingua::Morph::Perseus database.
13
14=head1 DESCRIPTION
15
16Implements morphology lookup for words in context. This module depends on the Lingua::Morph::Perseus module for access to PhiloLogic database data.
17
18=head1 SUBROUTINES
19
20=head2 lemmatize( $text )
21
22Evaluates the string using Treetagger and Perseus, and returns the results.
23
24=cut
25
26# tested in child language modules
27
28sub perseus_lemmatize {
29 my $self = shift;
30 my $tradition = shift;
31 my %opts = (
32 'language' => $tradition->language,
33 'callback' => sub { _perseus_lookup_tt( $self, @_ ) }
34 );
35 return lemmatize_treetagger( $tradition, %opts );
36}
37
38=head2 reading_lookup( $rdg[, $rdg, ...] )
39
40Looks up one or more readings using the Perseus package, and returns the
41possible results. This skips the tree tagger / tokenizer, returning any
42match for the word string(s) in the morphology DB.
43
44=cut
45
46sub perseus_reading_lookup {
47 my( $self, @words ) = @_;
48 return map { $self->_perseus_lookup_str( $_ ) } @words;
49}
50
51=head2 morphology_tags
52
53Return a data structure describing the available parts of speech and their attributes.
54
55=cut
56
57sub morphology_tags {
58 return lfs_morph_tags();
59}
60
61sub _get_lang {
62 my $self = shift;
63 my @parts = split( /::/, $self );
64 return $parts[-1];
65}
66
67sub _morph_connect {
68 my $self = shift;
69 unless( $self::dbhandle ) {
70 my $lang = $self->_get_lang();
71 try {
72 load 'Lingua::Morph::Perseus';
73 $self::dbhandle = Lingua::Morph::Perseus->connect( $lang );
74 } catch {
75 warn "Cannot do $lang word lemmatization without Lingua::Morph::Perseus: @_";
76 return;
77 }
78 }
79}
80
81sub _perseus_lookup_tt {
82 my $self = shift;
83 my( $orig, $pos, $lemma ) = split( /\t/, $_[0] );
84 $self->_morph_connect();
85 return unless $self::dbhandle;
86 # Discard results that don't match the lemma, unless lemma is unknown
87 my $lookupopts = {};
88 unless( $lemma eq '<unknown>' || $lemma =~ /^\W+$/ ) {
89 my %lems;
90 map { $lems{$_} = 1; $lems{lc($_)} = 1 } split( /\|/, $lemma );
91 $lookupopts->{'lemma'} = [ keys %lems ];
92 }
93 $lookupopts->{'ttpos'} = $pos if $pos;
94
95 my $result = $self::dbhandle->lexicon_lookup( $orig, $lookupopts );
96 # unless( !keys( %$lookupopts ) || $result->{'filtered'} ) {
97 # warn "Filter on $pos / $lemma returned no results; using all results";
98 # }
99 my @ret = @{$result->{'objects'}};
100 my %unique_wordforms;
101 foreach my $obj ( @ret ) {
102 my $wf = $self->_wordform_from_row( $obj );
103 $unique_wordforms{$wf->to_string} = $wf;
104 }
105 return values( %unique_wordforms );
106}
107
108sub _perseus_lookup_str {
109 my( $self, $orig ) = @_;
110 $self->_morph_connect();
111 return unless $self::dbhandle;
112 # Simple morph DB lookup, and return the results.
113 my $result = $self::dbhandle->lexicon_lookup( $orig );
114 return map { $self->_wordform_from_row( $_ ) } @{$result->{'objects'}};
115}
116
117sub _wordform_from_row {
118 my( $self, $rowobj ) = @_;
119 my $lemma = $rowobj->lemma;
120 $lemma =~ s/^(\D+)\d*$/$1/;
121 my $wf = Text::Tradition::Collation::Reading::WordForm->new(
122 'language' => $self->_get_lang(),
123 'lemma' => $lemma,
124 'morphology' => $rowobj->morphology,
125 );
126 return $wf;
127}
128
1291;