make single-word lemmatization work
[scpubgit/stemmatology.git] / lib / Text / Tradition / Language / Perseus.pm
CommitLineData
0ce8c0cf 1package Text::Tradition::Language::Perseus;
2
3use strict;
4use warnings;
5use Module::Load;
b0f883e0 6use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger
7 lfs_morph_tags /;
0ce8c0cf 8use TryCatch;
9
10=head1 NAME
11
12Text::Tradition::Language::Perseus - base module for those languages that rely
13on a Lingua::Morph::Perseus database.
14
15=head1 DESCRIPTION
16
17Implements morphology lookup for words in context. This module depends on the Lingua::Morph::Perseus module for access to PhiloLogic database data.
18
19=head1 SUBROUTINES
20
21=head2 lemmatize( $text )
22
23Evaluates the string using Treetagger and Perseus, and returns the results.
24
25=cut
26
27# tested in child language modules
28
29sub perseus_lemmatize {
30 my $self = shift;
31 my $tradition = shift;
32 my %opts = (
33 'language' => $tradition->language,
34 'callback' => sub { _perseus_lookup_tt( $self, @_ ) }
35 );
36 return lemmatize_treetagger( $tradition, %opts );
37}
38
39=head2 reading_lookup( $rdg[, $rdg, ...] )
40
41Looks up one or more readings using the Perseus package, and returns the
42possible results. This skips the tree tagger / tokenizer, returning any
43match for the word string(s) in the morphology DB.
44
45=cut
46
47sub perseus_reading_lookup {
48 my( $self, @words ) = @_;
b0f883e0 49 my %opts = (
50 'language' => $self->_get_lang(),
51 'callback' => sub { _perseus_lookup_str( $self, @_ ) },
52 'path' => \@words,
53 );
54 return reading_lookup_treetagger( %opts );
0ce8c0cf 55}
56
57=head2 morphology_tags
58
59Return a data structure describing the available parts of speech and their attributes.
60
61=cut
62
63sub morphology_tags {
64 return lfs_morph_tags();
65}
66
67sub _get_lang {
68 my $self = shift;
69 my @parts = split( /::/, $self );
70 return $parts[-1];
71}
72
73sub _morph_connect {
74 my $self = shift;
75 unless( $self::dbhandle ) {
76 my $lang = $self->_get_lang();
77 try {
78 load 'Lingua::Morph::Perseus';
79 $self::dbhandle = Lingua::Morph::Perseus->connect( $lang );
80 } catch {
81 warn "Cannot do $lang word lemmatization without Lingua::Morph::Perseus: @_";
82 return;
83 }
84 }
85}
86
87sub _perseus_lookup_tt {
88 my $self = shift;
89 my( $orig, $pos, $lemma ) = split( /\t/, $_[0] );
90 $self->_morph_connect();
91 return unless $self::dbhandle;
92 # Discard results that don't match the lemma, unless lemma is unknown
93 my $lookupopts = {};
94 unless( $lemma eq '<unknown>' || $lemma =~ /^\W+$/ ) {
95 my %lems;
96 map { $lems{$_} = 1; $lems{lc($_)} = 1 } split( /\|/, $lemma );
97 $lookupopts->{'lemma'} = [ keys %lems ];
98 }
99 $lookupopts->{'ttpos'} = $pos if $pos;
100
101 my $result = $self::dbhandle->lexicon_lookup( $orig, $lookupopts );
102 # unless( !keys( %$lookupopts ) || $result->{'filtered'} ) {
103 # warn "Filter on $pos / $lemma returned no results; using all results";
104 # }
105 my @ret = @{$result->{'objects'}};
106 my %unique_wordforms;
107 foreach my $obj ( @ret ) {
108 my $wf = $self->_wordform_from_row( $obj );
109 $unique_wordforms{$wf->to_string} = $wf;
110 }
111 return values( %unique_wordforms );
112}
113
114sub _perseus_lookup_str {
b0f883e0 115 my $self = shift;
116 my ( $orig, $pos, $lemma ) = split( /\t/, $_[0] );
0ce8c0cf 117 $self->_morph_connect();
118 return unless $self::dbhandle;
b0f883e0 119 # Simple morph DB lookup, and return the results. Disregard the treetagger.
0ce8c0cf 120 my $result = $self::dbhandle->lexicon_lookup( $orig );
121 return map { $self->_wordform_from_row( $_ ) } @{$result->{'objects'}};
122}
123
124sub _wordform_from_row {
125 my( $self, $rowobj ) = @_;
126 my $lemma = $rowobj->lemma;
127 $lemma =~ s/^(\D+)\d*$/$1/;
128 my $wf = Text::Tradition::Collation::Reading::WordForm->new(
129 'language' => $self->_get_lang(),
130 'lemma' => $lemma,
131 'morphology' => $rowobj->morphology,
132 );
133 return $wf;
134}
135
1361;