make sure all tests work in all combos, save the broken Directory deletion; POD doc...
[scpubgit/stemmatology.git] / morphology / lib / Text / Tradition / Language / Perseus.pm
CommitLineData
0ce8c0cf 1package Text::Tradition::Language::Perseus;
2
3use strict;
4use warnings;
5use Module::Load;
b0f883e0 6use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger
7 lfs_morph_tags /;
0ce8c0cf 8use TryCatch;
9
10=head1 NAME
11
12Text::Tradition::Language::Perseus - base module for those languages that rely
13on a Lingua::Morph::Perseus database.
14
15=head1 DESCRIPTION
16
17Implements morphology lookup for words in context. This module depends on the Lingua::Morph::Perseus module for access to PhiloLogic database data.
18
19=head1 SUBROUTINES
20
21=head2 lemmatize( $text )
22
23Evaluates the string using Treetagger and Perseus, and returns the results.
24
25=cut
26
27# tested in child language modules
28
29sub perseus_lemmatize {
30 my $self = shift;
31 my $tradition = shift;
32 my %opts = (
33 'language' => $tradition->language,
34 'callback' => sub { _perseus_lookup_tt( $self, @_ ) }
35 );
36 return lemmatize_treetagger( $tradition, %opts );
37}
38
39=head2 reading_lookup( $rdg[, $rdg, ...] )
40
41Looks up one or more readings using the Perseus package, and returns the
42possible results. This skips the tree tagger / tokenizer, returning any
43match for the word string(s) in the morphology DB.
44
45=cut
46
47sub perseus_reading_lookup {
48 my( $self, @words ) = @_;
b0f883e0 49 my %opts = (
50 'language' => $self->_get_lang(),
51 'callback' => sub { _perseus_lookup_str( $self, @_ ) },
52 'path' => \@words,
53 );
54 return reading_lookup_treetagger( %opts );
0ce8c0cf 55}
56
57=head2 morphology_tags
58
59Return a data structure describing the available parts of speech and their attributes.
60
61=cut
62
63sub morphology_tags {
64 return lfs_morph_tags();
65}
66
67sub _get_lang {
68 my $self = shift;
69 my @parts = split( /::/, $self );
70 return $parts[-1];
71}
72
73sub _morph_connect {
74 my $self = shift;
75 unless( $self::dbhandle ) {
76 my $lang = $self->_get_lang();
77 try {
78 load 'Lingua::Morph::Perseus';
79 $self::dbhandle = Lingua::Morph::Perseus->connect( $lang );
80 } catch {
81 warn "Cannot do $lang word lemmatization without Lingua::Morph::Perseus: @_";
82 return;
83 }
84 }
85}
86
87sub _perseus_lookup_tt {
88 my $self = shift;
89 my( $orig, $pos, $lemma ) = split( /\t/, $_[0] );
90 $self->_morph_connect();
91 return unless $self::dbhandle;
92 # Discard results that don't match the lemma, unless lemma is unknown
93 my $lookupopts = {};
94 unless( $lemma eq '<unknown>' || $lemma =~ /^\W+$/ ) {
95 my %lems;
96 map { $lems{$_} = 1; $lems{lc($_)} = 1 } split( /\|/, $lemma );
97 $lookupopts->{'lemma'} = [ keys %lems ];
98 }
99 $lookupopts->{'ttpos'} = $pos if $pos;
100
101 my $result = $self::dbhandle->lexicon_lookup( $orig, $lookupopts );
102 # unless( !keys( %$lookupopts ) || $result->{'filtered'} ) {
103 # warn "Filter on $pos / $lemma returned no results; using all results";
104 # }
105 my @ret = @{$result->{'objects'}};
106 my %unique_wordforms;
107 foreach my $obj ( @ret ) {
108 my $wf = $self->_wordform_from_row( $obj );
109 $unique_wordforms{$wf->to_string} = $wf;
110 }
111 return values( %unique_wordforms );
112}
113
114sub _perseus_lookup_str {
b0f883e0 115 my $self = shift;
116 my ( $orig, $pos, $lemma ) = split( /\t/, $_[0] );
0ce8c0cf 117 $self->_morph_connect();
118 return unless $self::dbhandle;
b0f883e0 119 # Simple morph DB lookup, and return the results. Disregard the treetagger.
0ce8c0cf 120 my $result = $self::dbhandle->lexicon_lookup( $orig );
121 return map { $self->_wordform_from_row( $_ ) } @{$result->{'objects'}};
122}
123
124sub _wordform_from_row {
125 my( $self, $rowobj ) = @_;
126 my $lemma = $rowobj->lemma;
127 $lemma =~ s/^(\D+)\d*$/$1/;
128 my $wf = Text::Tradition::Collation::Reading::WordForm->new(
129 'language' => $self->_get_lang(),
130 'lemma' => $lemma,
131 'morphology' => $rowobj->morphology,
132 );
133 return $wf;
134}
135
1361;
e92d4229 137
8943ff68 138=head1 ACKNOWLEDGMENTS
e92d4229 139
8943ff68 140The Perseus model and its dependents (currently Latin and Greek) draws its
141data from the ARTFL project at the University of Chicago. The module author
142wishes to thank Helma Dik in particular for her kind assistance.