Commit | Line | Data |
0ce8c0cf |
1 | package Text::Tradition::Language::Perseus; |
2 | |
3 | use strict; |
4 | use warnings; |
5 | use Module::Load; |
b0f883e0 |
6 | use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger |
7 | lfs_morph_tags /; |
0ce8c0cf |
8 | use TryCatch; |
9 | |
10 | =head1 NAME |
11 | |
12 | Text::Tradition::Language::Perseus - base module for those languages that rely |
13 | on a Lingua::Morph::Perseus database. |
14 | |
15 | =head1 DESCRIPTION |
16 | |
17 | Implements morphology lookup for words in context. This module depends on the Lingua::Morph::Perseus module for access to PhiloLogic database data. |
18 | |
19 | =head1 SUBROUTINES |
20 | |
21 | =head2 lemmatize( $text ) |
22 | |
23 | Evaluates the string using Treetagger and Perseus, and returns the results. |
24 | |
25 | =cut |
26 | |
27 | # tested in child language modules |
28 | |
29 | sub perseus_lemmatize { |
30 | my $self = shift; |
31 | my $tradition = shift; |
32 | my %opts = ( |
33 | 'language' => $tradition->language, |
34 | 'callback' => sub { _perseus_lookup_tt( $self, @_ ) } |
35 | ); |
36 | return lemmatize_treetagger( $tradition, %opts ); |
37 | } |
38 | |
39 | =head2 reading_lookup( $rdg[, $rdg, ...] ) |
40 | |
41 | Looks up one or more readings using the Perseus package, and returns the |
42 | possible results. This skips the tree tagger / tokenizer, returning any |
43 | match for the word string(s) in the morphology DB. |
44 | |
45 | =cut |
46 | |
47 | sub perseus_reading_lookup { |
48 | my( $self, @words ) = @_; |
b0f883e0 |
49 | my %opts = ( |
50 | 'language' => $self->_get_lang(), |
51 | 'callback' => sub { _perseus_lookup_str( $self, @_ ) }, |
52 | 'path' => \@words, |
53 | ); |
54 | return reading_lookup_treetagger( %opts ); |
0ce8c0cf |
55 | } |
56 | |
57 | =head2 morphology_tags |
58 | |
59 | Return a data structure describing the available parts of speech and their attributes. |
60 | |
61 | =cut |
62 | |
63 | sub morphology_tags { |
64 | return lfs_morph_tags(); |
65 | } |
66 | |
67 | sub _get_lang { |
68 | my $self = shift; |
69 | my @parts = split( /::/, $self ); |
70 | return $parts[-1]; |
71 | } |
72 | |
73 | sub _morph_connect { |
74 | my $self = shift; |
75 | unless( $self::dbhandle ) { |
76 | my $lang = $self->_get_lang(); |
77 | try { |
78 | load 'Lingua::Morph::Perseus'; |
79 | $self::dbhandle = Lingua::Morph::Perseus->connect( $lang ); |
80 | } catch { |
81 | warn "Cannot do $lang word lemmatization without Lingua::Morph::Perseus: @_"; |
82 | return; |
83 | } |
84 | } |
85 | } |
86 | |
87 | sub _perseus_lookup_tt { |
88 | my $self = shift; |
89 | my( $orig, $pos, $lemma ) = split( /\t/, $_[0] ); |
90 | $self->_morph_connect(); |
91 | return unless $self::dbhandle; |
92 | # Discard results that don't match the lemma, unless lemma is unknown |
93 | my $lookupopts = {}; |
94 | unless( $lemma eq '<unknown>' || $lemma =~ /^\W+$/ ) { |
95 | my %lems; |
96 | map { $lems{$_} = 1; $lems{lc($_)} = 1 } split( /\|/, $lemma ); |
97 | $lookupopts->{'lemma'} = [ keys %lems ]; |
98 | } |
99 | $lookupopts->{'ttpos'} = $pos if $pos; |
100 | |
101 | my $result = $self::dbhandle->lexicon_lookup( $orig, $lookupopts ); |
102 | # unless( !keys( %$lookupopts ) || $result->{'filtered'} ) { |
103 | # warn "Filter on $pos / $lemma returned no results; using all results"; |
104 | # } |
105 | my @ret = @{$result->{'objects'}}; |
106 | my %unique_wordforms; |
107 | foreach my $obj ( @ret ) { |
108 | my $wf = $self->_wordform_from_row( $obj ); |
109 | $unique_wordforms{$wf->to_string} = $wf; |
110 | } |
111 | return values( %unique_wordforms ); |
112 | } |
113 | |
114 | sub _perseus_lookup_str { |
b0f883e0 |
115 | my $self = shift; |
116 | my ( $orig, $pos, $lemma ) = split( /\t/, $_[0] ); |
0ce8c0cf |
117 | $self->_morph_connect(); |
118 | return unless $self::dbhandle; |
b0f883e0 |
119 | # Simple morph DB lookup, and return the results. Disregard the treetagger. |
0ce8c0cf |
120 | my $result = $self::dbhandle->lexicon_lookup( $orig ); |
121 | return map { $self->_wordform_from_row( $_ ) } @{$result->{'objects'}}; |
122 | } |
123 | |
124 | sub _wordform_from_row { |
125 | my( $self, $rowobj ) = @_; |
126 | my $lemma = $rowobj->lemma; |
127 | $lemma =~ s/^(\D+)\d*$/$1/; |
128 | my $wf = Text::Tradition::Collation::Reading::WordForm->new( |
129 | 'language' => $self->_get_lang(), |
130 | 'lemma' => $lemma, |
131 | 'morphology' => $rowobj->morphology, |
132 | ); |
133 | return $wf; |
134 | } |
135 | |
136 | 1; |
e92d4229 |
137 | |
8943ff68 |
138 | =head1 ACKNOWLEDGMENTS |
e92d4229 |
139 | |
8943ff68 |
140 | The Perseus model and its dependents (currently Latin and Greek) draws its |
141 | data from the ARTFL project at the University of Chicago. The module author |
142 | wishes to thank Helma Dik in particular for her kind assistance. |