Commit | Line | Data |
5271a011 |
1 | package Text::Tradition::Language::Latin; |
2 | |
3 | use strict; |
4 | use warnings; |
5 | use Module::Load; |
f8862b58 |
6 | use Text::Tradition::Language::Base qw/ lemmatize_treetagger lfs_morph_tags /; |
5271a011 |
7 | use TryCatch; |
8 | |
9 | =head1 NAME |
10 | |
11 | Text::Tradition::Language::Latin - language-specific module for Latin |
12 | |
13 | =head1 DESCRIPTION |
14 | |
15 | Implements morphology lookup for French words in context. This module |
f8862b58 |
16 | depends on the Lingua::Morph::Perseus module for access to PhiloLogic database data. |
5271a011 |
17 | |
18 | =head1 SUBROUTINES |
19 | |
20 | =head2 lemmatize( $text ) |
21 | |
22 | Evaluates the string using the Flemm package, and returns the results. |
23 | |
24 | =begin testing |
25 | |
26 | use Text::Tradition; |
27 | use_ok( 'Text::Tradition::Language::Latin' ); |
28 | |
f8862b58 |
29 | eval "use Lingua::Morph::Perseus"; |
5271a011 |
30 | my $err = $@; |
31 | |
32 | SKIP: { |
f8862b58 |
33 | skip "Package Lingua::Morph::Perseus not found" if $err; |
5271a011 |
34 | |
35 | my $trad = Text::Tradition->new( |
36 | 'language' => 'Latin', |
37 | 'file' => 't/data/legendfrag.xml', |
38 | 'input' => 'Self' ); |
39 | $trad->lemmatize(); |
40 | my $ambig = 0; |
41 | foreach my $r ( $trad->collation->readings ) { |
42 | next if $r->is_meta; |
43 | ok( $r->has_lexemes, "Reading $r has one or more lexemes" ); |
44 | my @lex = $r->lexemes; |
45 | my $lexstr = join( '', map { $_->string } @lex ); |
46 | my $textstr = $r->text; |
47 | $textstr =~ s/\s+//g; |
48 | is( $textstr, $lexstr, "Lexemes for reading $r match the reading" ); |
49 | foreach my $l ( @lex ) { |
fe77efe0 |
50 | next unless $l->matches; |
5271a011 |
51 | next if $l->is_disambiguated; |
52 | printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id, |
53 | join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) ); |
54 | $ambig++; |
55 | } |
56 | } |
f8862b58 |
57 | is( $ambig, 4, "Found 4 ambiguous forms as expected" ); |
5271a011 |
58 | } |
59 | |
60 | =end testing |
61 | |
62 | =cut |
63 | |
64 | sub lemmatize { |
65 | my $tradition = shift; |
66 | my %opts = ( |
67 | 'language' => 'Latin', |
68 | 'callback' => sub { _perseus_lookup_tt( @_ ) } |
69 | ); |
70 | return lemmatize_treetagger( $tradition, %opts ); |
71 | } |
72 | |
73 | =head2 reading_lookup( $rdg[, $rdg, ...] ) |
74 | |
75 | Looks up one or more readings using the Perseus package, and returns the |
76 | possible results. This skips the tree tagger / tokenizer, returning any |
77 | match for the word string(s) in the morphology DB. |
78 | |
79 | =cut |
80 | |
81 | sub reading_lookup { |
82 | my @words = @_; |
83 | return map { _perseus_lookup_str( $_ ) } @words; |
84 | } |
85 | |
75ae2b25 |
86 | =head2 morphology_tags |
87 | |
88 | Return a data structure describing the available parts of speech and their attributes. |
89 | |
90 | =cut |
91 | |
92 | sub morphology_tags { |
75ae2b25 |
93 | return lfs_morph_tags(); |
94 | } |
95 | |
96 | |
5271a011 |
97 | { |
98 | my $morph; |
99 | |
100 | sub _morph_connect { |
101 | unless( $morph ) { |
102 | try { |
f8862b58 |
103 | load 'Lingua::Morph::Perseus'; |
104 | $morph = Lingua::Morph::Perseus->connect( 'Latin' ); |
5271a011 |
105 | } catch { |
f8862b58 |
106 | warn "Cannot do Latin word lemmatization without Lingua::Morph::Perseus: @_"; |
5271a011 |
107 | return; |
108 | } |
109 | } |
110 | } |
ad2a2c47 |
111 | |
5271a011 |
112 | sub _perseus_lookup_tt { |
113 | my( $orig, $pos, $lemma ) = split( /\t/, $_[0] ); |
114 | _morph_connect(); |
f8862b58 |
115 | return unless $morph; |
5271a011 |
116 | # Discard results that don't match the lemma, unless lemma is unknown |
f8862b58 |
117 | my $lookupopts = {}; |
fe77efe0 |
118 | unless( $lemma eq '<unknown>' || $lemma =~ /^\W+$/ ) { |
ad2a2c47 |
119 | my %lems; |
120 | map { $lems{$_} = 1; $lems{lc($_)} = 1 } split( /\|/, $lemma ); |
121 | $lookupopts->{'lemma'} = [ keys %lems ]; |
5271a011 |
122 | } |
f8862b58 |
123 | $lookupopts->{'ttpos'} = $pos if $pos; |
5271a011 |
124 | |
f8862b58 |
125 | my $result = $morph->lexicon_lookup( $orig, $lookupopts ); |
126 | # unless( !keys( %$lookupopts ) || $result->{'filtered'} ) { |
127 | # warn "Filter on $pos / $lemma returned no results; using all results"; |
128 | # } |
129 | my @ret = @{$result->{'objects'}}; |
75ae2b25 |
130 | my %unique_wordforms; |
5271a011 |
131 | foreach my $obj ( @ret ) { |
75ae2b25 |
132 | my $wf = _wordform_from_row( $obj ); |
133 | $unique_wordforms{$wf->to_string} = $wf; |
5271a011 |
134 | } |
75ae2b25 |
135 | return values( %unique_wordforms ); |
5271a011 |
136 | } |
137 | |
138 | sub _perseus_lookup_str { |
139 | my( $orig ) = @_; |
140 | _morph_connect(); |
772edba8 |
141 | return unless $morph; |
5271a011 |
142 | # Simple morph DB lookup, and return the results. |
143 | my $result = $morph->lookup( $orig ); |
144 | return map { _wordform_from_row( $_ ) } @{$result->{'objects'}}; |
145 | } |
146 | |
5271a011 |
147 | } |
148 | |
fe77efe0 |
149 | sub _wordform_from_row { |
150 | my( $rowobj ) = @_; |
fe77efe0 |
151 | my $lemma = $rowobj->lemma; |
152 | $lemma =~ s/^(\D+)\d*$/$1/; |
153 | my $wf = Text::Tradition::Collation::Reading::WordForm->new( |
154 | 'language' => 'Latin', |
155 | 'lemma' => $lemma, |
f8862b58 |
156 | 'morphology' => $rowobj->morphology, |
fe77efe0 |
157 | ); |
158 | return $wf; |
159 | } |
160 | |
5271a011 |
161 | 1; |