Commit | Line | Data |
a445ce40 |
1 | package Text::Tradition::Morphology; |
2 | |
3 | use strict; |
4 | use warnings; |
5 | use JSON qw/ from_json /; |
6 | use Moose::Role; |
7 | use Module::Load; |
8 | use Text::Tradition::Collation::Reading::Lexeme; |
9 | |
483478bf |
10 | use vars qw/ $VERSION /; |
11 | $VERSION = "0.1"; |
12 | |
a445ce40 |
13 | =head1 NAME |
14 | |
8943ff68 |
15 | Text::Tradition::Morphology - morphology plugin for Text::Tradition |
16 | |
17 | =head1 DESCRIPTION |
18 | |
19 | The Text::Tradition::Morphology package enables lemma and part-of-speech |
20 | information for traditions and their Reading objects. This distribution |
21 | includes the L<Text::Tradition::Language> role for Traditions, the |
22 | L<Text::Tradition::Morphology> role (this package) for Readings, and a set |
23 | of Language::* modules for language-specific lemmatization. |
24 | |
25 | See L<Text::Tradition::Collation::Reading::Lexeme> for more about the |
26 | morphology object structure. |
a445ce40 |
27 | |
28 | =cut |
29 | |
332750fc |
30 | requires 'is_identical', 'is_combinable', '_combine'; |
31 | |
58f9c2b9 |
32 | has 'language' => ( |
33 | is => 'ro', |
34 | isa => 'Str', |
35 | predicate => 'has_language', |
36 | ); |
37 | |
a445ce40 |
38 | has 'grammar_invalid' => ( |
39 | is => 'rw', |
40 | isa => 'Bool', |
41 | default => undef, |
42 | ); |
43 | |
44 | has 'is_nonsense' => ( |
45 | is => 'rw', |
46 | isa => 'Bool', |
47 | default => undef, |
48 | ); |
49 | |
50 | has 'normal_form' => ( |
51 | is => 'rw', |
52 | isa => 'Str', |
53 | predicate => '_has_normal_form', |
54 | clearer => '_clear_normal_form', |
55 | ); |
56 | |
57 | # Holds the lexemes for the reading. |
58 | has 'reading_lexemes' => ( |
59 | traits => ['Array'], |
60 | isa => 'ArrayRef[Text::Tradition::Collation::Reading::Lexeme]', |
61 | handles => { |
62 | lexeme => 'get', |
63 | lexemes => 'elements', |
64 | has_lexemes => 'count', |
65 | clear_lexemes => 'clear', |
66 | add_lexeme => 'push', |
67 | }, |
68 | default => sub { [] }, |
69 | ); |
70 | |
71 | |
72 | |
73 | # Make normal_form default to text, transparently. |
74 | around 'normal_form' => sub { |
75 | my $orig = shift; |
76 | my $self = shift; |
77 | my( $arg ) = @_; |
78 | if( $arg && $arg eq $self->text ) { |
79 | $self->_clear_normal_form; |
80 | return $arg; |
81 | } elsif( !$arg && !$self->_has_normal_form ) { |
82 | return $self->text; |
83 | } else { |
84 | $self->$orig( @_ ); |
85 | } |
86 | }; |
87 | |
8943ff68 |
88 | =head1 READING METHODS |
a445ce40 |
89 | |
90 | Methods for the morphological information (if any) attached to readings. |
91 | A reading may be made up of multiple lexemes; the concatenated lexeme |
92 | strings ought to match the reading's normalized form. |
93 | |
94 | See L<Text::Tradition::Collation::Reading::Lexeme> for more information |
95 | on Lexeme objects and their attributes. |
96 | |
97 | =head2 has_lexemes |
98 | |
99 | Returns a true value if the reading has any attached lexemes. |
100 | |
101 | =head2 lexemes |
102 | |
103 | Returns the Lexeme objects (if any) attached to the reading. |
104 | |
105 | =head2 clear_lexemes |
106 | |
107 | Wipes any associated Lexeme objects out of the reading. |
108 | |
109 | =head2 add_lexeme( $lexobj ) |
110 | |
111 | Adds the Lexeme in $lexobj to the list of lexemes. |
112 | |
113 | =head2 lemmatize |
114 | |
115 | If the language of the reading is set, this method will use the appropriate |
116 | Language model to determine the lexemes that belong to this reading. See |
8943ff68 |
117 | L<Text::Tradition::Language::lemmatize> if you wish to lemmatize an entire tradition. |
a445ce40 |
118 | |
119 | =cut |
120 | |
121 | sub lemmatize { |
122 | my $self = shift; |
123 | unless( $self->has_language ) { |
124 | warn "Please set a language to lemmatize a tradition"; |
125 | return; |
126 | } |
127 | my $mod = "Text::Tradition::Language::" . $self->language; |
128 | load( $mod ); |
129 | $mod->can( 'reading_lookup' )->( $self ); |
130 | |
131 | } |
132 | |
133 | # For graph serialization. Return a JSON representation of the associated |
134 | # reading lexemes. |
135 | sub _serialize_lexemes { |
136 | my $self = shift; |
137 | my $json = JSON->new->allow_blessed(1)->convert_blessed(1); |
138 | return $json->encode( [ $self->lexemes ] ); |
139 | } |
140 | |
141 | # Given a JSON representation of the lexemes, instantiate them and add |
142 | # them to the reading. |
143 | sub _deserialize_lexemes { |
144 | my( $self, $json ) = @_; |
145 | my $data = from_json( $json ); |
146 | return unless @$data; |
147 | |
148 | my @lexemes; |
149 | foreach my $lexhash ( @$data ) { |
150 | push( @lexemes, Text::Tradition::Collation::Reading::Lexeme->new( |
151 | 'JSON' => $lexhash ) ); |
152 | } |
153 | $self->clear_lexemes; |
154 | $self->add_lexeme( @lexemes ); |
155 | } |
156 | |
157 | sub disambiguated { |
158 | my $self = shift; |
159 | return 0 unless $self->has_lexemes; |
160 | return !grep { !$_->is_disambiguated } $self->lexemes; |
161 | } |
162 | |
163 | sub filter_lexemes { |
164 | my $self = shift; |
165 | # While we are here, get rid of any extra wordforms from a disambiguated |
166 | # reading. |
167 | if( $self->disambiguated ) { |
168 | foreach my $lex ( $self->lexemes ) { |
169 | $lex->clear_matching_forms(); |
170 | $lex->add_matching_form( $lex->form ); |
171 | } |
172 | } |
173 | } |
174 | |
58f9c2b9 |
175 | =head2 regularize |
176 | |
177 | Call the 'regularize' function of the appropriate language model on our |
178 | own reading text. This is a rules-based function distinct from 'normal_form', |
179 | which can be set to any arbitrary string. |
180 | |
181 | =cut |
182 | |
183 | # TODO Test this stuff |
184 | |
185 | sub regularize { |
186 | my $self = shift; |
187 | if( $self->has_language ) { |
188 | # If we do have a language, regularize the tokens in $answer. |
189 | my $mod = 'Text::Tradition::Language::' . $self->language; |
58f9c2b9 |
190 | eval { load( $mod ); }; |
191 | # If a module doesn't exist for our language, use the base routine |
192 | $mod = 'Text::Tradition::Language::Base' if $@; |
193 | return $mod->can( 'regularize' )->( $self->text ); |
194 | } else { |
195 | return $self->text; |
196 | } |
197 | } |
198 | |
a445ce40 |
199 | around 'is_identical' => sub { |
200 | my $orig = shift; |
201 | my $self = shift; |
202 | my $other = shift; |
203 | # If the base class returns true, do an extra check to make sure the |
204 | # lexemes also match. |
205 | my $answer = $self->$orig( $other ); |
206 | if( $answer ) { |
207 | if( $self->disambiguated && $other->disambiguated ) { |
208 | my $rform = join( '//', map { $_->form->to_string } $self->lexemes ); |
209 | my $uform = join( '//', map { $_->form->to_string } $other->lexemes ); |
210 | $answer = undef unless $rform eq $uform; |
211 | } elsif( $self->disambiguated xor $other->disambiguated ) { |
212 | $answer = undef; |
213 | } |
214 | } |
215 | return $answer; |
216 | }; |
217 | |
218 | around 'is_combinable' => sub { |
219 | my $orig = shift; |
220 | my $self = shift; |
221 | # If the reading is marked with invalid grammar or as a nonsense reading, |
222 | # it is no longer combinable. |
223 | return undef if $self->grammar_invalid || $self->is_nonsense; |
224 | return $self->$orig(); |
225 | }; |
226 | |
227 | after '_combine' => sub { |
228 | my $self = shift; |
229 | my $other = shift; |
230 | my $joinstr = shift; |
231 | $self->normal_form( |
232 | join( $joinstr, $self->normal_form, $other->normal_form ) ); |
233 | # Combine the lexemes present in the readings |
234 | if( $self->has_lexemes && $other->has_lexemes ) { |
235 | $self->add_lexeme( $other->lexemes ); |
236 | } |
237 | }; |
238 | |
e92d4229 |
239 | 1; |
240 | |
241 | =head1 LICENSE |
242 | |
243 | This package is free software and is provided "as is" without express |
244 | or implied warranty. You can redistribute it and/or modify it under |
245 | the same terms as Perl itself. |
246 | |
247 | =head1 AUTHOR |
248 | |
249 | Tara L Andrews E<lt>aurum@cpan.orgE<gt> |