Commit | Line | Data |
784877d9 |
1 | package Text::Tradition::Collation::Reading; |
2 | |
8e1394aa |
3 | use Moose; |
10e4b1ac |
4 | use Moose::Util::TypeConstraints; |
7604424b |
5 | use JSON qw/ from_json /; |
6ad2ce78 |
6 | use Module::Load; |
70745e70 |
7 | use Text::Tradition::Error; |
10e4b1ac |
8 | use XML::Easy::Syntax qw( $xml10_name_rx $xml10_namestartchar_rx ); |
7cd9f181 |
9 | use YAML::XS; |
e4b0f464 |
10 | use overload '""' => \&_stringify, 'fallback' => 1; |
784877d9 |
11 | |
10e4b1ac |
# Constrain reading IDs to strings that match the XML 1.0 name pattern
# in their entirety.
subtype 'ReadingID',
    as 'Str',
    where { /\A$xml10_name_rx\z/ },
    message { 'Reading ID must be a valid XML attribute string' };
16 | |
17 | no Moose::Util::TypeConstraints; |
18 | |
3a2ebbf4 |
19 | =head1 NAME |
784877d9 |
20 | |
4aea6e9b |
21 | Text::Tradition::Collation::Reading - represents a reading (usually a word) |
22 | in a collation. |
23 | |
3a2ebbf4 |
24 | =head1 DESCRIPTION |
784877d9 |
25 | |
3a2ebbf4 |
26 | Text::Tradition is a library for representation and analysis of collated |
27 | texts, particularly medieval ones. A 'reading' refers to a unit of text, |
28 | usually a word, that appears in one or more witnesses (manuscripts) of the |
29 | tradition; the text of a given witness is composed of a set of readings in |
30 | a particular sequence. |
784877d9 |
31 | |
3a2ebbf4 |
32 | =head1 METHODS |
1ca1163d |
33 | |
3a2ebbf4 |
34 | =head2 new |
8e1394aa |
35 | |
4aea6e9b |
36 | Creates a new reading in the given collation with the given attributes. |
3a2ebbf4 |
37 | Options include: |
94c00c71 |
38 | |
3a2ebbf4 |
39 | =over 4 |
784877d9 |
40 | |
4aea6e9b |
41 | =item collation - The Text::Tradition::Collation object to which this |
42 | reading belongs. Required. |
e2902068 |
43 | |
3a2ebbf4 |
44 | =item id - A unique identifier for this reading. Required. |
910a0a6d |
45 | |
3a2ebbf4 |
46 | =item text - The word or other text of the reading. |
784877d9 |
47 | |
3a2ebbf4 |
48 | =item is_start - The reading is the starting point for the collation. |
3265b0ce |
49 | |
3a2ebbf4 |
50 | =item is_end - The reading is the ending point for the collation. |
784877d9 |
51 | |
3a2ebbf4 |
52 | =item is_lacuna - The 'reading' represents a known gap in the text. |
de51424a |
53 | |
4aea6e9b |
54 | =item is_ph - A temporary placeholder for apparatus parsing purposes. Do |
55 | not use unless you know what you are doing. |
12720144 |
56 | |
4aea6e9b |
57 | =item rank - The sequence number of the reading. This should probably not |
58 | be set manually. |
d047cd52 |
59 | |
3a2ebbf4 |
60 | =back |
8e1394aa |
61 | |
3a2ebbf4 |
62 | One of 'text', 'is_start', 'is_end', 'is_lacuna', or 'is_ph' is required. |
8e1394aa |
63 | |
3a2ebbf4 |
64 | =head2 collation |
94c00c71 |
65 | |
3a2ebbf4 |
66 | =head2 id |
94c00c71 |
67 | |
3a2ebbf4 |
68 | =head2 text |
4cdd82f1 |
69 | |
3a2ebbf4 |
70 | =head2 is_start |
4cdd82f1 |
71 | |
3a2ebbf4 |
72 | =head2 is_end |
4a8828f0 |
73 | |
3a2ebbf4 |
74 | =head2 is_lacuna |
4a8828f0 |
75 | |
3a2ebbf4 |
76 | =head2 rank |
4a8828f0 |
77 | |
3a2ebbf4 |
78 | Accessor methods for the given attributes. |
d047cd52 |
79 | |
3a2ebbf4 |
80 | =cut |
d047cd52 |
81 | |
3a2ebbf4 |
# The collation that owns this reading. Stored as a weak reference.
has 'collation' => (
    is       => 'ro',
    isa      => 'Text::Tradition::Collation',
    # required => 1,
    weak_ref => 1,
);

# Identifier of the reading; must satisfy the ReadingID type.
has 'id' => (
    is       => 'ro',
    isa      => 'ReadingID',
    required => 1,
);

# The text of the reading. Read-only through 'text'; changeable only
# through the explicitly named writer 'alter_text'.
has 'text' => (
    is       => 'ro',
    isa      => 'Str',
    required => 1,
    writer   => 'alter_text',
);

# Optional language name; 'has_language' reports whether it was set.
has 'language' => (
    is        => 'ro',
    isa       => 'Str',
    predicate => 'has_language',
);

# Read-only boolean flags, all unset unless given at construction:
# the special-reading markers, plus the prefix/suffix join markers.
has [ qw( is_start is_end is_lacuna is_ph join_prior join_next ) ] => (
    is      => 'ro',
    isa     => 'Bool',
    default => undef,
);

# Read-write boolean flags, all unset by default. 'grammar_invalid' and
# 'is_nonsense' belong to the morphological analysis.
has [ qw( is_common grammar_invalid is_nonsense ) ] => (
    is      => 'rw',
    isa     => 'Bool',
    default => undef,
);

# Sequence number of the reading; can be tested for and cleared.
has 'rank' => (
    is        => 'rw',
    isa       => 'Int',
    predicate => 'has_rank',
    clearer   => 'clear_rank',
);

## For morphological analysis

# Normalized form of the text. The predicate and clearer are private;
# they are used by the 'around' modifier on normal_form.
has 'normal_form' => (
    is        => 'rw',
    isa       => 'Str',
    predicate => '_has_normal_form',
    clearer   => '_clear_normal_form',
);

# Holds the lexemes for the reading. There is no direct accessor; all
# access goes through the delegated array methods below.
has 'reading_lexemes' => (
    traits  => ['Array'],
    isa     => 'ArrayRef[Text::Tradition::Collation::Reading::Lexeme]',
    default => sub { [] },
    handles => {
        lexeme        => 'get',
        lexemes       => 'elements',
        has_lexemes   => 'count',
        clear_lexemes => 'clear',
        add_lexeme    => 'push',
    },
);
193 | |
3a2ebbf4 |
194 | |
# Normalize the constructor arguments before Moose processes them.
#
# Accepts either a single hashref or a flat list of key/value pairs.
# Fills in the text (and, for start/end readings, the ID and rank) implied
# by the special boolean flags, then rewrites the ID into a valid XML name
# for backwards compatibility with older, non-XML-name IDs.
around BUILDARGS => sub {
    my $orig  = shift;
    my $class = shift;

    # Work on a shallow copy, so that a hashref handed in by the caller
    # is never modified behind the caller's back.
    my $args = @_ == 1 ? { %{ $_[0] } } : { @_ };

    # If one of our special booleans is set, we change the text and the
    # ID to match.
    if( $args->{'is_lacuna'} && !exists $args->{'text'} ) {
        $args->{'text'} = '#LACUNA#';
    } elsif( $args->{'is_start'} ) {
        $args->{'id'} = '__START__';  # Change the ID to ensure we have only one
        $args->{'text'} = '#START#';
        $args->{'rank'} = 0;
    } elsif( $args->{'is_end'} ) {
        $args->{'id'} = '__END__';    # Change the ID to ensure we have only one
        $args->{'text'} = '#END#';
    } elsif( $args->{'is_ph'} ) {
        $args->{'text'} = $args->{'id'};
    }

    # Backwards compatibility for non-XMLname IDs. This is applied only
    # when an ID was actually supplied: munging an undefined ID would turn
    # it into the string 'r' and so slip past the 'required' check on the
    # id attribute.
    if( defined $args->{'id'} ) {
        my $rid = $args->{'id'};
        $rid =~ s/\#/__/g;
        $rid =~ s/[\/,]/./g;
        if( $rid !~ /^$xml10_namestartchar_rx/ ) {
            $rid = 'r'.$rid;
        }
        $args->{'id'} = $rid;
    }

    return $class->$orig( $args );
};
231 | |
70745e70 |
# Post-construction hook: when the constructor arguments carry a
# 'lexemes' entry, hand it to _deserialize_lexemes.
sub BUILD {
    my $self   = shift;
    my $params = shift;
    return unless exists $params->{'lexemes'};
    $self->_deserialize_lexemes( $params->{'lexemes'} );
}
239 | |
367e901b |
# Make normal_form default to text, transparently.
#
# Called without a (defined) argument, returns the stored normal form, or
# the reading text if no normal form has been stored. Called with a value
# equal to the text, clears any stored normal form and returns the value.
# Any other value is passed on to the underlying accessor.
around 'normal_form' => sub {
    my $orig = shift;
    my $self = shift;
    my( $arg ) = @_;

    # Decide on definedness rather than truth, so that defined-but-false
    # values such as '0' or '' are stored instead of being mistaken for a
    # read access.
    unless( defined $arg ) {
        return $self->text unless $self->_has_normal_form;
        return $self->$orig( @_ );
    }

    if( $arg eq $self->text ) {
        $self->_clear_normal_form;
        return $arg;
    }
    return $self->$orig( @_ );
};
254 | |
3a2ebbf4 |
255 | =head2 is_meta |
256 | |
257 | A meta attribute (ha ha), which should be true if any of our 'special' |
258 | booleans are true. Implies that the reading does not represent a bit |
259 | of text found in a witness. |
260 | |
261 | =cut |
262 | |
# Returns the first true value among is_start, is_end and is_lacuna;
# failing those, whatever is_ph holds.
sub is_meta {
    my $self = shift;
    foreach my $flag ( qw( is_start is_end is_lacuna ) ) {
        my $value = $self->$flag;
        return $value if $value;
    }
    return $self->is_ph;
}
267 | |
027d819c |
268 | =head1 Convenience methods |
269 | |
270 | =head2 related_readings |
271 | |
272 | Calls Collation's related_readings with $self as the first argument. |
273 | |
274 | =cut |
275 | |
3a2ebbf4 |
# Delegates to the collation's related_readings, putting this reading in
# front of whatever arguments were passed.
sub related_readings {
    my( $self, @opts ) = @_;
    my $collation = $self->collation;
    return $collation->related_readings( $self, @opts );
}
280 | |
7f52eac8 |
281 | =head2 witnesses |
282 | |
283 | Calls Collation's reading_witnesses with $self as the first argument. |
284 | |
285 | =cut |
286 | |
# Delegates to the collation's reading_witnesses, putting this reading
# in front of whatever arguments were passed.
sub witnesses {
    my( $self, @opts ) = @_;
    my $collation = $self->collation;
    return $collation->reading_witnesses( $self, @opts );
}
291 | |
027d819c |
292 | =head2 predecessors |
293 | |
294 | Returns a list of Reading objects that immediately precede $self in the collation. |
295 | |
296 | =cut |
297 | |
22222af9 |
# Asks the collation's sequence for the predecessors of this reading's
# ID, and maps each result back to a reading via the collation.
sub predecessors {
    my $self = shift;
    my $collation = $self->collation;
    return map { $collation->reading( $_ ) }
        $collation->sequence->predecessors( $self->id );
}
303 | |
027d819c |
304 | =head2 successors |
305 | |
306 | Returns a list of Reading objects that immediately follow $self in the collation. |
307 | |
308 | =cut |
309 | |
22222af9 |
# Asks the collation's sequence for the successors of this reading's ID,
# and maps each result back to a reading via the collation.
sub successors {
    my $self = shift;
    my $collation = $self->collation;
    return map { $collation->reading( $_ ) }
        $collation->sequence->successors( $self->id );
}
315 | |
027d819c |
316 | =head2 set_identical( $other_reading) |
317 | |
318 | Backwards compatibility method, to add a transposition relationship |
319 | between $self and $other_reading. Don't use this. |
320 | |
321 | =cut |
322 | |
1d310495 |
# Asks the collation to add a relationship of type 'transposition'
# between this reading and the one given.
sub set_identical {
    my $self  = shift;
    my $other = shift;
    my %relationship = ( 'type' => 'transposition' );
    return $self->collation->add_relationship( $self, $other, \%relationship );
}
328 | |
3a2ebbf4 |
# Handler for the '""' overload: a reading stringifies to its ID.
sub _stringify {
    my( $reading ) = @_;
    return $reading->id;
}
d047cd52 |
333 | |
4d9593df |
334 | =head1 MORPHOLOGY |
335 | |
7cd9f181 |
336 | Methods for the morphological information (if any) attached to readings. |
337 | A reading may be made up of multiple lexemes; the concatenated lexeme |
338 | strings ought to match the reading's normalized form. |
339 | |
340 | See L<Text::Tradition::Collation::Reading::Lexeme> for more information |
341 | on Lexeme objects and their attributes. |
342 | |
343 | =head2 has_lexemes |
344 | |
345 | Returns a true value if the reading has any attached lexemes. |
4d9593df |
346 | |
6ad2ce78 |
347 | =head2 lexemes |
06e7cbc7 |
348 | |
7cd9f181 |
349 | Returns the Lexeme objects (if any) attached to the reading. |
6ad2ce78 |
350 | |
351 | =head2 clear_lexemes |
352 | |
7cd9f181 |
353 | Wipes any associated Lexeme objects out of the reading. |
354 | |
355 | =head2 add_lexeme( $lexobj ) |
6ad2ce78 |
356 | |
7cd9f181 |
357 | Adds the Lexeme in $lexobj to the list of lexemes. |
358 | |
359 | =head2 lemmatize |
360 | |
361 | If the language of the reading is set, this method will use the appropriate |
362 | Language model to determine the lexemes that belong to this reading. See |
363 | L<Text::Tradition/lemmatize> if you wish to lemmatize an entire tradition. |
06e7cbc7 |
364 | |
4d9593df |
365 | =cut |
366 | |
6ad2ce78 |
# Looks up the lexemes for this reading using the module for its language.
#
# Warns and returns nothing if no language is set. Otherwise loads
# Text::Tradition::Language::<language> and calls that module's
# reading_lookup function with this reading, returning its result.
# Throws a Reading error if the module does not provide reading_lookup.
sub lemmatize {
    my $self = shift;
    unless( $self->has_language ) {
        warn "Please set a language to lemmatize a tradition";
        return;
    }
    my $mod = "Text::Tradition::Language::" . $self->language;
    load( $mod );

    # Check for the hook explicitly; calling through an undefined result
    # of can() would only give an unhelpful "undefined value" failure.
    my $lookup = $mod->can( 'reading_lookup' )
        or throw( "Language module $mod does not provide reading_lookup" );
    return $lookup->( $self );
}
4d9593df |
378 | |
7604424b |
# For graph serialization. Returns a JSON array encoding of the reading's
# lexemes, produced by an encoder with allow_blessed and convert_blessed
# both switched on.
sub _serialize_lexemes {
    my $self = shift;
    my @lexlist = $self->lexemes;
    my $encoder = JSON->new;
    $encoder->allow_blessed(1);
    $encoder->convert_blessed(1);
    return $encoder->encode( \@lexlist );
}
70745e70 |
386 | |
7604424b |
# Given a JSON representation of the lexemes, instantiate them and add
# them to the reading, replacing any lexemes already present. Does
# nothing if the decoded list is empty. Throws a Reading error if the
# Lexeme class cannot be loaded.
sub _deserialize_lexemes {
    my( $self, $json ) = @_;
    my $data = from_json( $json );
    return unless @$data;

    # Need to have the lexeme module in order to have lexemes. It is
    # loaded at runtime: a 'use' statement is executed at compile time,
    # so wrapping it in an eval block cannot trap a loading failure.
    my $lexclass = 'Text::Tradition::Collation::Reading::Lexeme';
    my $loaded = eval { load( $lexclass ); 1 };
    throw( $@ ) unless $loaded;

    # Good to go - add the lexemes.
    my @lexemes = map { $lexclass->new( 'JSON' => $_ ) } @$data;
    $self->clear_lexemes;
    $self->add_lexeme( @lexemes );
}
7cd9f181 |
407 | |
44924224 |
# Returns 0 if the reading has no lexemes; otherwise true only when none
# of its lexemes reports itself as not disambiguated.
sub disambiguated {
    my $self = shift;
    if( !$self->has_lexemes ) {
        return 0;
    }
    my @undecided = grep { !$_->is_disambiguated } $self->lexemes;
    return !@undecided;
}
413 | |
4d9593df |
414 | ## Utility methods |
415 | |
2acf0892 |
# Returns the reading's text as the value to use for this object in
# JSON output.
sub TO_JSON {
    my( $reading ) = @_;
    return $reading->text;
}
420 | |
70745e70 |
# Raises a Text::Tradition::Error with ident 'Reading error' and the
# first argument as its message.
sub throw {
    my $message = $_[0];
    Text::Tradition::Error->throw(
        'ident'   => 'Reading error',
        'message' => $message,
    );
}
4d9593df |
427 | |
428 | no Moose; |
429 | __PACKAGE__->meta->make_immutable; |
430 | |
021bdbac |
431 | 1; |