make witness plaintext parsing work
[scpubgit/stemmatology.git] / lib / Text / Tradition / Collation / Reading.pm
CommitLineData
784877d9 1package Text::Tradition::Collation::Reading;
2
8e1394aa 3use Moose;
e4b0f464 4use overload '""' => \&_stringify, 'fallback' => 1;
784877d9 5
3a2ebbf4 6=head1 NAME
784877d9 7
3a2ebbf4 8Text::Tradition::Collation::Reading - represents a reading (usually a word) in a collation.
eca16057 9
3a2ebbf4 10=head1 DESCRIPTION
784877d9 11
Text::Tradition is a library for representation and analysis of collated
texts, particularly medieval ones. A 'reading' refers to a unit of text,
usually a word, that appears in one or more witnesses (manuscripts) of the
tradition; the text of a given witness is composed of a set of readings in
a particular sequence.
784877d9 17
3a2ebbf4 18=head1 METHODS
1ca1163d 19
3a2ebbf4 20=head2 new
8e1394aa 21
3a2ebbf4 22Creates a new reading in the given collation with the given attributes.
23Options include:
94c00c71 24
3a2ebbf4 25=over 4
784877d9 26
=item collation - The Text::Tradition::Collation object to which this reading belongs. Not enforced by the constructor, but needed by every method that consults the collation.
e2902068 28
3a2ebbf4 29=item id - A unique identifier for this reading. Required.
910a0a6d 30
3a2ebbf4 31=item text - The word or other text of the reading.
784877d9 32
3a2ebbf4 33=item is_start - The reading is the starting point for the collation.
3265b0ce 34
3a2ebbf4 35=item is_end - The reading is the ending point for the collation.
784877d9 36
3a2ebbf4 37=item is_lacuna - The 'reading' represents a known gap in the text.
de51424a 38
12720144 39=item is_ph - A temporary placeholder for apparatus parsing purposes. Do not use unless you know what you are doing.
40
3a2ebbf4 41=item rank - The sequence number of the reading. This should probably not be set manually.
d047cd52 42
3a2ebbf4 43=back
8e1394aa 44
One of 'text', 'is_start', 'is_end', 'is_lacuna', or 'is_ph' is required.
8e1394aa 46
3a2ebbf4 47=head2 collation
94c00c71 48
3a2ebbf4 49=head2 id
94c00c71 50
3a2ebbf4 51=head2 text
4cdd82f1 52
3a2ebbf4 53=head2 is_start
4cdd82f1 54
3a2ebbf4 55=head2 is_end
4a8828f0 56
3a2ebbf4 57=head2 is_lacuna
4a8828f0 58
3a2ebbf4 59=head2 rank
4a8828f0 60
3a2ebbf4 61Accessor methods for the given attributes.
d047cd52 62
3a2ebbf4 63=cut
d047cd52 64
# The collation that owns this reading, held as a weak reference.
has collation => (
    is       => 'ro',
    isa      => 'Text::Tradition::Collation',
    # required => 1,
    weak_ref => 1,
);

# Identifier of the reading; must always be supplied.
has id => (
    is       => 'ro',
    isa      => 'Str',
    required => 1,
);

# The reading text: read through 'text', replaced through 'alter_text'.
has text => (
    is       => 'ro',
    isa      => 'Str',
    required => 1,
    writer   => 'alter_text',
);

has language => (
    is      => 'ro',
    isa     => 'Str',
    default => 'Default',
);

# Marker flags for the special (non-textual) kinds of reading.  They share
# one definition: read-only booleans that are unset unless given.
has [ qw/ is_start is_end is_lacuna is_ph / ] => (
    is      => 'ro',
    isa     => 'Bool',
    default => undef,
);

# Unlike the marker flags, this one can be changed after construction.
has is_common => (
    is      => 'rw',
    isa     => 'Bool',
    default => undef,
);

# No default: use has_rank to test whether a rank is set, and clear_rank
# to remove it.
has rank => (
    is        => 'rw',
    isa       => 'Int',
    predicate => 'has_rank',
    clearer   => 'clear_rank',
);
fd602649 127
## For morphological analysis

# Normalised spelling of the reading; has_normal_form tells whether one
# has been set.
has 'normal_form' => (
    is        => 'rw',
    isa       => 'Str',
    predicate => 'has_normal_form',
);

# Lemma of the reading; has_lemma tells whether one has been set.
has 'lemma' => (
    is        => 'rw',
    isa       => 'Str',
    predicate => 'has_lemma',
);

# The lexemes of the reading: a list of single-key hashes, each mapping a
# lexeme string to the list of Morphology objects proposed for it.  The
# list is only reachable through the delegated methods below.
has 'morphology' => (
    traits  => ['Array'],
    isa     => 'ArrayRef[HashRef[ArrayRef[Text::Tradition::Collation::Reading::Morphology]]]',
    # Start out with an empty list, so that lexemes, has_morphology and
    # _add_morph are usable on a reading for which use_lexemes has never
    # been called.
    default => sub { [] },
    handles => {
        lexemes        => 'elements',
        has_morphology => 'count',
        _clear_morph   => 'clear',
        _add_morph     => 'push',
    },
);
152
## For prefix/suffix readings

# Both flags share one definition: read-only booleans, unset unless given.
has [ qw/ join_prior join_next / ] => (
    is      => 'ro',
    isa     => 'Bool',
    default => undef,
);
166
3a2ebbf4 167
# Normalise the constructor arguments.  Accepts either a single hashref or
# a flat key/value list.  When one of the special boolean flags is set to
# a true value, the text (and for start/end the ID) is filled in to match:
#   is_lacuna - text becomes '#LACUNA#' unless a text was given
#   is_start  - id and text become '#START#', rank becomes 0
#   is_end    - id and text become '#END#'
#   is_ph     - text becomes a copy of the id
around BUILDARGS => sub {
    my $orig  = shift;
    my $class = shift;

    my $args;
    if( @_ == 1 && ref( $_[0] ) eq 'HASH' ) {
        # Work on a shallow copy, so that the defaults filled in below do
        # not leak into the hash that belongs to the caller.
        $args = { %{ $_[0] } };
    } elsif( @_ == 1 ) {
        # A lone argument that is not a hashref cannot be inspected here;
        # hand it on unchanged and let the wrapped BUILDARGS deal with it.
        return $class->$orig( @_ );
    } else {
        $args = { @_ };
    }

    # If one of our special booleans is set, we change the text and the
    # ID to match.  The flags are tested for truth, not mere presence, so
    # that e.g. 'is_start' => 0 leaves an ordinary reading alone.
    if( $args->{'is_lacuna'} && !exists $args->{'text'} ) {
        $args->{'text'} = '#LACUNA#';
    } elsif( $args->{'is_start'} ) {
        $args->{'id'} = '#START#';  # Change the ID to ensure we have only one
        $args->{'text'} = '#START#';
        $args->{'rank'} = 0;
    } elsif( $args->{'is_end'} ) {
        $args->{'id'} = '#END#';    # Change the ID to ensure we have only one
        $args->{'text'} = '#END#';
    } elsif( $args->{'is_ph'} ) {
        $args->{'text'} = $args->{'id'};
    }

    $class->$orig( $args );
};
195
196=head2 is_meta
197
198A meta attribute (ha ha), which should be true if any of our 'special'
199booleans are true. Implies that the reading does not represent a bit
200of text found in a witness.
201
202=cut
203
# Return the first true value among the start, end and lacuna flags;
# if none of them is true, return the placeholder flag, whatever it is.
sub is_meta {
    my( $reading ) = @_;
    foreach my $flag ( qw/ is_start is_end is_lacuna / ) {
        my $value = $reading->$flag;
        return $value if $value;
    }
    return $reading->is_ph;
}
208
027d819c 209=head1 Convenience methods
210
211=head2 related_readings
212
213Calls Collation's related_readings with $self as the first argument.
214
215=cut
216
# Delegate to the collation's related_readings, passing this reading
# first and any further arguments after it.
sub related_readings {
    my( $self, @options ) = @_;
    my $collation = $self->collation;
    return $collation->related_readings( $self, @options );
}
221
7f52eac8 222=head2 witnesses
223
224Calls Collation's reading_witnesses with $self as the first argument.
225
226=cut
227
# Delegate to the collation's reading_witnesses, passing this reading
# first and any further arguments after it.
sub witnesses {
    my( $self, @options ) = @_;
    my $collation = $self->collation;
    return $collation->reading_witnesses( $self, @options );
}
232
027d819c 233=head2 predecessors
234
235Returns a list of Reading objects that immediately precede $self in the collation.
236
237=cut
238
# Ask the collation's sequence for the IDs that precede this reading's ID,
# then have the collation resolve each ID to its reading.
sub predecessors {
    my( $self ) = @_;
    my $sequence = $self->collation->sequence;
    my @prior_ids = $sequence->predecessors( $self->id );
    return map { $self->collation->reading( $_ ) } @prior_ids;
}
244
027d819c 245=head2 successors
246
247Returns a list of Reading objects that immediately follow $self in the collation.
248
249=cut
250
# Ask the collation's sequence for the IDs that follow this reading's ID,
# then have the collation resolve each ID to its reading.
sub successors {
    my( $self ) = @_;
    my $sequence = $self->collation->sequence;
    my @next_ids = $sequence->successors( $self->id );
    return map { $self->collation->reading( $_ ) } @next_ids;
}
256
027d819c 257=head2 set_identical( $other_reading)
258
259Backwards compatibility method, to add a transposition relationship
260between $self and $other_reading. Don't use this.
261
262=cut
263
# Record a relationship of type 'transposition' between this reading and
# $other by way of the collation's add_relationship.
sub set_identical {
    my $self = shift;
    my( $other ) = @_;
    my %options = ( 'type' => 'transposition' );
    return $self->collation->add_relationship( $self, $other, \%options );
}
269
# Handler for the '""' overload: a reading stringifies to its ID.
sub _stringify {
    my( $reading ) = @_;
    return $reading->id;
}
d047cd52 274
4d9593df 275=head1 MORPHOLOGY
276
277A few methods to try to tack on morphological information.
278
279=head2 is_disambiguated
280
281Returns true if there is only one tag per lexeme in this reading.
282
283=cut
284
# use_lexemes( @lexemes )
#
# Declare the lexemes that this reading consists of.  Run together, the
# lexemes must match the reading's normal form if it has one, or else its
# text; case, whitespace and hyphens are ignored in that comparison.
#
# On a match, any existing morphological information is discarded and an
# empty tag list is set up for each lexeme; the number of lexemes is
# returned.  On a mismatch, a warning is issued, nothing is changed, and
# the return value is false.
sub use_lexemes {
    my( $self, @lexemes ) = @_;
    # The lexemes need to be the same as $self->text.
    my $cmpstr = lc( $self->has_normal_form ? $self->normal_form : $self->text );
    $cmpstr =~ s/[\s-]+//g;
    my $lexstr = lc( join( '', @lexemes ) );
    $lexstr =~ s/[\s-]+//g;
    unless( $lexstr eq $cmpstr ) {
        warn "Cannot split " . $self->text . " into " . join( '.', @lexemes );
        return;
    }
    # Start afresh, with one single-key container per lexeme.
    $self->_clear_morph;
    foreach my $lexeme ( @lexemes ) {
        $self->_add_morph( { $lexeme => [] } );
    }
    return scalar @lexemes;
}
299
# add_morphological_tag( $lexeme, $opts )
# add_morphological_tag( $opts )
#
# Create a Morphology object from $opts and append it to the tag list of
# the given lexeme.  In the one-argument form the lexeme is the reading
# text as a whole; the reading is set up as a single lexeme only if it has
# no tag list for its whole text yet, so that repeated calls accumulate
# alternative tags instead of each one wiping out the last.
#
# Warns and returns nothing if the reading has no such lexeme; otherwise
# returns the number of tags now attached to the lexeme.
sub add_morphological_tag {
    my( $self, $lexeme, $opts ) = @_;
    my $whole_reading = !$opts;
    if( $whole_reading ) {
        # No lexeme was passed; use reading text.
        $opts = $lexeme;
        $lexeme = $self->text;
    }
    # Get the correct container.  The lookup is guarded because the lexeme
    # list may not be usable before use_lexemes has set it up (the
    # attribute is presumably unset until then); that case is treated the
    # same as "no such lexeme".
    my $find = sub {
        my @containers = do { local $@; eval { $self->lexemes } };
        my( $found ) = grep { exists $_->{$lexeme} } @containers;
        return $found;
    };
    my $struct = $find->();
    if( !$struct && $whole_reading ) {
        $self->use_lexemes( $lexeme );
        $struct = $find->();
    }
    unless( $struct ) {
        warn "No lexeme $lexeme exists in this reading";
        return;
    }
    # Now make the morph object and add it to this lexeme.
    my $morph_obj = Text::Tradition::Collation::Reading::Morphology->new( $opts );
    # TODO Check for existence
    return push( @{$struct->{$lexeme}}, $morph_obj );
}
320
# disambiguate( $lexeme, $index )
# disambiguate( $index )
#
# Reduce the tag list of the given lexeme to the single tag found at
# $index.  In the one-argument form the lexeme is the reading text as a
# whole.  Warns and returns nothing, leaving the tag list untouched, if
# the reading has no such lexeme or the lexeme has no tag at that index;
# otherwise returns the new one-element tag list.
sub disambiguate {
    my( $self, $lexeme, $index ) = @_;
    # Test for definedness rather than truth: 0 is a perfectly good index
    # and must not be mistaken for a missing argument.
    unless( defined $index ) {
        # No lexeme was passed; use reading text.
        $index = $lexeme;
        $lexeme = $self->text;
    }
    # Get the correct container
    my( $struct ) = grep { exists $_->{$lexeme} } $self->lexemes;
    unless( $struct ) {
        warn "No lexeme $lexeme exists in this reading";
        return;
    }
    # Keep the object at the selected index, provided there is one;
    # otherwise the tag list would be replaced by a list holding undef.
    my $selected = $struct->{$lexeme}->[$index];
    unless( defined $selected ) {
        warn "No morphological tag at index $index for lexeme $lexeme";
        return;
    }
    $struct->{$lexeme} = [ $selected ];
    return $struct->{$lexeme};
}
339
# True (1) when the reading has morphological information and every one
# of its lexemes carries exactly one tag; undef otherwise.
sub is_disambiguated {
    my( $self ) = @_;
    if( !$self->has_morphology ) {
        return undef;
    }
    for my $container ( $self->lexemes ) {
        my( $form ) = keys %$container; # each container has a single key
        my $tag_count = @{$container->{$form}};
        return undef if $tag_count != 1;
    }
    return 1;
}
349
350## Utility methods
351
# JSON serialisation hook: a reading is represented by its text.
sub TO_JSON {
    my( $reading ) = @_;
    return $reading->text;
}
356
4d9593df 357## TODO will need a throw() here
358
359no Moose;
360__PACKAGE__->meta->make_immutable;
361
362###################################################
363### Morphology objects, to be attached to readings
364###################################################
365
366package Text::Tradition::Collation::Reading::Morphology;
367
368use Moose;
369
# A morphological tag is made up of three mandatory, read-only strings:
# the lemma, the tag code, and the language of the code.
has [ qw/ lemma code language / ] => (
    is       => 'ro',
    isa      => 'Str',
    required => 1,
);
387
388## Transmute codes into comparison arrays for our various languages.
389
# Normalise the constructor arguments.  Accepts either a single hashref or
# a flat key/value list.  A 'serial' argument, in the "lemma!!code" form
# produced by the serialization method, is unpacked into separate 'lemma'
# and 'code' arguments.
around BUILDARGS => sub {
    my $orig  = shift;
    my $class = shift;

    my $args;
    if( @_ == 1 && ref( $_[0] ) eq 'HASH' ) {
        # Work on a shallow copy: 'serial' is deleted and other keys are
        # added below, and the caller's own hash must not be affected.
        $args = { %{ $_[0] } };
    } elsif( @_ == 1 && ref( $_[0] ) ) {
        # Some other kind of reference; hand it on unchanged and let the
        # wrapped BUILDARGS deal with it.
        return $class->$orig( @_ );
    } else {
        $args = { @_ };
    }

    if( exists( $args->{'serial'} ) ) {
        # Split at the first separator only, so that a code which itself
        # contains '!!' is not cut short.
        my( $lemma, $code ) = split( /!!/, delete $args->{'serial'}, 2 );
        $args->{'lemma'} = $lemma;
        $args->{'code'} = $code;
    }
    $class->$orig( $args );
};
406
# The tag as a single string: lemma and code joined by '!!'.
sub serialization {
    my( $morph ) = @_;
    my @parts = ( $morph->lemma, $morph->code );
    return join( '!!', @parts );
}
411
# Break the tag code up into a list of values that can be compared
# position by position with the list from another tag.  The form of the
# list depends on the language of the tag:
#
#   French  - the code is split on '+'.  The leading parts without '='
#             come first, padded with undef to at least two entries; then
#             come the values of the key=value parts in the fixed order
#             Pers, Nb, Temps, Genre, Spec, Fonc, with undef for any key
#             that is absent.
#   English - not implemented yet; an empty list is returned.
#   others  - the code is split into its individual characters.
sub comparison_array {
    my $self = shift;
    my $language = $self->language;

    if( $language eq 'French' ) {
        my @array;
        my @bits = split( /\+/, $self->code );
        # First push the non k/v parts.
        while( @bits && $bits[0] !~ /=/ ) {
            push( @array, shift @bits );
        }
        while( @array < 2 ) {
            push( @array, undef );
        }
        # Now push the k/v parts in a known order.  Only the first '=' of
        # a part separates key from value.
        my %props;
        foreach my $bit ( @bits ) {
            my( $k, $v ) = split( /=/, $bit, 2 );
            $props{$k} = $v;
        }
        foreach my $k ( qw/ Pers Nb Temps Genre Spec Fonc / ) {
            push( @array, $props{$k} );
        }
        # Give the answer.
        return @array;
    }

    if( $language eq 'English' ) {
        # Do something as yet undetermined.  Until then, say explicitly
        # that there is nothing to compare, rather than leaking whatever
        # value happened to be evaluated last.
        return;
    }

    # Latin or Greek or Armenian, just split the chars
    return split( '', $self->code );
}
440
021bdbac 441no Moose;
442__PACKAGE__->meta->make_immutable;
d047cd52 443
021bdbac 4441;
d047cd52 445