read lexeme info in GraphML parsing
[scpubgit/stemmatology.git] / lib / Text / Tradition / Collation / Reading.pm
CommitLineData
784877d9 1package Text::Tradition::Collation::Reading;
2
8e1394aa 3use Moose;
6ad2ce78 4use Module::Load;
70745e70 5use Text::Tradition::Error;
7cd9f181 6use YAML::XS;
e4b0f464 7use overload '""' => \&_stringify, 'fallback' => 1;
784877d9 8
3a2ebbf4 9=head1 NAME
784877d9 10
4aea6e9b 11Text::Tradition::Collation::Reading - represents a reading (usually a word)
12in a collation.
13
3a2ebbf4 14=head1 DESCRIPTION
784877d9 15
Text::Tradition is a library for representation and analysis of collated
texts, particularly medieval ones. A 'reading' refers to a unit of text,
usually a word, that appears in one or more witnesses (manuscripts) of the
tradition; the text of a given witness is composed of a set of readings in
a particular sequence.
784877d9 21
3a2ebbf4 22=head1 METHODS
1ca1163d 23
3a2ebbf4 24=head2 new
8e1394aa 25
4aea6e9b 26Creates a new reading in the given collation with the given attributes.
3a2ebbf4 27Options include:
94c00c71 28
3a2ebbf4 29=over 4
784877d9 30
4aea6e9b 31=item collation - The Text::Tradition::Collation object to which this
32reading belongs. Required.
e2902068 33
3a2ebbf4 34=item id - A unique identifier for this reading. Required.
910a0a6d 35
3a2ebbf4 36=item text - The word or other text of the reading.
784877d9 37
3a2ebbf4 38=item is_start - The reading is the starting point for the collation.
3265b0ce 39
3a2ebbf4 40=item is_end - The reading is the ending point for the collation.
784877d9 41
3a2ebbf4 42=item is_lacuna - The 'reading' represents a known gap in the text.
de51424a 43
4aea6e9b 44=item is_ph - A temporary placeholder for apparatus parsing purposes. Do
45not use unless you know what you are doing.
12720144 46
4aea6e9b 47=item rank - The sequence number of the reading. This should probably not
48be set manually.
d047cd52 49
3a2ebbf4 50=back
8e1394aa 51
One of 'text', 'is_start', 'is_end', 'is_lacuna', or 'is_ph' is required.
8e1394aa 53
3a2ebbf4 54=head2 collation
94c00c71 55
3a2ebbf4 56=head2 id
94c00c71 57
3a2ebbf4 58=head2 text
4cdd82f1 59
3a2ebbf4 60=head2 is_start
4cdd82f1 61
3a2ebbf4 62=head2 is_end
4a8828f0 63
3a2ebbf4 64=head2 is_lacuna
4a8828f0 65
3a2ebbf4 66=head2 rank
4a8828f0 67
3a2ebbf4 68Accessor methods for the given attributes.
d047cd52 69
3a2ebbf4 70=cut
d047cd52 71
# The collation this reading belongs to. Stored as a weak reference, and
# not currently enforced as required.
has collation => (
    isa      => 'Text::Tradition::Collation',
    is       => 'ro',
    weak_ref => 1,
    # required => 1,
);

# Identifier of the reading; must be supplied at construction.
has id => (
    isa      => 'Str',
    is       => 'ro',
    required => 1,
);

# The reading's text. Read-only through 'text'; 'alter_text' is the writer.
has text => (
    isa      => 'Str',
    is       => 'ro',
    writer   => 'alter_text',
    required => 1,
);

# Optional language name; 'has_language' reports whether one was given.
has language => (
    isa       => 'Str',
    is        => 'ro',
    predicate => 'has_language',
);
97
# Read-only marker flags, all undefined unless set at construction. The
# four share one declaration because their options are identical.
has [ qw( is_start is_end is_lacuna is_ph ) ] => (
    isa     => 'Bool',
    is      => 'ro',
    default => undef,
);

# Unlike the marker flags above, is_common can be changed after construction.
has is_common => (
    isa     => 'Bool',
    is      => 'rw',
    default => undef,
);
3a2ebbf4 127
# Integer rank of the reading. May be unset: 'has_rank' tests for a value
# and 'clear_rank' removes it.
has rank => (
    isa       => 'Int',
    is        => 'rw',
    clearer   => 'clear_rank',
    predicate => 'has_rank',
);
fd602649 134
## Attributes used for morphological analysis

# Read-write string; 'has_normal_form' reports whether it has been set.
has normal_form => (
    isa       => 'Str',
    is        => 'rw',
    predicate => 'has_normal_form',
);

# The Lexeme objects attached to the reading. No accessor is declared for
# the array itself; it is reached through the delegated methods below.
has reading_lexemes => (
    isa     => 'ArrayRef[Text::Tradition::Collation::Reading::Lexeme]',
    traits  => ['Array'],
    default => sub { [] },
    handles => {
        add_lexeme    => 'push',
        clear_lexemes => 'clear',
        has_lexemes   => 'count',
        lexemes       => 'elements',
    },
);
155
## Flags for prefix/suffix readings

# Both flags are read-only booleans that stay undefined unless supplied at
# construction, so they are declared together.
has [ qw( join_prior join_next ) ] => (
    isa     => 'Bool',
    is      => 'ro',
    default => undef,
);

169
3a2ebbf4 170
# Constructor-argument hook. Accepts either a single hashref or a flat
# key/value list, fills in the values implied by whichever special flag is
# set, and passes the resulting hashref on to the default BUILDARGS.
#
#   is_lacuna - text defaults to '#LACUNA#' when no text was given
#   is_start  - id and text become '#START#', rank becomes 0
#   is_end    - id and text become '#END#'
#   is_ph     - text is copied from the id
#
# Only the first matching flag, in the order above, is applied. When a
# hashref is passed in, it is modified in place.
around BUILDARGS => sub {
    my $orig  = shift;
    my $class = shift;
    my $args  = @_ == 1 ? shift : { @_ };

    # A flag counts only when its value is true. Testing mere existence
    # would turn a reading built with an explicit false value, such as
    # is_start => 0, into the start marker and overwrite its id and text.
    if ( $args->{'is_lacuna'} && !exists $args->{'text'} ) {
        $args->{'text'} = '#LACUNA#';
    }
    elsif ( $args->{'is_start'} ) {
        $args->{'id'}   = '#START#';    # Change the ID to ensure we have only one
        $args->{'text'} = '#START#';
        $args->{'rank'} = 0;
    }
    elsif ( $args->{'is_end'} ) {
        $args->{'id'}   = '#END#';      # Change the ID to ensure we have only one
        $args->{'text'} = '#END#';
    }
    elsif ( $args->{'is_ph'} ) {
        $args->{'text'} = $args->{'id'};
    }

    $class->$orig( $args );
};
198
# Post-construction hook: when the build arguments contain a 'lexemes' key,
# hand its value to _deserialize_lexemes.
sub BUILD {
    my $self = shift;
    my $args = shift;
    $self->_deserialize_lexemes( $args->{'lexemes'} )
        if exists $args->{'lexemes'};
}
206
3a2ebbf4 207=head2 is_meta
208
209A meta attribute (ha ha), which should be true if any of our 'special'
210booleans are true. Implies that the reading does not represent a bit
211of text found in a witness.
212
213=cut
214
sub is_meta {
    my ($self) = @_;

    # Return the first true flag, checked in the order start, end, lacuna;
    # if none of those is set, the answer is whatever is_ph holds.
    for my $flag ( qw( is_start is_end is_lacuna ) ) {
        my $value = $self->$flag;
        return $value if $value;
    }
    return $self->is_ph;
}
219
027d819c 220=head1 Convenience methods
221
222=head2 related_readings
223
224Calls Collation's related_readings with $self as the first argument.
225
226=cut
227
sub related_readings {
    my ( $self, @options ) = @_;

    # Delegate to the collation, putting this reading in front of whatever
    # arguments the caller supplied.
    my $collation = $self->collation;
    return $collation->related_readings( $self, @options );
}
232
7f52eac8 233=head2 witnesses
234
235Calls Collation's reading_witnesses with $self as the first argument.
236
237=cut
238
sub witnesses {
    my ( $self, @options ) = @_;

    # Delegate to the collation's reading_witnesses, passing this reading
    # first and any caller arguments after it.
    my $collation = $self->collation;
    return $collation->reading_witnesses( $self, @options );
}
243
027d819c 244=head2 predecessors
245
246Returns a list of Reading objects that immediately precede $self in the collation.
247
248=cut
249
sub predecessors {
    my ($self) = @_;
    my $collation = $self->collation;

    # The sequence object is queried with reading IDs; each ID it returns
    # is then looked up through the collation.
    my @prior_ids = $collation->sequence->predecessors( $self->id );
    return map { $collation->reading( $_ ) } @prior_ids;
}
255
027d819c 256=head2 successors
257
258Returns a list of Reading objects that immediately follow $self in the collation.
259
260=cut
261
sub successors {
    my ($self) = @_;
    my $collation = $self->collation;

    # The sequence object is queried with reading IDs; each ID it returns
    # is then looked up through the collation.
    my @next_ids = $collation->sequence->successors( $self->id );
    return map { $collation->reading( $_ ) } @next_ids;
}
267
027d819c 268=head2 set_identical( $other_reading)
269
270Backwards compatibility method, to add a transposition relationship
271between $self and $other_reading. Don't use this.
272
273=cut
274
sub set_identical {
    my $self  = shift;
    my $other = shift;

    # Recorded as a relationship of type 'transposition' between the two
    # readings.
    my %options = ( 'type' => 'transposition' );
    return $self->collation->add_relationship( $self, $other, \%options );
}
280
# Handler for the overloaded '""' operator: a reading stringifies to its ID.
sub _stringify {
    my ($self) = @_;
    return $self->id;
}
d047cd52 285
4d9593df 286=head1 MORPHOLOGY
287
7cd9f181 288Methods for the morphological information (if any) attached to readings.
289A reading may be made up of multiple lexemes; the concatenated lexeme
290strings ought to match the reading's normalized form.
291
292See L<Text::Tradition::Collation::Reading::Lexeme> for more information
293on Lexeme objects and their attributes.
294
295=head2 has_lexemes
296
297Returns a true value if the reading has any attached lexemes.
4d9593df 298
6ad2ce78 299=head2 lexemes
06e7cbc7 300
7cd9f181 301Returns the Lexeme objects (if any) attached to the reading.
6ad2ce78 302
303=head2 clear_lexemes
304
7cd9f181 305Wipes any associated Lexeme objects out of the reading.
306
307=head2 add_lexeme( $lexobj )
6ad2ce78 308
7cd9f181 309Adds the Lexeme in $lexobj to the list of lexemes.
310
311=head2 lemmatize
312
If the language of the reading is set, this method will use the appropriate
Language module to determine the lexemes that belong to this reading. See
L<Text::Tradition/lemmatize> if you wish to lemmatize an entire tradition.
06e7cbc7 316
4d9593df 317=cut
318
# Look up the lexemes for this reading using the module
# Text::Tradition::Language::<language>. The module is loaded on demand and
# its reading_lookup function is called with the reading as sole argument.
#
# Warns and returns nothing when the reading has no language set. Throws a
# 'Reading error' when the language module has no reading_lookup function.
# A failure to load the module propagates from load().
sub lemmatize {
    my $self = shift;
    unless ( $self->has_language ) {
        warn "Please set a language to lemmatize a tradition";
        return;
    }
    my $mod = "Text::Tradition::Language::" . $self->language;
    load( $mod );

    # can() returns undef when the function is missing; calling that
    # directly would die with an unhelpful "undefined value as a subroutine
    # reference" message, so fail with an explanation instead.
    my $lookup = $mod->can( 'reading_lookup' )
        or throw( "Language module $mod does not provide reading_lookup" );
    return $lookup->( $self );
}
4d9593df 330
# For graph serialization. Builds one string for all lexemes of the reading:
# each lexeme becomes "language|L|string|L|form|L|matching forms", with the
# matching forms joined by '|M|' and the lexemes joined by '|R|'. A lexeme
# without a chosen form contributes an empty form field.
# TODO Push this in to the Lexeme package.
sub _serialize_lexemes {
    my ($self) = @_;
    my @records = map {
        my $lexeme  = $_;
        my $matches = join( '|M|',
            map { $_->to_string } $lexeme->matching_forms );
        my $chosen = $lexeme->form;
        join( '|L|',
            $lexeme->language,
            $lexeme->string,
            ( $chosen ? $chosen->to_string : '' ),
            $matches );
    } $self->lexemes;
    return join( '|R|', @records );
}
70745e70 348
# Rebuild the reading's lexemes from a serialized string: lexemes are
# separated by '|R|', the four fields of a lexeme (language, string, chosen
# form, matching forms) by '|L|', and the individual matching forms by
# '|M|'. Each form is turned into a WordForm via its 'serial' argument.
# Lexemes already attached to the reading are replaced. Does nothing when
# $data is false.
sub _deserialize_lexemes {
    my ( $self, $data ) = @_;
    return unless $data;

    # Need to have the lexeme modules in order to have lexemes. 'use' is
    # carried out while this file is being compiled, so a missing module
    # already prevents the file from loading and there is nothing for a
    # run-time eval to trap here.
    use Text::Tradition::Collation::Reading::Lexeme;
    use Text::Tradition::Collation::Reading::WordForm;

    # Good to go - add the lexemes.
    my @lexemes;
    foreach my $lexdata ( split( /\|R\|/, $data ) ) {
        # A LIMIT of -1 keeps trailing empty fields. Without it a lexeme
        # with no matching forms (serialized with an empty last field)
        # would come back with fewer than four fields.
        my ( $lang, $lstring, $form, $allforms )
            = split( /\|L\|/, $lexdata, -1 );

        # The chosen form, if any, goes first so that it can be taken off
        # the front of the WordForm list again below.
        my @wfdata;
        push( @wfdata, $form ) if $form;
        push( @wfdata, split( /\|M\|/, $allforms ) )
            if defined $allforms && length $allforms;

        my @wforms = map {
            Text::Tradition::Collation::Reading::WordForm->new( 'serial' => $_ )
        } @wfdata;

        my %largs = ( 'language' => $lang, 'string' => $lstring );
        if ( $form ) {
            $largs{'form'}             = shift @wforms;
            $largs{'is_disambiguated'} = 1;
        }
        $largs{'wordform_matchlist'} = \@wforms;
        push( @lexemes,
            Text::Tradition::Collation::Reading::Lexeme->new( %largs ) );
    }
    $self->clear_lexemes;
    $self->add_lexeme( @lexemes );
}
7cd9f181 384
4d9593df 385## Utility methods
386
# JSON representation of a reading: just its text.
sub TO_JSON {
    my ($self) = @_;
    return $self->text;
}
391
# Raise a Text::Tradition::Error with the ident 'Reading error' and the
# first argument as its message.
sub throw {
    my ($message) = @_;
    Text::Tradition::Error->throw(
        'ident'   => 'Reading error',
        'message' => $message,
    );
}
4d9593df 398
399no Moose;
400__PACKAGE__->meta->make_immutable;
401
021bdbac 4021;