small fix to compress_readings
[scpubgit/stemmatology.git] / lib / Text / Tradition / Collation / Reading.pm
CommitLineData
784877d9 1package Text::Tradition::Collation::Reading;
2
8e1394aa 3use Moose;
10e4b1ac 4use Moose::Util::TypeConstraints;
7604424b 5use JSON qw/ from_json /;
6ad2ce78 6use Module::Load;
70745e70 7use Text::Tradition::Error;
10e4b1ac 8use XML::Easy::Syntax qw( $xml10_name_rx $xml10_namestartchar_rx );
7cd9f181 9use YAML::XS;
e4b0f464 10use overload '""' => \&_stringify, 'fallback' => 1;
784877d9 11
# Type constraint for reading identifiers: a string that, taken as a whole,
# matches $xml10_name_rx (imported from XML::Easy::Syntax).
subtype 'ReadingID'
    => as 'Str'
    => where { /\A$xml10_name_rx\z/ }
    => message { 'Reading ID must be a valid XML attribute string' };
16
17no Moose::Util::TypeConstraints;
18
=head1 NAME

Text::Tradition::Collation::Reading - represents a reading (usually a word)
in a collation.
23
=head1 DESCRIPTION

Text::Tradition is a library for representation and analysis of collated
texts, particularly medieval ones.  A 'reading' refers to a unit of text,
usually a word, that appears in one or more witnesses (manuscripts) of the
tradition; the text of a given witness is composed of a set of readings in
a particular sequence.
784877d9 31
3a2ebbf4 32=head1 METHODS
1ca1163d 33
3a2ebbf4 34=head2 new
8e1394aa 35
4aea6e9b 36Creates a new reading in the given collation with the given attributes.
3a2ebbf4 37Options include:
94c00c71 38
3a2ebbf4 39=over 4
784877d9 40
4aea6e9b 41=item collation - The Text::Tradition::Collation object to which this
42reading belongs. Required.
e2902068 43
3a2ebbf4 44=item id - A unique identifier for this reading. Required.
910a0a6d 45
3a2ebbf4 46=item text - The word or other text of the reading.
784877d9 47
3a2ebbf4 48=item is_start - The reading is the starting point for the collation.
3265b0ce 49
3a2ebbf4 50=item is_end - The reading is the ending point for the collation.
784877d9 51
3a2ebbf4 52=item is_lacuna - The 'reading' represents a known gap in the text.
de51424a 53
4aea6e9b 54=item is_ph - A temporary placeholder for apparatus parsing purposes. Do
55not use unless you know what you are doing.
12720144 56
4aea6e9b 57=item rank - The sequence number of the reading. This should probably not
58be set manually.
d047cd52 59
3a2ebbf4 60=back
8e1394aa 61
3a2ebbf4 62One of 'text', 'is_start', 'is_end', or 'is_lacuna' is required.
8e1394aa 63
3a2ebbf4 64=head2 collation
94c00c71 65
3a2ebbf4 66=head2 id
94c00c71 67
3a2ebbf4 68=head2 text
4cdd82f1 69
3a2ebbf4 70=head2 is_start
4cdd82f1 71
3a2ebbf4 72=head2 is_end
4a8828f0 73
3a2ebbf4 74=head2 is_lacuna
4a8828f0 75
3a2ebbf4 76=head2 rank
4a8828f0 77
3a2ebbf4 78Accessor methods for the given attributes.
d047cd52 79
3a2ebbf4 80=cut
d047cd52 81
# The collation this reading belongs to.  Held as a weak reference; the
# 'required' flag is deliberately left switched off.
has collation => (
    isa      => 'Text::Tradition::Collation',
    is       => 'ro',
    weak_ref => 1,
    # required => 1,
);

# Identifier of the reading; must satisfy the ReadingID type constraint.
has id => (
    isa      => 'ReadingID',
    is       => 'ro',
    required => 1,
);

# The text of the reading.  Read-only through text(); changed through
# alter_text().
has text => (
    isa      => 'Str',
    is       => 'ro',
    writer   => 'alter_text',
    required => 1,
);

# Optional language name; has_language() reports whether it was set.
has language => (
    isa       => 'Str',
    is        => 'ro',
    predicate => 'has_language',
);
107
# Read-only boolean markers for the 'special' kinds of reading.  All four
# share one declaration; they are created in the order listed.
has [ qw( is_start is_end is_lacuna is_ph ) ] => (
    isa     => 'Bool',
    is      => 'ro',
    default => undef,
);

# Read/write boolean flag, unset by default.
has is_common => (
    isa     => 'Bool',
    is      => 'rw',
    default => undef,
);

# Integer rank of the reading; has_rank() tests for it and clear_rank()
# removes it.
has rank => (
    isa       => 'Int',
    is        => 'rw',
    predicate => 'has_rank',
    clearer   => 'clear_rank',
);
fd602649 144
## For morphological analysis

# Read/write boolean flags, unset by default.
has [ qw( grammar_invalid is_nonsense ) ] => (
    isa     => 'Bool',
    is      => 'rw',
    default => undef,
);

# Stored normal form of the reading, with a private predicate and clearer.
has normal_form => (
    isa       => 'Str',
    is        => 'rw',
    predicate => '_has_normal_form',
    clearer   => '_clear_normal_form',
);

# Holds the lexemes for the reading.  The array itself has no accessor; it
# is reached only through the delegated methods below.
has reading_lexemes => (
    traits  => ['Array'],
    isa     => 'ArrayRef[Text::Tradition::Collation::Reading::Lexeme]',
    default => sub { [] },
    handles => {
        lexeme        => 'get',
        lexemes       => 'elements',
        has_lexemes   => 'count',
        clear_lexemes => 'clear',
        add_lexeme    => 'push',
    },
);
179
## For prefix/suffix readings

# Boolean flag, read-only to the outside; changed via _set_join_prior().
has join_prior => (
    isa     => 'Bool',
    is      => 'ro',
    writer  => '_set_join_prior',
    default => undef,
);

# Boolean flag, read-only to the outside; changed via _set_join_next().
has join_next => (
    isa     => 'Bool',
    is      => 'ro',
    writer  => '_set_join_next',
    default => undef,
);
195
3a2ebbf4 196
# Massage the constructor arguments before Moose processes them:
#  - accept either a single hashref or a flat key/value list;
#  - fill in the marker text (and the fixed ID / rank where applicable) for
#    lacuna, start, end and placeholder readings;
#  - rewrite legacy IDs so that they satisfy the ReadingID constraint.
# Returns whatever the wrapped BUILDARGS returns for the adjusted arguments.
around BUILDARGS => sub {
    my $orig  = shift;
    my $class = shift;

    my $args;
    if( @_ == 1 ) {
        # A lone non-hashref argument is passed on to the wrapped BUILDARGS
        # unchanged, so that it can report the problem itself instead of
        # this code dying on a bad dereference.
        return $class->$orig( @_ ) unless ref( $_[0] ) eq 'HASH';
        # Work on a shallow copy so the caller's hash is not modified.
        $args = { %{ $_[0] } };
    } else {
        $args = { @_ };
    }

    # If one of our special booleans is set, we change the text and the
    # ID to match.
    if( $args->{'is_lacuna'} && !exists $args->{'text'} ) {
        $args->{'text'} = '#LACUNA#';
    } elsif( $args->{'is_start'} ) {
        $args->{'id'} = '__START__';  # Change the ID to ensure we have only one
        $args->{'text'} = '#START#';
        $args->{'rank'} = 0;
    } elsif( $args->{'is_end'} ) {
        $args->{'id'} = '__END__';    # Change the ID to ensure we have only one
        $args->{'text'} = '#END#';
    } elsif( $args->{'is_ph'} ) {
        $args->{'text'} = $args->{'id'};
    }

    # Backwards compatibility for non-XMLname IDs.  Only an ID that was
    # actually supplied is rewritten: normalizing an undefined value would
    # turn it into the string 'r' and hide the fact that the required 'id'
    # argument is missing.
    if( defined $args->{'id'} ) {
        my $rid = $args->{'id'};
        $rid =~ s/\#/__/g;
        $rid =~ s/[\/,]/./g;
        if( $rid !~ /^$xml10_namestartchar_rx/ ) {
            $rid = 'r'.$rid;
        }
        $args->{'id'} = $rid;
    }

    return $class->$orig( $args );
};
233
# Post-construction hook.  When the constructor arguments contain a
# 'lexemes' entry, its value is handed to _deserialize_lexemes.
sub BUILD {
    my( $self, $init_args ) = @_;
    $self->_deserialize_lexemes( $init_args->{'lexemes'} )
        if exists $init_args->{'lexemes'};
}
241
# Make normal_form default to text, transparently.
#  - Called with a value equal to the reading text: the stored normal form
#    is cleared and the value is returned.
#  - Called without a usable value while nothing is stored: the reading
#    text is returned.
#  - Otherwise the call goes through to the generated accessor.
# A value counts as usable when it is defined and non-empty.  Plain
# truthiness is not enough: the string '0' is false, so normal_form('0')
# used to be treated as a read and was never stored when no normal form
# had been set yet.
around 'normal_form' => sub {
    my $orig = shift;
    my $self = shift;
    my( $arg ) = @_;
    my $has_value = defined $arg && length $arg;
    if( $has_value && $arg eq $self->text ) {
        $self->_clear_normal_form;
        return $arg;
    } elsif( !$has_value && !$self->_has_normal_form ) {
        return $self->text;
    } else {
        return $self->$orig( @_ );
    }
};
256
=head2 is_meta

A meta attribute (ha ha), which is true if any of our 'special' booleans
(is_start, is_end, is_lacuna or is_ph) is true.  Implies that the reading
does not represent a bit of text found in a witness.
262
263=cut
264
# Returns the first true value among is_start, is_end and is_lacuna, or
# else whatever is_ph returns.
sub is_meta {
    my( $self ) = @_;
    foreach my $flag ( qw( is_start is_end is_lacuna ) ) {
        my $value = $self->$flag;
        return $value if $value;
    }
    return $self->is_ph;
}
269
027d819c 270=head1 Convenience methods
271
272=head2 related_readings
273
274Calls Collation's related_readings with $self as the first argument.
275
276=cut
277
# Delegates to the collation's related_readings, putting this reading in
# front of whatever arguments the caller supplied.
sub related_readings {
    my $self = shift;
    my $collation = $self->collation;
    return $collation->related_readings( $self, @_ );
}
282
7f52eac8 283=head2 witnesses
284
285Calls Collation's reading_witnesses with $self as the first argument.
286
287=cut
288
# Delegates to the collation's reading_witnesses, putting this reading in
# front of whatever arguments the caller supplied.
sub witnesses {
    my $self = shift;
    my $collation = $self->collation;
    return $collation->reading_witnesses( $self, @_ );
}
293
027d819c 294=head2 predecessors
295
296Returns a list of Reading objects that immediately precede $self in the collation.
297
298=cut
299
# Asks the collation's sequence object for the predecessors of this
# reading's ID and maps each returned ID through the collation's reading().
sub predecessors {
    my( $self ) = @_;
    my $collation = $self->collation;
    return map { $collation->reading( $_ ) }
        $collation->sequence->predecessors( $self->id );
}
305
027d819c 306=head2 successors
307
308Returns a list of Reading objects that immediately follow $self in the collation.
309
310=cut
311
# Asks the collation's sequence object for the successors of this reading's
# ID and maps each returned ID through the collation's reading().
sub successors {
    my( $self ) = @_;
    my $collation = $self->collation;
    return map { $collation->reading( $_ ) }
        $collation->sequence->successors( $self->id );
}
317
027d819c 318=head2 set_identical( $other_reading)
319
320Backwards compatibility method, to add a transposition relationship
321between $self and $other_reading. Don't use this.
322
323=cut
324
# Asks the collation to add a relationship of type 'transposition' between
# this reading and $other, and returns the collation's result.
sub set_identical {
    my( $self, $other ) = @_;
    my %relationship = ( 'type' => 'transposition' );
    return $self->collation->add_relationship( $self, $other, \%relationship );
}
330
# Handler for the overloaded '""' operator: a reading stringifies to its ID.
sub _stringify {
    my( $reading ) = @_;
    return $reading->id;
}
d047cd52 335
4d9593df 336=head1 MORPHOLOGY
337
7cd9f181 338Methods for the morphological information (if any) attached to readings.
339A reading may be made up of multiple lexemes; the concatenated lexeme
340strings ought to match the reading's normalized form.
341
342See L<Text::Tradition::Collation::Reading::Lexeme> for more information
343on Lexeme objects and their attributes.
344
345=head2 has_lexemes
346
347Returns a true value if the reading has any attached lexemes.
4d9593df 348
6ad2ce78 349=head2 lexemes
06e7cbc7 350
7cd9f181 351Returns the Lexeme objects (if any) attached to the reading.
6ad2ce78 352
353=head2 clear_lexemes
354
7cd9f181 355Wipes any associated Lexeme objects out of the reading.
356
357=head2 add_lexeme( $lexobj )
6ad2ce78 358
7cd9f181 359Adds the Lexeme in $lexobj to the list of lexemes.
360
361=head2 lemmatize
362
363If the language of the reading is set, this method will use the appropriate
364Language model to determine the lexemes that belong to this reading. See
365L<Text::Tradition::lemmatize> if you wish to lemmatize an entire tradition.
06e7cbc7 366
4d9593df 367=cut
368
# Loads the module Text::Tradition::Language::<language> and calls its
# reading_lookup function with this reading as the only argument, returning
# whatever that call returns.  Without a language set, a warning is issued
# and nothing is returned.
sub lemmatize {
    my( $self ) = @_;
    if( !$self->has_language ) {
        warn "Please set a language to lemmatize a tradition";
        return;
    }
    my $langmod = join( '::', 'Text::Tradition::Language', $self->language );
    load( $langmod );
    my $lookup = $langmod->can( 'reading_lookup' );
    return $lookup->( $self );
}
4d9593df 380
# For graph serialization.  Returns a JSON array built from the reading's
# lexemes, using an encoder with allow_blessed and convert_blessed enabled.
sub _serialize_lexemes {
    my( $self ) = @_;
    my @lexemes = $self->lexemes;
    my $encoder = JSON->new;
    $encoder->allow_blessed(1);
    $encoder->convert_blessed(1);
    return $encoder->encode( \@lexemes );
}
70745e70 388
# Given a JSON representation of the lexemes (an array), instantiate a
# Lexeme object for each element and make those the reading's lexemes,
# replacing any that were attached before.  An empty array leaves the
# reading untouched.
sub _deserialize_lexemes {
    my( $self, $json ) = @_;
    my $lexdata = from_json( $json );
    return unless @$lexdata;

    # Need to have the lexeme module in order to have lexemes.
    # NOTE(review): 'use' takes effect at compile time, so this eval neither
    # delays the load nor traps a failure of it; kept as it stands so that
    # the module is still loaded together with this file.
    eval { use Text::Tradition::Collation::Reading::Lexeme; };
    throw( $@ ) if $@;

    # Good to go - build the lexemes, then swap them in.
    my @lexemes = map {
        Text::Tradition::Collation::Reading::Lexeme->new( 'JSON' => $_ )
    } @$lexdata;
    $self->clear_lexemes;
    return $self->add_lexeme( @lexemes );
}
7cd9f181 409
# Returns 0 when the reading has no lexemes; otherwise a true value if
# every lexeme reports is_disambiguated, and a false value if any does not.
sub disambiguated {
    my( $self ) = @_;
    return 0 unless $self->has_lexemes;

    # Every lexeme is consulted, exactly as a grep over the list would do.
    my $pending = 0;
    foreach my $lexeme ( $self->lexemes ) {
        $pending++ unless $lexeme->is_disambiguated;
    }
    return !$pending;
}
415
4d9593df 416## Utility methods
417
# Hook used by JSON encoders with convert_blessed enabled: a reading is
# represented by its text.
sub TO_JSON {
    my( $reading ) = @_;
    return $reading->text;
}
422
# Raises a Text::Tradition::Error with ident 'Reading error', using the
# first argument as the message.
sub throw {
    my( $message ) = @_;
    Text::Tradition::Error->throw(
        'ident'   => 'Reading error',
        'message' => $message,
    );
}
4d9593df 429
# Drop the imported Moose keywords from this package, then make the
# metaclass immutable.
no Moose;
__PACKAGE__->meta->make_immutable();

# A module file has to end with a true value.
1;