a little script cleanup
[scpubgit/stemmatology.git] / base / lib / Text / Tradition / Collation.pm
CommitLineData
dd3b58b0 1package Text::Tradition::Collation;
d047cd52 2
6771a1b1 3use feature 'say';
910a0a6d 4use Encode qw( decode_utf8 );
5use File::Temp;
bfcbcecb 6use File::Which;
c9bf3dbf 7use Graph;
8e1394aa 8use IPC::Run qw( run binary );
82fa4d57 9use Text::CSV;
5c0072ef 10use Text::Tradition::Collation::Data;
b15511bf 11use Text::Tradition::Collation::Reading;
22222af9 12use Text::Tradition::Collation::RelationshipStore;
63778331 13use Text::Tradition::Error;
cc31ebaa 14use XML::Easy::Syntax qw( $xml10_namestartchar_rx $xml10_namechar_rx );
5c0072ef 15use XML::LibXML;
16use XML::LibXML::XPathContext;
dd3b58b0 17use Moose;
18
5c0072ef 19has _data => (
20 isa => 'Text::Tradition::Collation::Data',
21 is => 'ro',
22 required => 1,
23 handles => [ qw(
24 sequence
25 paths
26 _set_relations
27 relations
28 _set_start
29 _set_end
30 ac_label
31 has_cached_table
32 relationships
33 related_readings
34 get_relationship
35 del_relationship
36 equivalence
37 equivalence_graph
38 readings
39 reading
40 _add_reading
41 del_reading
42 has_reading
43 wit_list_separator
44 baselabel
45 linear
46 wordsep
47 start
48 end
49 cached_table
50 _graphcalc_done
51 has_cached_svg
52 wipe_table
53 )]
54);
dd3b58b0 55
3a2ebbf4 56has 'tradition' => (
57 is => 'ro',
d047cd52 58 isa => 'Text::Tradition',
8cfd99c4 59 writer => '_set_tradition',
8d9a1cd8 60 weak_ref => 1,
d047cd52 61 );
dd3b58b0 62
4e5a7b2c 63=head1 NAME
64
65Text::Tradition::Collation - a software model for a text collation
66
67=head1 SYNOPSIS
68
69 use Text::Tradition;
70 my $t = Text::Tradition->new(
71 'name' => 'this is a text',
72 'input' => 'TEI',
73 'file' => '/path/to/tei_parallel_seg_file.xml' );
74
75 my $c = $t->collation;
76 my @readings = $c->readings;
77 my @paths = $c->paths;
78 my @relationships = $c->relationships;
79
80 my $svg_variant_graph = $t->collation->as_svg();
81
82=head1 DESCRIPTION
83
84Text::Tradition is a library for representation and analysis of collated
85texts, particularly medieval ones. The Collation is the central feature of
86a Tradition, where the text, its sequence of readings, and its relationships
87between readings are actually kept.
88
89=head1 CONSTRUCTOR
90
91=head2 new
92
93The constructor. Takes a hash or hashref of the following arguments:
94
95=over
96
97=item * tradition - The Text::Tradition object to which the collation
98belongs. Required.
99
100=item * linear - Whether the collation should be linear; that is, whether
101transposed readings should be treated as two linked readings rather than one,
102and therefore whether the collation graph is acyclic. Defaults to true.
103
4e5a7b2c 104=item * baselabel - The default label for the path taken by a base text
105(if any). Defaults to 'base text'.
106
107=item * wit_list_separator - The string to join a list of witnesses for
108purposes of making labels in display graphs. Defaults to ', '.
109
110=item * ac_label - The extra label to tack onto a witness sigil when
111representing another layer of path for the given witness - that is, when
112a text has more than one possible reading due to scribal corrections or
113the like. Defaults to ' (a.c.)'.
114
4e483aa5 115=item * wordsep - The string used to separate words in the original text.
116Defaults to ' '.
117
4e5a7b2c 118=back
119
120=head1 ACCESSORS
121
122=head2 tradition
123
124=head2 linear
125
4e5a7b2c 126=head2 wit_list_separator
127
128=head2 baselabel
129
130=head2 ac_label
131
4e483aa5 132=head2 wordsep
133
4e5a7b2c 134Simple accessors for collation attributes.
135
136=head2 start
137
138The meta-reading at the start of every witness path.
139
140=head2 end
141
142The meta-reading at the end of every witness path.
143
144=head2 readings
145
146Returns all Reading objects in the graph.
147
148=head2 reading( $id )
149
150Returns the Reading object corresponding to the given ID.
151
152=head2 add_reading( $reading_args )
153
154Adds a new reading object to the collation.
155See L<Text::Tradition::Collation::Reading> for the available arguments.
156
157=head2 del_reading( $object_or_id )
158
159Removes the given reading from the collation, implicitly removing its
160paths and relationships.
161
4e5a7b2c 162=head2 has_reading( $id )
163
164Predicate to see whether a given reading ID is in the graph.
165
166=head2 reading_witnesses( $object_or_id )
167
168Returns a list of sigils whose witnesses contain the reading.
169
170=head2 paths
171
172Returns all reading paths within the document - that is, all edges in the
173collation graph. Each path is an arrayref of [ $source, $target ] reading IDs.
174
175=head2 add_path( $source, $target, $sigil )
176
177Links the given readings in the collation in sequence, under the given witness
178sigil. The readings may be specified by object or ID.
179
180=head2 del_path( $source, $target, $sigil )
181
182Links the given readings in the collation in sequence, under the given witness
183sigil. The readings may be specified by object or ID.
184
185=head2 has_path( $source, $target );
186
187Returns true if the two readings are linked in sequence in any witness.
188The readings may be specified by object or ID.
189
190=head2 relationships
191
192Returns all Relationship objects in the collation.
193
194=head2 add_relationship( $reading, $other_reading, $options )
195
196Adds a new relationship of the type given in $options between the two readings,
197which may be specified by object or ID. Returns a value of ( $status, @vectors)
198where $status is true on success, and @vectors is a list of relationship edges
199that were ultimately added.
200See L<Text::Tradition::Collation::Relationship> for the available options.
201
202=cut
dd3b58b0 203
5c0072ef 204sub BUILDARGS {
205 my ( $class, @args ) = @_;
206 my %args = @args == 1 ? %{ $args[0] } : @args;
207 # TODO determine these from the Moose::Meta object
208 my @delegate_attrs = qw(sequence relations readings wit_list_separator baselabel
209 linear wordsep start end cached_table _graphcalc_done);
210 my %data_args;
211 for my $attr (@delegate_attrs) {
212 $data_args{$attr} = delete $args{$attr} if exists $args{$attr};
213 }
214 $args{_data} = Text::Tradition::Collation::Data->new(%data_args);
215 return \%args;
216}
217
d047cd52 218sub BUILD {
3a2ebbf4 219 my $self = shift;
22222af9 220 $self->_set_relations( Text::Tradition::Collation::RelationshipStore->new( 'collation' => $self ) );
e4b73942 221 $self->_set_start( $self->add_reading(
222 { 'collation' => $self, 'is_start' => 1, 'init' => 1 } ) );
223 $self->_set_end( $self->add_reading(
224 { 'collation' => $self, 'is_end' => 1, 'init' => 1 } ) );
d047cd52 225}
784877d9 226
24efa55d 227sub register_relationship_type {
228 my $self = shift;
229 my %args = @_ == 1 ? %{$_[0]} : @_;
230 if( $self->relations->has_type( $args{name} ) ) {
231 throw( 'Relationship type ' . $args{name} . ' already registered' );
232 }
233 $self->relations->add_type( %args );
234}
235
3a2ebbf4 236### Reading construct/destruct functions
237
238sub add_reading {
239 my( $self, $reading ) = @_;
240 unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
241 my %args = %$reading;
e4b73942 242 if( $args{'init'} ) {
243 # If we are initializing an empty collation, don't assume that we
244 # have set a tradition.
245 delete $args{'init'};
8943ff68 246 } elsif( $self->tradition->can('language') && $self->tradition->has_language
247 && !exists $args{'language'} ) {
6ad2ce78 248 $args{'language'} = $self->tradition->language;
249 }
3a2ebbf4 250 $reading = Text::Tradition::Collation::Reading->new(
251 'collation' => $self,
252 %args );
253 }
254 # First check to see if a reading with this ID exists.
255 if( $self->reading( $reading->id ) ) {
63778331 256 throw( "Collation already has a reading with id " . $reading->id );
3a2ebbf4 257 }
c1915ab9 258 $self->_graphcalc_done(0);
3a2ebbf4 259 $self->_add_reading( $reading->id => $reading );
260 # Once the reading has been added, put it in both graphs.
261 $self->sequence->add_vertex( $reading->id );
cecbe56d 262 $self->relations->add_reading( $reading->id );
3a2ebbf4 263 return $reading;
eca16057 264};
265
3a2ebbf4 266around del_reading => sub {
267 my $orig = shift;
268 my $self = shift;
269 my $arg = shift;
270
cecbe56d 271 if( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
272 $arg = $arg->id;
3a2ebbf4 273 }
3a2ebbf4 274 # Remove the reading from the graphs.
c1915ab9 275 $self->_graphcalc_done(0);
4e483aa5 276 $self->_clear_cache; # Explicitly clear caches to GC the reading
cecbe56d 277 $self->sequence->delete_vertex( $arg );
278 $self->relations->delete_reading( $arg );
3a2ebbf4 279
280 # Carry on.
cecbe56d 281 $self->$orig( $arg );
3a2ebbf4 282};
7854e12e 283
3c234eb6 284=head2 merge_readings( $main, $second, $concatenate, $with_str )
285
286Merges the $second reading into the $main one. If $concatenate is true, then
287the merged node will carry the text of both readings, concatenated with either
288$with_str (if specified) or a sensible default (the empty string if the
289appropriate 'join_*' flag is set on either reading, or else $self->wordsep.)
290
291The first two arguments may be either readings or reading IDs.
292
4e483aa5 293=begin testing
294
295use Text::Tradition;
296
297my $cxfile = 't/data/Collatex-16.xml';
298my $t = Text::Tradition->new(
299 'name' => 'inline',
300 'input' => 'CollateX',
301 'file' => $cxfile,
302 );
303my $c = $t->collation;
304
305my $rno = scalar $c->readings;
4ef65ab4 306# Split n21 ('unto') for testing purposes
4e483aa5 307my $new_r = $c->add_reading( { 'id' => 'n21p0', 'text' => 'un', 'join_next' => 1 } );
308my $old_r = $c->reading( 'n21' );
309$old_r->alter_text( 'to' );
310$c->del_path( 'n20', 'n21', 'A' );
311$c->add_path( 'n20', 'n21p0', 'A' );
312$c->add_path( 'n21p0', 'n21', 'A' );
7a0956c1 313$c->add_relationship( 'n21', 'n22', { type => 'collated', scope => 'local' } );
4e483aa5 314$c->flatten_ranks();
315ok( $c->reading( 'n21p0' ), "New reading exists" );
316is( scalar $c->readings, $rno, "Reading add offset by flatten_ranks" );
317
679f17e1 318# Combine n3 and n4 ( with his )
4e483aa5 319$c->merge_readings( 'n3', 'n4', 1 );
320ok( !$c->reading('n4'), "Reading n4 is gone" );
321is( $c->reading('n3')->text, 'with his', "Reading n3 has both words" );
322
679f17e1 323# Collapse n9 and n10 ( rood / root )
324$c->merge_readings( 'n9', 'n10' );
325ok( !$c->reading('n10'), "Reading n10 is gone" );
326is( $c->reading('n9')->text, 'rood', "Reading n9 has an unchanged word" );
4e483aa5 327
328# Combine n21 and n21p0
329my $remaining = $c->reading('n21');
330$remaining ||= $c->reading('n22'); # one of these should still exist
331$c->merge_readings( 'n21p0', $remaining, 1 );
332ok( !$c->reading('n21'), "Reading $remaining is gone" );
333is( $c->reading('n21p0')->text, 'unto', "Reading n21p0 merged correctly" );
334
335=end testing
336
337=cut
7854e12e 338
3a2ebbf4 339sub merge_readings {
340 my $self = shift;
341
56772e8c 342 # Sanity check
343 my( $kept_obj, $del_obj, $combine, $combine_char ) = $self->_objectify_args( @_ );
344 my $mergemeta = $kept_obj->is_meta;
345 throw( "Cannot merge meta and non-meta reading" )
346 unless ( $mergemeta && $del_obj->is_meta )
347 || ( !$mergemeta && !$del_obj->is_meta );
348 if( $mergemeta ) {
349 throw( "Cannot merge with start or end node" )
350 if( $kept_obj eq $self->start || $kept_obj eq $self->end
351 || $del_obj eq $self->start || $del_obj eq $self->end );
a445ce40 352 throw( "Cannot combine text of meta readings" ) if $combine;
56772e8c 353 }
3a2ebbf4 354 # We only need the IDs for adding paths to the graph, not the reading
355 # objects themselves.
56772e8c 356 my $kept = $kept_obj->id;
357 my $deleted = $del_obj->id;
c1915ab9 358 $self->_graphcalc_done(0);
10943ab0 359
3a2ebbf4 360 # The kept reading should inherit the paths and the relationships
361 # of the deleted reading.
362 foreach my $path ( $self->sequence->edges_at( $deleted ) ) {
363 my @vector = ( $kept );
364 push( @vector, $path->[1] ) if $path->[0] eq $deleted;
365 unshift( @vector, $path->[0] ) if $path->[1] eq $deleted;
49d4f2ac 366 next if $vector[0] eq $vector[1]; # Don't add a self loop
3a2ebbf4 367 my %wits = %{$self->sequence->get_edge_attributes( @$path )};
368 $self->sequence->add_edge( @vector );
369 my $fwits = $self->sequence->get_edge_attributes( @vector );
370 @wits{keys %$fwits} = values %$fwits;
371 $self->sequence->set_edge_attributes( @vector, \%wits );
372 }
cecbe56d 373 $self->relations->merge_readings( $kept, $deleted, $combine );
3a2ebbf4 374
375 # Do the deletion deed.
4e483aa5 376 if( $combine ) {
869a1ada 377 # Combine the text of the readings
4e483aa5 378 my $joinstr = $combine_char;
379 unless( defined $joinstr ) {
380 $joinstr = '' if $kept_obj->join_next || $del_obj->join_prior;
381 $joinstr = $self->wordsep unless defined $joinstr;
382 }
a445ce40 383 $kept_obj->_combine( $del_obj, $joinstr );
49d4f2ac 384 }
3a2ebbf4 385 $self->del_reading( $deleted );
386}
7854e12e 387
6771a1b1 388=head2 compress_readings
389
390Where possible in the graph, compresses plain sequences of readings into a
391single reading. The sequences must consist of readings with no
392relationships to other readings, with only a single witness path between
393them and no other witness paths from either that would skip the other. The
394readings must also not be marked as nonsense or bad grammar.
395
396WARNING: This operation cannot be undone.
397
398=cut
399
400sub compress_readings {
401 my $self = shift;
402 # Anywhere in the graph that there is a reading that joins only to a single
403 # successor, and neither of these have any relationships, just join the two
404 # readings.
6771a1b1 405 foreach my $rdg ( sort { $a->rank <=> $b->rank } $self->readings ) {
428bcf0b 406 # Now look for readings that can be joined to their successors.
a445ce40 407 next unless $rdg->is_combinable;
6771a1b1 408 my %seen;
409 while( $self->sequence->successors( $rdg ) == 1 ) {
410 my( $next ) = $self->reading( $self->sequence->successors( $rdg ) );
411 throw( "Infinite loop" ) if $seen{$next->id};
412 $seen{$next->id} = 1;
413 last if $self->sequence->predecessors( $next ) > 1;
a445ce40 414 last unless $next->is_combinable;
6771a1b1 415 say "Joining readings $rdg and $next";
416 $self->merge_readings( $rdg, $next, 1 );
417 }
418 }
419 # Make sure we haven't screwed anything up
420 foreach my $wit ( $self->tradition->witnesses ) {
421 my $pathtext = $self->path_text( $wit->sigil );
422 my $origtext = join( ' ', @{$wit->text} );
423 throw( "Text differs for witness " . $wit->sigil )
424 unless $pathtext eq $origtext;
425 if( $wit->is_layered ) {
426 $pathtext = $self->path_text( $wit->sigil.$self->ac_label );
427 $origtext = join( ' ', @{$wit->layertext} );
428 throw( "Ante-corr text differs for witness " . $wit->sigil )
429 unless $pathtext eq $origtext;
430 }
431 }
432
433 $self->relations->rebuild_equivalence();
434 $self->calculate_ranks();
435}
3265b0ce 436
3a2ebbf4 437# Helper function for manipulating the graph.
438sub _stringify_args {
4e483aa5 439 my( $self, $first, $second, @args ) = @_;
3a2ebbf4 440 $first = $first->id
441 if ref( $first ) eq 'Text::Tradition::Collation::Reading';
442 $second = $second->id
443 if ref( $second ) eq 'Text::Tradition::Collation::Reading';
4e483aa5 444 return( $first, $second, @args );
3a2ebbf4 445}
df6d9812 446
4e5a7b2c 447# Helper function for manipulating the graph.
448sub _objectify_args {
449 my( $self, $first, $second, $arg ) = @_;
450 $first = $self->reading( $first )
451 unless ref( $first ) eq 'Text::Tradition::Collation::Reading';
452 $second = $self->reading( $second )
453 unless ref( $second ) eq 'Text::Tradition::Collation::Reading';
454 return( $first, $second, $arg );
455}
f97ef19e 456
457=head2 duplicate_reading( $reading, @witlist )
458
459Split the given reading into two, so that the new reading is in the path for
ef73c20a 460the witnesses given in @witlist. If the result is that certain non-colocated
461relationships (e.g. transpositions) are no longer valid, these will be removed.
462Returns the newly-created reading.
f97ef19e 463
464=begin testing
465
466use Text::Tradition;
467
468my $st = Text::Tradition->new( 'input' => 'Self', 'file' => 't/data/collatecorr.xml' );
469is( ref( $st ), 'Text::Tradition', "Got a tradition from test file" );
470ok( $st->has_witness('Ba96'), "Tradition has the affected witness" );
471
472my $sc = $st->collation;
473my $numr = 17;
474ok( $sc->reading('n131'), "Tradition has the affected reading" );
475is( scalar( $sc->readings ), $numr, "There are $numr readings in the graph" );
476is( $sc->end->rank, 14, "There are fourteen ranks in the graph" );
477
478# Detach the erroneously collated reading
ef73c20a 479my $newr = $sc->duplicate_reading( 'n131', 'Ba96' );
480ok( $newr, "New reading was created" );
f97ef19e 481ok( $sc->reading('n131_0'), "Detached the bad collation with a new reading" );
482is( scalar( $sc->readings ), $numr + 1, "A reading was added to the graph" );
483is( $sc->end->rank, 10, "There are now only ten ranks in the graph" );
3c234eb6 484my $csucc = $sc->common_successor( 'n131', 'n131_0' );
485is( $csucc->id, 'n136', "Found correct common successor to duped reading" );
f97ef19e 486
487# Check that the bad transposition is gone
488is( $sc->get_relationship( 'n130', 'n135' ), undef, "Bad transposition relationship is gone" );
489
e19635f8 490# The collation should not be fixed
491my @pairs = $sc->identical_readings();
492is( scalar @pairs, 0, "Not re-collated yet" );
f97ef19e 493# Fix the collation
3c234eb6 494ok( $sc->merge_readings( 'n124', 'n131_0' ), "Collated the readings correctly" );
e19635f8 495@pairs = $sc->identical_readings( start => 'n124', end => $csucc->id );
3c234eb6 496is( scalar @pairs, 3, "Found three more identical readings" );
e19635f8 497is( $sc->end->rank, 11, "The ranks shifted appropriately" );
3c234eb6 498$sc->flatten_ranks();
f97ef19e 499is( scalar( $sc->readings ), $numr - 3, "Now we are collated correctly" );
500
501=end testing
502
503=cut
504
505sub duplicate_reading {
506 my( $self, $r, @wits ) = @_;
507 # Add the new reading, duplicating $r.
508 unless( ref( $r ) eq 'Text::Tradition::Collation::Reading' ) {
509 $r = $self->reading( $r );
510 }
511 throw( "Cannot duplicate a meta-reading" )
512 if $r->is_meta;
513
514 # Get all the reading attributes and duplicate them.
515 my $rmeta = Text::Tradition::Collation::Reading->meta;
516 my %args;
517 foreach my $attr( $rmeta->get_all_attributes ) {
518 next if $attr->name =~ /^_/;
519 my $acc = $attr->get_read_method;
520 if( !$acc && $attr->has_applied_traits ) {
521 my $tr = $attr->applied_traits;
522 if( $tr->[0] =~ /::(Array|Hash)$/ ) {
523 my $which = $1;
524 my %methods = reverse %{$attr->handles};
525 $acc = $methods{elements};
526 $args{$attr->name} = $which eq 'Array'
527 ? [ $r->$acc ] : { $r->$acc };
528 }
529 } else {
530 $args{$attr->name} = $r->$acc if $acc;
531 }
532 }
533 # By definition the new reading will no longer be common.
534 $args{is_common} = 0;
535 # The new reading also needs its own ID.
536 $args{id} = $self->_generate_dup_id( $r->id );
537
538 # Try to make the new reading.
539 my $newr = $self->add_reading( \%args );
540 # The old reading is also no longer common.
541 $r->is_common( 0 );
542
543 # For each of the witnesses, dissociate from the old reading and
544 # associate with the new.
545 foreach my $wit ( @wits ) {
546 my $prior = $self->prior_reading( $r, $wit );
547 my $next = $self->next_reading( $r, $wit );
548 $self->del_path( $prior, $r, $wit );
549 $self->add_path( $prior, $newr, $wit );
550 $self->del_path( $r, $next, $wit );
551 $self->add_path( $newr, $next, $wit );
552 }
553
3c234eb6 554 # If the graph is ranked, we need to look for relationships that are now
555 # invalid (i.e. 'non-colocation' types that might now be colocated) and
556 # remove them. If not, we can skip it.
f97ef19e 557 my $succ;
3c234eb6 558 my %rrk;
f97ef19e 559 if( $self->end->has_rank ) {
3c234eb6 560 # Find the point where we can stop checking
f97ef19e 561 $succ = $self->common_successor( $r, $newr );
3c234eb6 562
563 # Hash the existing ranks
f97ef19e 564 foreach my $rdg ( $self->readings ) {
565 $rrk{$rdg->id} = $rdg->rank;
566 }
3c234eb6 567 # Calculate the new ranks
568 $self->calculate_ranks();
f97ef19e 569
3c234eb6 570 # Check for invalid non-colocated relationships among changed-rank readings
571 # from where the ranks start changing up to $succ
f97ef19e 572 my $lastrank = $succ->rank;
573 foreach my $rdg ( $self->readings ) {
574 next if $rdg->rank > $lastrank;
575 next if $rdg->rank == $rrk{$rdg->id};
576 my @noncolo = $rdg->related_readings( sub { !$_[0]->colocated } );
577 next unless @noncolo;
578 foreach my $nc ( @noncolo ) {
579 $self->relations->verify_or_delete( $rdg, $nc );
580 }
581 }
ef73c20a 582 }
583 return $newr;
f97ef19e 584}
585
586sub _generate_dup_id {
587 my( $self, $rid ) = @_;
588 my $newid;
589 my $i = 0;
590 while( !$newid ) {
591 $newid = $rid."_$i";
592 if( $self->has_reading( $newid ) ) {
593 $newid = '';
594 $i++;
595 }
596 }
597 return $newid;
598}
599
3a2ebbf4 600### Path logic
601
602sub add_path {
603 my $self = shift;
604
605 # We only need the IDs for adding paths to the graph, not the reading
606 # objects themselves.
cecbe56d 607 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
3a2ebbf4 608
c1915ab9 609 $self->_graphcalc_done(0);
3a2ebbf4 610 # Connect the readings
359944f7 611 unless( $self->sequence->has_edge( $source, $target ) ) {
612 $self->sequence->add_edge( $source, $target );
613 $self->relations->add_equivalence_edge( $source, $target );
614 }
3a2ebbf4 615 # Note the witness in question
616 $self->sequence->set_edge_attribute( $source, $target, $wit, 1 );
359944f7 617}
b15511bf 618
3a2ebbf4 619sub del_path {
620 my $self = shift;
49d4f2ac 621 my @args;
622 if( ref( $_[0] ) eq 'ARRAY' ) {
623 my $e = shift @_;
624 @args = ( @$e, @_ );
625 } else {
626 @args = @_;
627 }
3a2ebbf4 628
f97ef19e 629 # We only need the IDs for removing paths from the graph, not the reading
3a2ebbf4 630 # objects themselves.
49d4f2ac 631 my( $source, $target, $wit ) = $self->_stringify_args( @args );
3a2ebbf4 632
c1915ab9 633 $self->_graphcalc_done(0);
3a2ebbf4 634 if( $self->sequence->has_edge_attribute( $source, $target, $wit ) ) {
49d4f2ac 635 $self->sequence->delete_edge_attribute( $source, $target, $wit );
3a2ebbf4 636 }
30897024 637 unless( $self->sequence->has_edge_attributes( $source, $target ) ) {
3a2ebbf4 638 $self->sequence->delete_edge( $source, $target );
359944f7 639 $self->relations->delete_equivalence_edge( $source, $target );
3a2ebbf4 640 }
784877d9 641}
642
3a2ebbf4 643
15d2d3df 644# Extra graph-alike utility
645sub has_path {
3a2ebbf4 646 my $self = shift;
647 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
648 return undef unless $self->sequence->has_edge( $source, $target );
649 return $self->sequence->has_edge_attribute( $source, $target, $wit );
b15511bf 650}
651
4e5a7b2c 652=head2 clear_witness( @sigil_list )
3a2ebbf4 653
4e5a7b2c 654Clear the given witnesses out of the collation entirely, removing references
655to them in paths, and removing readings that belong only to them. Should only
656be called via $tradition->del_witness.
3a2ebbf4 657
658=cut
659
4e5a7b2c 660sub clear_witness {
661 my( $self, @sigils ) = @_;
662
c1915ab9 663 $self->_graphcalc_done(0);
4e5a7b2c 664 # Clear the witness(es) out of the paths
665 foreach my $e ( $self->paths ) {
666 foreach my $sig ( @sigils ) {
667 $self->del_path( $e, $sig );
668 }
669 }
670
671 # Clear out the newly unused readings
672 foreach my $r ( $self->readings ) {
673 unless( $self->reading_witnesses( $r ) ) {
674 $self->del_reading( $r );
675 }
676 }
677}
3a2ebbf4 678
679sub add_relationship {
680 my $self = shift;
22222af9 681 my( $source, $target, $opts ) = $self->_stringify_args( @_ );
414cc046 682 my( @vectors ) = $self->relations->add_relationship( $source, $target, $opts );
864ee4bf 683 foreach my $v ( @vectors ) {
684 next unless $self->get_relationship( $v )->colocated;
685 if( $self->reading( $v->[0] )->has_rank && $self->reading( $v->[1] )->has_rank
686 && $self->reading( $v->[0] )->rank ne $self->reading( $v->[1] )->rank ) {
687 $self->_graphcalc_done(0);
688 $self->_clear_cache;
689 last;
690 }
691 }
63778331 692 return @vectors;
22222af9 693}
ef9d481f 694
ca6e6095 695around qw/ get_relationship del_relationship / => sub {
696 my $orig = shift;
697 my $self = shift;
698 my @args = @_;
699 if( @args == 1 && ref( $args[0] ) eq 'ARRAY' ) {
700 @args = @{$_[0]};
701 }
e19635f8 702 my @stringargs = $self->_stringify_args( @args );
703 $self->$orig( @stringargs );
ca6e6095 704};
705
22222af9 706=head2 reading_witnesses( $reading )
910a0a6d 707
22222af9 708Return a list of sigils corresponding to the witnesses in which the reading appears.
3265b0ce 709
22222af9 710=cut
1d310495 711
1d310495 712sub reading_witnesses {
713 my( $self, $reading ) = @_;
714 # We need only check either the incoming or the outgoing edges; I have
96dc90ec 715 # arbitrarily chosen "incoming". Thus, special-case the start node.
716 if( $reading eq $self->start ) {
bed6ce83 717 return map { $_->sigil } grep { $_->is_collated } $self->tradition->witnesses;
96dc90ec 718 }
1d310495 719 my %all_witnesses;
720 foreach my $e ( $self->sequence->edges_to( $reading ) ) {
721 my $wits = $self->sequence->get_edge_attributes( @$e );
722 @all_witnesses{ keys %$wits } = 1;
723 }
c12bb878 724 my $acstr = $self->ac_label;
725 foreach my $acwit ( grep { $_ =~ s/^(.*)\Q$acstr\E$/$1/ } keys %all_witnesses ) {
726 delete $all_witnesses{$acwit.$acstr} if exists $all_witnesses{$acwit};
727 }
1d310495 728 return keys %all_witnesses;
910a0a6d 729}
730
4e5a7b2c 731=head1 OUTPUT METHODS
8e1394aa 732
0ecb975c 733=head2 as_svg( \%options )
8e1394aa 734
0068967c 735Returns an SVG string that represents the graph, via as_dot and graphviz.
bfcbcecb 736See as_dot for a list of options. Must have GraphViz (dot) installed to run.
8e1394aa 737
738=cut
739
740sub as_svg {
0ecb975c 741 my( $self, $opts ) = @_;
bfcbcecb 742 throw( "Need GraphViz installed to output SVG" )
743 unless File::Which::which( 'dot' );
e247aad1 744 my $want_subgraph = exists $opts->{'from'} || exists $opts->{'to'};
1ff82d4f 745 $self->calculate_ranks()
746 unless( $self->_graphcalc_done || $opts->{'nocalc'} || !$self->linear );
be3af600 747 my @cmd = qw/dot -Tsvg/;
748 my( $svg, $err );
749 my $dotfile = File::Temp->new();
750 ## USE FOR DEBUGGING
751 # $dotfile->unlink_on_destroy(0);
752 binmode $dotfile, ':utf8';
753 print $dotfile $self->as_dot( $opts );
754 push( @cmd, $dotfile->filename );
755 run( \@cmd, ">", binary(), \$svg );
756 $svg = decode_utf8( $svg );
757 return $svg;
8e1394aa 758}
759
b22576c6 760
0ecb975c 761=head2 as_dot( \%options )
b22576c6 762
0ecb975c 763Returns a string that is the collation graph expressed in dot
764(i.e. GraphViz) format. Options include:
b22576c6 765
0ecb975c 766=over 4
b22576c6 767
0ecb975c 768=item * from
b22576c6 769
0ecb975c 770=item * to
df6d9812 771
0ecb975c 772=item * color_common
773
774=back
df6d9812 775
776=cut
777
778sub as_dot {
0ecb975c 779 my( $self, $opts ) = @_;
780 my $startrank = $opts->{'from'} if $opts;
781 my $endrank = $opts->{'to'} if $opts;
782 my $color_common = $opts->{'color_common'} if $opts;
b365fbae 783 my $STRAIGHTENHACK = !$startrank && !$endrank && $self->end->rank
784 && $self->end->rank > 100;
6648ee3d 785 $STRAIGHTENHACK = 1 if $opts->{'straight'}; # even for subgraphs or small graphs
b365fbae 786
b22576c6 787 # Check the arguments
788 if( $startrank ) {
789 return if $endrank && $startrank > $endrank;
790 return if $startrank > $self->end->rank;
791 }
792 if( defined $endrank ) {
793 return if $endrank < 0;
f1b3b33a 794 $endrank = undef if $endrank == $self->end->rank;
b22576c6 795 }
796
67da8d6c 797 my $graph_name = $self->tradition->name;
798 $graph_name =~ s/[^\w\s]//g;
799 $graph_name = join( '_', split( /\s+/, $graph_name ) );
f13b5582 800
801 my %graph_attrs = (
802 'rankdir' => 'LR',
803 'bgcolor' => 'none',
804 );
805 my %node_attrs = (
b8990398 806 'fontsize' => 14,
f13b5582 807 'fillcolor' => 'white',
808 'style' => 'filled',
809 'shape' => 'ellipse'
810 );
811 my %edge_attrs = (
812 'arrowhead' => 'open',
813 'color' => '#000000',
814 'fontcolor' => '#000000',
815 );
816
67da8d6c 817 my $dot = sprintf( "digraph %s {\n", $graph_name );
f13b5582 818 $dot .= "\tgraph " . _dot_attr_string( \%graph_attrs ) . ";\n";
819 $dot .= "\tnode " . _dot_attr_string( \%node_attrs ) . ";\n";
df6d9812 820
b22576c6 821 # Output substitute start/end readings if necessary
822 if( $startrank ) {
43c94341 823 $dot .= "\t\"__SUBSTART__\" [ label=\"...\",id=\"__START__\" ];\n";
b22576c6 824 }
825 if( $endrank ) {
43c94341 826 $dot .= "\t\"__SUBEND__\" [ label=\"...\",id=\"__END__\" ];\n";
b22576c6 827 }
b365fbae 828 if( $STRAIGHTENHACK ) {
829 ## HACK part 1
43c94341 830 my $startlabel = $startrank ? '__SUBSTART__' : '__START__';
831 $dot .= "\tsubgraph { rank=same \"$startlabel\" \"#SILENT#\" }\n";
b365fbae 832 $dot .= "\t\"#SILENT#\" [ shape=diamond,color=white,penwidth=0,label=\"\" ];"
833 }
b22576c6 834 my %used; # Keep track of the readings that actually appear in the graph
30ddc24c 835 # Sort the readings by rank if we have ranks; this speeds layout.
836 my @all_readings = $self->end->has_rank
837 ? sort { $a->rank <=> $b->rank } $self->readings
838 : $self->readings;
4633f9e4 839 # TODO Refrain from outputting lacuna nodes - just grey out the edges.
30ddc24c 840 foreach my $reading ( @all_readings ) {
b22576c6 841 # Only output readings within our rank range.
842 next if $startrank && $reading->rank < $startrank;
843 next if $endrank && $reading->rank > $endrank;
844 $used{$reading->id} = 1;
910a0a6d 845 # Need not output nodes without separate labels
3a2ebbf4 846 next if $reading->id eq $reading->text;
d4b75f44 847 my $rattrs;
30f0df34 848 my $label = $reading->text;
629e27b0 849 $label .= '-' if $reading->join_next;
850 $label = "-$label" if $reading->join_prior;
8f9cab7b 851 $label =~ s/\"/\\\"/g;
d4b75f44 852 $rattrs->{'label'} = $label;
10e4b1ac 853 $rattrs->{'id'} = $reading->id;
0ecb975c 854 $rattrs->{'fillcolor'} = '#b3f36d' if $reading->is_common && $color_common;
d4b75f44 855 $dot .= sprintf( "\t\"%s\" %s;\n", $reading->id, _dot_attr_string( $rattrs ) );
df6d9812 856 }
3a2ebbf4 857
30ddc24c 858 # Add the real edges. Need to weight one edge per rank jump, in a
859 # continuous line.
b365fbae 860 # my $weighted = $self->_add_edge_weights;
b22576c6 861 my @edges = $self->paths;
3bdec618 862 my( %substart, %subend );
b22576c6 863 foreach my $edge ( @edges ) {
864 # Do we need to output this edge?
508fd430 865 if( $used{$edge->[0]} && $used{$edge->[1]} ) {
027d819c 866 my $label = $self->_path_display_label( $self->path_witnesses( $edge ) );
f13b5582 867 my $variables = { %edge_attrs, 'label' => $label };
30ddc24c 868
b22576c6 869 # Account for the rank gap if necessary
30ddc24c 870 my $rank0 = $self->reading( $edge->[0] )->rank
871 if $self->reading( $edge->[0] )->has_rank;
872 my $rank1 = $self->reading( $edge->[1] )->rank
873 if $self->reading( $edge->[1] )->has_rank;
874 if( defined $rank0 && defined $rank1 && $rank1 - $rank0 > 1 ) {
875 $variables->{'minlen'} = $rank1 - $rank0;
876 }
877
878 # Add the calculated edge weights
b365fbae 879 # if( exists $weighted->{$edge->[0]}
e247aad1 880 # && $weighted->{$edge->[0]} eq $edge->[1] ) {
881 # # $variables->{'color'} = 'red';
882 # $variables->{'weight'} = 3.0;
883 # }
30ddc24c 884
508fd430 885 # EXPERIMENTAL: make edge width reflect no. of witnesses
886 my $extrawidth = scalar( $self->path_witnesses( $edge ) ) * 0.2;
887 $variables->{'penwidth'} = $extrawidth + 0.8; # gives 1 for a single wit
888
f13b5582 889 my $varopts = _dot_attr_string( $variables );
890 $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
891 $edge->[0], $edge->[1], $varopts );
3bdec618 892 } elsif( $used{$edge->[0]} ) {
96ba0418 893 $subend{$edge->[0]} = $edge->[1];
3bdec618 894 } elsif( $used{$edge->[1]} ) {
96ba0418 895 $substart{$edge->[1]} = $edge->[0];
b22576c6 896 }
df6d9812 897 }
bed6ce83 898
899 # If we are asked to, add relationship links
900 if( exists $opts->{show_relations} ) {
901 my $filter = $opts->{show_relations}; # can be 'transposition' or 'all'
902 if( $filter eq 'transposition' ) {
903 $filter =~ qr/^transposition$/;
904 }
905 foreach my $redge ( $self->relationships ) {
906 if( $used{$redge->[0]} && $used{$redge->[1]} ) {
907 if( $filter ne 'all' ) {
908 my $rel = $self->get_relationship( $redge );
909 next unless $rel->type =~ /$filter/;
910 my $variables = {
911 arrowhead => 'none',
912 color => '#FFA14F',
913 constraint => 'false',
914 label => uc( substr( $rel->type, 0, 4 ) ),
915 penwidth => '3',
916 };
917 $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
918 $redge->[0], $redge->[1], _dot_attr_string( $variables ) );
919 }
920 }
921 }
922 }
923
3bdec618 924 # Add substitute start and end edges if necessary
925 foreach my $node ( keys %substart ) {
96ba0418 926 my $witstr = $self->_path_display_label ( $self->path_witnesses( $substart{$node}, $node ) );
f13b5582 927 my $variables = { %edge_attrs, 'label' => $witstr };
96ba0418 928 my $nrdg = $self->reading( $node );
929 if( $nrdg->has_rank && $nrdg->rank > $startrank ) {
930 # Substart is actually one lower than $startrank
931 $variables->{'minlen'} = $nrdg->rank - ( $startrank - 1 );
932 }
f13b5582 933 my $varopts = _dot_attr_string( $variables );
96ba0418 934 $dot .= "\t\"__SUBSTART__\" -> \"$node\" $varopts;\n";
3bdec618 935 }
936 foreach my $node ( keys %subend ) {
96ba0418 937 my $witstr = $self->_path_display_label ( $self->path_witnesses( $node, $subend{$node} ) );
f13b5582 938 my $variables = { %edge_attrs, 'label' => $witstr };
939 my $varopts = _dot_attr_string( $variables );
96ba0418 940 $dot .= "\t\"$node\" -> \"__SUBEND__\" $varopts;\n";
3bdec618 941 }
b365fbae 942 # HACK part 2
943 if( $STRAIGHTENHACK ) {
43c94341 944 my $endlabel = $endrank ? '__SUBEND__' : '__END__';
945 $dot .= "\t\"$endlabel\" -> \"#SILENT#\" [ color=white,penwidth=0 ];\n";
b365fbae 946 }
30ddc24c 947
df6d9812 948 $dot .= "}\n";
949 return $dot;
950}
951
f13b5582 952sub _dot_attr_string {
953 my( $hash ) = @_;
954 my @attrs;
955 foreach my $k ( sort keys %$hash ) {
956 my $v = $hash->{$k};
957 push( @attrs, $k.'="'.$v.'"' );
958 }
959 return( '[ ' . join( ', ', @attrs ) . ' ]' );
960}
961
30ddc24c 962sub _add_edge_weights {
963 my $self = shift;
964 # Walk the graph from START to END, choosing the successor node with
965 # the largest number of witness paths each time.
966 my $weighted = {};
967 my $curr = $self->start->id;
008fc8a6 968 my $ranked = $self->end->has_rank;
30ddc24c 969 while( $curr ne $self->end->id ) {
008fc8a6 970 my $rank = $ranked ? $self->reading( $curr )->rank : 0;
30ddc24c 971 my @succ = sort { $self->path_witnesses( $curr, $a )
972 <=> $self->path_witnesses( $curr, $b ) }
973 $self->sequence->successors( $curr );
974 my $next = pop @succ;
008fc8a6 975 my $nextrank = $ranked ? $self->reading( $next )->rank : 0;
30ddc24c 976 # Try to avoid lacunae in the weighted path.
008fc8a6 977 while( @succ &&
978 ( $self->reading( $next )->is_lacuna ||
979 $nextrank - $rank > 1 ) ){
30ddc24c 980 $next = pop @succ;
981 }
982 $weighted->{$curr} = $next;
983 $curr = $next;
984 }
985 return $weighted;
986}
987
027d819c 988=head2 path_witnesses( $edge )
989
990Returns the list of sigils whose witnesses are associated with the given edge.
991The edge can be passed as either an array or an arrayref of ( $source, $target ).
992
993=cut
994
3a2ebbf4 995sub path_witnesses {
996 my( $self, @edge ) = @_;
997 # If edge is an arrayref, cope.
998 if( @edge == 1 && ref( $edge[0] ) eq 'ARRAY' ) {
999 my $e = shift @edge;
1000 @edge = @$e;
1001 }
1002 my @wits = keys %{$self->sequence->get_edge_attributes( @edge )};
508fd430 1003 return @wits;
3a2ebbf4 1004}
1005
7f9f05e8 1006# Helper function. Make a display label for the given witnesses, showing a.c.
1007# witnesses only where the main witness is not also in the list.
027d819c 1008sub _path_display_label {
508fd430 1009 my $self = shift;
7f9f05e8 1010 my %wits;
1011 map { $wits{$_} = 1 } @_;
1012
1013 # If an a.c. wit is listed, remove it if the main wit is also listed.
1014 # Otherwise keep it for explicit listing.
1015 my $aclabel = $self->ac_label;
1016 my @disp_ac;
1017 foreach my $w ( sort keys %wits ) {
1018 if( $w =~ /^(.*)\Q$aclabel\E$/ ) {
1019 if( exists $wits{$1} ) {
1020 delete $wits{$w};
1021 } else {
1022 push( @disp_ac, $w );
1023 }
1024 }
1025 }
1026
1027 # See if we are in a majority situation.
8f9cab7b 1028 my $maj = scalar( $self->tradition->witnesses ) * 0.6;
1ff82d4f 1029 $maj = $maj > 5 ? $maj : 5;
7f9f05e8 1030 if( scalar keys %wits > $maj ) {
1031 unshift( @disp_ac, 'majority' );
1032 return join( ', ', @disp_ac );
8f9cab7b 1033 } else {
7f9f05e8 1034 return join( ', ', sort keys %wits );
8f9cab7b 1035 }
1036}
1dd07bda 1037
bf6e338d 1038=head2 readings_at_rank( $rank )
1dd07bda 1039
bf6e338d 1040Returns a list of readings at a given rank, taken from the alignment table.
1dd07bda 1041
1042=cut
1043
bf6e338d 1044sub readings_at_rank {
1dd07bda 1045 my( $self, $rank ) = @_;
bf6e338d 1046 my $table = $self->alignment_table;
1047 # Table rank is real rank - 1.
1048 my @elements = map { $_->{'tokens'}->[$rank-1] } @{$table->{'alignment'}};
1049 my %readings;
1050 foreach my $e ( @elements ) {
1051 next unless ref( $e ) eq 'HASH';
1052 next unless exists $e->{'t'};
1053 $readings{$e->{'t'}->id} = $e->{'t'};
1054 }
1055 return values %readings;
1dd07bda 1056}
8f9cab7b 1057
4e5a7b2c 1058=head2 as_graphml
8e1394aa 1059
4e5a7b2c 1060Returns a GraphML representation of the collation. The GraphML will contain
1061two graphs. The first expresses the attributes of the readings and the witness
1062paths that link them; the second expresses the relationships that link the
1063readings. This is the native transfer format for a tradition.
8e1394aa 1064
56eefa04 1065=begin testing
1066
1067use Text::Tradition;
951ddfe8 1068use TryCatch;
56eefa04 1069
1070my $READINGS = 311;
1071my $PATHS = 361;
1072
1073my $datafile = 't/data/florilegium_tei_ps.xml';
1074my $tradition = Text::Tradition->new( 'input' => 'TEI',
1075 'name' => 'test0',
1076 'file' => $datafile,
1077 'linear' => 1 );
1078
1079ok( $tradition, "Got a tradition object" );
1080is( scalar $tradition->witnesses, 13, "Found all witnesses" );
1081ok( $tradition->collation, "Tradition has a collation" );
1082
1083my $c = $tradition->collation;
1084is( scalar $c->readings, $READINGS, "Collation has all readings" );
1085is( scalar $c->paths, $PATHS, "Collation has all paths" );
1086is( scalar $c->relationships, 0, "Collation has all relationships" );
1087
1088# Add a few relationships
1089$c->add_relationship( 'w123', 'w125', { 'type' => 'collated' } );
1090$c->add_relationship( 'w193', 'w196', { 'type' => 'collated' } );
1091$c->add_relationship( 'w257', 'w262', { 'type' => 'transposition' } );
1092
1093# Now write it to GraphML and parse it again.
1094
1095my $graphml = $c->as_graphml;
1096my $st = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml );
1097is( scalar $st->collation->readings, $READINGS, "Reparsed collation has all readings" );
1098is( scalar $st->collation->paths, $PATHS, "Reparsed collation has all paths" );
1099is( scalar $st->collation->relationships, 3, "Reparsed collation has new relationships" );
1100
9fef629b 1101# Now add a stemma, write to GraphML, and look at the output.
951ddfe8 1102SKIP: {
37bf09f4 1103 skip "Analysis module not present", 3 unless $tradition->can( 'add_stemma' );
951ddfe8 1104 my $stemma = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
1105 is( ref( $stemma ), 'Text::Tradition::Stemma', "Parsed dotfile into stemma" );
1106 is( $tradition->stemmata, 1, "Tradition now has the stemma" );
1107 $graphml = $c->as_graphml;
1108 like( $graphml, qr/digraph/, "Digraph declaration exists in GraphML" );
1109}
2a812726 1110
56eefa04 1111=end testing
1112
8e1394aa 1113=cut
1114
a445ce40 1115## TODO MOVE this to Tradition.pm and modularize it better
8e1394aa 1116sub as_graphml {
a30ca502 1117 my( $self, $options ) = @_;
3d14b48e 1118 $self->calculate_ranks unless $self->_graphcalc_done;
1119
a30ca502 1120 my $start = $options->{'from'}
1121 ? $self->reading( $options->{'from'} ) : $self->start;
1122 my $end = $options->{'to'}
1123 ? $self->reading( $options->{'to'} ) : $self->end;
1124 if( $start->has_rank && $end->has_rank && $end->rank < $start->rank ) {
1125 throw( 'Start node must be before end node' );
1126 }
1127 # The readings need to be ranked for this to work.
1128 $start = $self->start unless $start->has_rank;
1129 $end = $self->end unless $end->has_rank;
414cc046 1130 my $rankoffset = 0;
1131 unless( $start eq $self->start ) {
1132 $rankoffset = $start->rank - 1;
1133 }
a30ca502 1134 my %use_readings;
1135
8e1394aa 1136 # Some namespaces
1137 my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns';
1138 my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance';
1139 my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' .
910a0a6d 1140 'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
8e1394aa 1141
1142 # Create the document and root node
428bcf0b 1143 require XML::LibXML;
8e1394aa 1144 my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
1145 my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
1146 $graphml->setDocumentElement( $root );
1147 $root->setNamespace( $xsi_ns, 'xsi', 0 );
1148 $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
bbd064a9 1149
1150 # List of attribute types to save on our objects and their corresponding
1151 # GraphML types
1152 my %save_types = (
1153 'Str' => 'string',
1154 'Int' => 'int',
1155 'Bool' => 'boolean',
10e4b1ac 1156 'ReadingID' => 'string',
bbd064a9 1157 'RelationshipType' => 'string',
1158 'RelationshipScope' => 'string',
1159 );
1160
bbd064a9 1161 # Add the data keys for the graph. Include an extra key 'version' for the
1162 # GraphML output version.
e309421a 1163 my %graph_data_keys;
1164 my $gdi = 0;
bbd064a9 1165 my %graph_attributes = ( 'version' => 'string' );
1166 # Graph attributes include those of Tradition and those of Collation.
1167 my %gattr_from;
f97ef19e 1168 # TODO Use meta introspection method from duplicate_reading to do this
1169 # instead of naming custom keys.
bbd064a9 1170 my $tmeta = $self->tradition->meta;
1171 my $cmeta = $self->meta;
1172 map { $gattr_from{$_->name} = 'Tradition' } $tmeta->get_all_attributes;
1173 map { $gattr_from{$_->name} = 'Collation' } $cmeta->get_all_attributes;
1174 foreach my $attr ( ( $tmeta->get_all_attributes, $cmeta->get_all_attributes ) ) {
1175 next if $attr->name =~ /^_/;
bbd064a9 1176 next unless $save_types{$attr->type_constraint->name};
1177 $graph_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1178 }
9fef629b 1179 # Extra custom keys for complex objects that should be saved in some form.
1180 # The subroutine should return a string, or undef/empty.
951ddfe8 1181 if( $tmeta->has_method('stemmata') ) {
1182 $graph_attributes{'stemmata'} = sub {
1183 my @stemstrs;
1184 map { push( @stemstrs, $_->editable( {linesep => ''} ) ) }
1185 $self->tradition->stemmata;
1186 join( "\n", @stemstrs );
1187 };
1188 }
1189
8943ff68 1190 if( $tmeta->has_method('user') ) {
1191 $graph_attributes{'user'} = sub {
1192 $self->tradition->user ? $self->tradition->user->id : undef
1193 };
1194 }
bbd064a9 1195
1196 foreach my $datum ( sort keys %graph_attributes ) {
e309421a 1197 $graph_data_keys{$datum} = 'dg'.$gdi++;
1198 my $key = $root->addNewChild( $graphml_ns, 'key' );
9fef629b 1199 my $dtype = ref( $graph_attributes{$datum} ) ? 'string'
1200 : $graph_attributes{$datum};
e309421a 1201 $key->setAttribute( 'attr.name', $datum );
9fef629b 1202 $key->setAttribute( 'attr.type', $dtype );
e309421a 1203 $key->setAttribute( 'for', 'graph' );
1204 $key->setAttribute( 'id', $graph_data_keys{$datum} );
1205 }
f6066bac 1206
bbd064a9 1207 # Add the data keys for reading nodes
1208 my %reading_attributes;
1209 my $rmeta = Text::Tradition::Collation::Reading->meta;
1210 foreach my $attr( $rmeta->get_all_attributes ) {
1211 next if $attr->name =~ /^_/;
bbd064a9 1212 next unless $save_types{$attr->type_constraint->name};
1213 $reading_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1214 }
a445ce40 1215 if( $self->start->does('Text::Tradition::Morphology' ) ) {
1216 # Extra custom key for the reading morphology
1217 $reading_attributes{'lexemes'} = 'string';
1218 }
7cd9f181 1219
ef9d481f 1220 my %node_data_keys;
1221 my $ndi = 0;
bbd064a9 1222 foreach my $datum ( sort keys %reading_attributes ) {
910a0a6d 1223 $node_data_keys{$datum} = 'dn'.$ndi++;
1224 my $key = $root->addNewChild( $graphml_ns, 'key' );
1225 $key->setAttribute( 'attr.name', $datum );
bbd064a9 1226 $key->setAttribute( 'attr.type', $reading_attributes{$datum} );
910a0a6d 1227 $key->setAttribute( 'for', 'node' );
1228 $key->setAttribute( 'id', $node_data_keys{$datum} );
8e1394aa 1229 }
1230
bbd064a9 1231 # Add the data keys for edges, that is, paths and relationships. Path
1232 # data does not come from a Moose class so is here manually.
ef9d481f 1233 my $edi = 0;
1234 my %edge_data_keys;
bbd064a9 1235 my %edge_attributes = (
3a2ebbf4 1236 witness => 'string', # ID/label for a path
3a2ebbf4 1237 extra => 'boolean', # Path key
3a2ebbf4 1238 );
bbd064a9 1239 my @path_attributes = keys %edge_attributes; # track our manual additions
1240 my $pmeta = Text::Tradition::Collation::Relationship->meta;
1241 foreach my $attr( $pmeta->get_all_attributes ) {
1242 next if $attr->name =~ /^_/;
bbd064a9 1243 next unless $save_types{$attr->type_constraint->name};
1244 $edge_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1245 }
1246 foreach my $datum ( sort keys %edge_attributes ) {
3a2ebbf4 1247 $edge_data_keys{$datum} = 'de'.$edi++;
910a0a6d 1248 my $key = $root->addNewChild( $graphml_ns, 'key' );
3a2ebbf4 1249 $key->setAttribute( 'attr.name', $datum );
bbd064a9 1250 $key->setAttribute( 'attr.type', $edge_attributes{$datum} );
910a0a6d 1251 $key->setAttribute( 'for', 'edge' );
3a2ebbf4 1252 $key->setAttribute( 'id', $edge_data_keys{$datum} );
8e1394aa 1253 }
3a2ebbf4 1254
cc31ebaa 1255 # Add the collation graph itself. First, sanitize the name to a valid XML ID.
1256 my $xmlidname = $self->tradition->name;
1257 $xmlidname =~ s/(?!$xml10_namechar_rx)./_/g;
1258 if( $xmlidname !~ /^$xml10_namestartchar_rx/ ) {
1259 $xmlidname = '_'.$xmlidname;
1260 }
2c669bca 1261 my $sgraph = $root->addNewChild( $graphml_ns, 'graph' );
1262 $sgraph->setAttribute( 'edgedefault', 'directed' );
cc31ebaa 1263 $sgraph->setAttribute( 'id', $xmlidname );
2c669bca 1264 $sgraph->setAttribute( 'parse.edgeids', 'canonical' );
cc31ebaa 1265 $sgraph->setAttribute( 'parse.edges', 0 ); # fill in later
2c669bca 1266 $sgraph->setAttribute( 'parse.nodeids', 'canonical' );
cc31ebaa 1267 $sgraph->setAttribute( 'parse.nodes', 0 ); # fill in later
2c669bca 1268 $sgraph->setAttribute( 'parse.order', 'nodesfirst' );
22222af9 1269
2a812726 1270 # Tradition/collation attribute data
bbd064a9 1271 foreach my $datum ( keys %graph_attributes ) {
1272 my $value;
1273 if( $datum eq 'version' ) {
2a812726 1274 $value = '3.2';
9fef629b 1275 } elsif( ref( $graph_attributes{$datum} ) ) {
1276 my $sub = $graph_attributes{$datum};
1277 $value = &$sub();
bbd064a9 1278 } elsif( $gattr_from{$datum} eq 'Tradition' ) {
1279 $value = $self->tradition->$datum;
1280 } else {
1281 $value = $self->$datum;
1282 }
2c669bca 1283 _add_graphml_data( $sgraph, $graph_data_keys{$datum}, $value );
e309421a 1284 }
8e1394aa 1285
1286 my $node_ctr = 0;
1287 my %node_hash;
22222af9 1288 # Add our readings to the graph
3a2ebbf4 1289 foreach my $n ( sort { $a->id cmp $b->id } $self->readings ) {
a30ca502 1290 next if $n->has_rank && $n ne $self->start && $n ne $self->end &&
1291 ( $n->rank < $start->rank || $n->rank > $end->rank );
1292 $use_readings{$n->id} = 1;
2c669bca 1293 # Add to the main graph
1294 my $node_el = $sgraph->addNewChild( $graphml_ns, 'node' );
910a0a6d 1295 my $node_xmlid = 'n' . $node_ctr++;
3a2ebbf4 1296 $node_hash{ $n->id } = $node_xmlid;
910a0a6d 1297 $node_el->setAttribute( 'id', $node_xmlid );
bbd064a9 1298 foreach my $d ( keys %reading_attributes ) {
255875b8 1299 my $nval = $n->$d;
7cd9f181 1300 # Custom serialization
1301 if( $d eq 'lexemes' ) {
1302 # If nval is a true value, we have lexemes so we need to
1303 # serialize them. Otherwise set nval to undef so that the
1304 # key is excluded from this reading.
1305 $nval = $nval ? $n->_serialize_lexemes : undef;
18c64d55 1306 } elsif( $d eq 'normal_form' && $n->normal_form eq $n->text ) {
1307 $nval = undef;
7cd9f181 1308 }
cc31ebaa 1309 if( $rankoffset && $d eq 'rank' && $n ne $self->start ) {
414cc046 1310 # Adjust the ranks within the subgraph.
cc31ebaa 1311 $nval = $n eq $self->end ? $end->rank - $rankoffset + 1
1312 : $nval - $rankoffset;
414cc046 1313 }
255875b8 1314 _add_graphml_data( $node_el, $node_data_keys{$d}, $nval )
1315 if defined $nval;
1316 }
b15511bf 1317 }
1318
2c669bca 1319 # Add the path edges to the sequence graph
df6d9812 1320 my $edge_ctr = 0;
3a2ebbf4 1321 foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->sequence->edges() ) {
1322 # We add an edge in the graphml for every witness in $e.
a30ca502 1323 next unless( $use_readings{$e->[0]} || $use_readings{$e->[1]} );
1324 my @edge_wits = sort $self->path_witnesses( $e );
cc31ebaa 1325 $e->[0] = $self->start->id unless $use_readings{$e->[0]};
1326 $e->[1] = $self->end->id unless $use_readings{$e->[1]};
1327 # Skip any path from start to end; that witness is not in the subgraph.
1328 next if ( $e->[0] eq $self->start->id && $e->[1] eq $self->end->id );
a30ca502 1329 foreach my $wit ( @edge_wits ) {
3a2ebbf4 1330 my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
1331 $node_hash{ $e->[0] },
1332 $node_hash{ $e->[1] } );
2c669bca 1333 my $edge_el = $sgraph->addNewChild( $graphml_ns, 'edge' );
3a2ebbf4 1334 $edge_el->setAttribute( 'source', $from );
1335 $edge_el->setAttribute( 'target', $to );
1336 $edge_el->setAttribute( 'id', $id );
3a2ebbf4 1337
1338 # It's a witness path, so add the witness
1339 my $base = $wit;
1340 my $key = $edge_data_keys{'witness'};
1341 # Is this an ante-corr witness?
1342 my $aclabel = $self->ac_label;
1343 if( $wit =~ /^(.*)\Q$aclabel\E$/ ) {
1344 # Keep the base witness
1345 $base = $1;
1346 # ...and record that this is an 'extra' reading path
1347 _add_graphml_data( $edge_el, $edge_data_keys{'extra'}, $aclabel );
1348 }
1349 _add_graphml_data( $edge_el, $edge_data_keys{'witness'}, $base );
1350 }
1351 }
1352
cc31ebaa 1353 # Report the actual number of nodes and edges that went in
1354 $sgraph->setAttribute( 'parse.edges', $edge_ctr );
1355 $sgraph->setAttribute( 'parse.nodes', $node_ctr );
1356
22222af9 1357 # Add the relationship graph to the XML
bbd064a9 1358 map { delete $edge_data_keys{$_} } @path_attributes;
826d8773 1359 $self->relations->_as_graphml( $graphml_ns, $root, \%node_hash,
1360 $node_data_keys{'id'}, \%edge_data_keys );
8e1394aa 1361
94c00c71 1362 # Save and return the thing
1363 my $result = decode_utf8( $graphml->toString(1) );
94c00c71 1364 return $result;
df6d9812 1365}
1366
b15511bf 1367sub _add_graphml_data {
1368 my( $el, $key, $value ) = @_;
b15511bf 1369 return unless defined $value;
c9bf3dbf 1370 my $data_el = $el->addNewChild( $el->namespaceURI, 'data' );
b15511bf 1371 $data_el->setAttribute( 'key', $key );
1372 $data_el->appendText( $value );
8e1394aa 1373}
1374
4e5a7b2c 1375=head2 as_csv
910a0a6d 1376
1377Returns a CSV alignment table representation of the collation graph, one
2c669bca 1378row per witness (or witness uncorrected.)
910a0a6d 1379
1380=cut
1381
1382sub as_csv {
3a2ebbf4 1383 my( $self ) = @_;
1dd07bda 1384 my $table = $self->alignment_table;
82fa4d57 1385 my $csv = Text::CSV->new( { binary => 1, quote_null => 0 } );
910a0a6d 1386 my @result;
2c669bca 1387 # Make the header row
1388 $csv->combine( map { $_->{'witness'} } @{$table->{'alignment'}} );
1389 push( @result, decode_utf8( $csv->string ) );
1390 # Make the rest of the rows
1391 foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
566f4595 1392 my @rowobjs = map { $_->{'tokens'}->[$idx] } @{$table->{'alignment'}};
1dd07bda 1393 my @row = map { $_ ? $_->{'t'}->text : $_ } @rowobjs;
2c669bca 1394 $csv->combine( @row );
910a0a6d 1395 push( @result, decode_utf8( $csv->string ) );
1396 }
3a2ebbf4 1397 return join( "\n", @result );
910a0a6d 1398}
1399
248276a2 1400=head2 alignment_table
2c669bca 1401
566f4595 1402Return a reference to an alignment table, in a slightly enhanced CollateX
1403format which looks like this:
1404
1405 $table = { alignment => [ { witness => "SIGIL",
4e5a7b2c 1406 tokens => [ { t => "TEXT" }, ... ] },
566f4595 1407 { witness => "SIG2",
4e5a7b2c 1408 tokens => [ { t => "TEXT" }, ... ] },
566f4595 1409 ... ],
1410 length => TEXTLEN };
1411
2c669bca 1412=cut
9f3ba6f7 1413
1dd07bda 1414sub alignment_table {
1415 my( $self ) = @_;
1dd07bda 1416 return $self->cached_table if $self->has_cached_table;
1417
0ecb975c 1418 # Make sure we can do this
1419 throw( "Need a linear graph in order to make an alignment table" )
1420 unless $self->linear;
b39e7cb5 1421 $self->calculate_ranks()
1422 unless $self->_graphcalc_done && $self->end->has_rank;
1423
2c669bca 1424 my $table = { 'alignment' => [], 'length' => $self->end->rank - 1 };
3a2ebbf4 1425 my @all_pos = ( 1 .. $self->end->rank - 1 );
68454b71 1426 foreach my $wit ( sort { $a->sigil cmp $b->sigil } $self->tradition->witnesses ) {
6771a1b1 1427 # say STDERR "Making witness row(s) for " . $wit->sigil;
1f7aa795 1428 my @wit_path = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
1dd07bda 1429 my @row = _make_witness_row( \@wit_path, \@all_pos );
bed6ce83 1430 my $witobj = { 'witness' => $wit->sigil, 'tokens' => \@row };
1431 $witobj->{'identifier'} = $wit->identifier if $wit->identifier;
1432 push( @{$table->{'alignment'}}, $witobj );
1f7aa795 1433 if( $wit->is_layered ) {
1434 my @wit_ac_path = $self->reading_sequence( $self->start, $self->end,
861c3e27 1435 $wit->sigil.$self->ac_label );
1dd07bda 1436 my @ac_row = _make_witness_row( \@wit_ac_path, \@all_pos );
bed6ce83 1437 my $witacobj = { 'witness' => $wit->sigil.$self->ac_label,
1438 'tokens' => \@ac_row };
1439 $witacobj->{'identifier'} = $wit->identifier if $wit->identifier;
1440 push( @{$table->{'alignment'}}, $witacobj );
910a0a6d 1441 }
1442 }
1dd07bda 1443 $self->cached_table( $table );
1444 return $table;
910a0a6d 1445}
1446
1447sub _make_witness_row {
1dd07bda 1448 my( $path, $positions ) = @_;
910a0a6d 1449 my %char_hash;
1450 map { $char_hash{$_} = undef } @$positions;
2c669bca 1451 my $debug = 0;
910a0a6d 1452 foreach my $rdg ( @$path ) {
6771a1b1 1453 say STDERR "rank " . $rdg->rank if $debug;
1454 # say STDERR "No rank for " . $rdg->id unless defined $rdg->rank;
1dd07bda 1455 $char_hash{$rdg->rank} = { 't' => $rdg };
910a0a6d 1456 }
1457 my @row = map { $char_hash{$_} } @$positions;
eca16057 1458 # Fill in lacuna markers for undef spots in the row
1459 my $last_el = shift @row;
1460 my @filled_row = ( $last_el );
1461 foreach my $el ( @row ) {
0e476982 1462 # If we are using node reference, make the lacuna node appear many times
1463 # in the table. If not, use the lacuna tag.
1dd07bda 1464 if( $last_el && $last_el->{'t'}->is_lacuna && !defined $el ) {
1465 $el = $last_el;
eca16057 1466 }
1467 push( @filled_row, $el );
1468 $last_el = $el;
1469 }
1470 return @filled_row;
910a0a6d 1471}
1472
248276a2 1473
4e5a7b2c 1474=head1 NAVIGATION METHODS
910a0a6d 1475
4e5a7b2c 1476=head2 reading_sequence( $first, $last, $sigil, $backup )
e2902068 1477
1478Returns the ordered list of readings, starting with $first and ending
4e5a7b2c 1479with $last, for the witness given in $sigil. If a $backup sigil is
1480specified (e.g. when walking a layered witness), it will be used wherever
1481no $sigil path exists. If there is a base text reading, that will be
1482used wherever no path exists for $sigil or $backup.
e2902068 1483
1484=cut
1485
910a0a6d 1486# TODO Think about returning some lazy-eval iterator.
b0b4421a 1487# TODO Get rid of backup; we should know from what witness is whether we need it.
910a0a6d 1488
e2902068 1489sub reading_sequence {
861c3e27 1490 my( $self, $start, $end, $witness ) = @_;
e2902068 1491
930ff666 1492 $witness = $self->baselabel unless $witness;
e2902068 1493 my @readings = ( $start );
1494 my %seen;
1495 my $n = $start;
3a2ebbf4 1496 while( $n && $n->id ne $end->id ) {
1497 if( exists( $seen{$n->id} ) ) {
63778331 1498 throw( "Detected loop for $witness at " . $n->id );
910a0a6d 1499 }
3a2ebbf4 1500 $seen{$n->id} = 1;
910a0a6d 1501
861c3e27 1502 my $next = $self->next_reading( $n, $witness );
44771cf2 1503 unless( $next ) {
63778331 1504 throw( "Did not find any path for $witness from reading " . $n->id );
44771cf2 1505 }
910a0a6d 1506 push( @readings, $next );
1507 $n = $next;
e2902068 1508 }
1509 # Check that the last reading is our end reading.
1510 my $last = $readings[$#readings];
63778331 1511 throw( "Last reading found from " . $start->text .
1512 " for witness $witness is not the end!" ) # TODO do we get this far?
3a2ebbf4 1513 unless $last->id eq $end->id;
e2902068 1514
1515 return @readings;
1516}
1517
4e5a7b2c 1518=head2 next_reading( $reading, $sigil );
8e1394aa 1519
4a8828f0 1520Returns the reading that follows the given reading along the given witness
930ff666 1521path.
8e1394aa 1522
1523=cut
1524
4a8828f0 1525sub next_reading {
e2902068 1526 # Return the successor via the corresponding path.
8e1394aa 1527 my $self = shift;
3a2ebbf4 1528 my $answer = $self->_find_linked_reading( 'next', @_ );
2c669bca 1529 return undef unless $answer;
3a2ebbf4 1530 return $self->reading( $answer );
8e1394aa 1531}
1532
4e5a7b2c 1533=head2 prior_reading( $reading, $sigil )
8e1394aa 1534
4a8828f0 1535Returns the reading that precedes the given reading along the given witness
930ff666 1536path.
8e1394aa 1537
1538=cut
1539
4a8828f0 1540sub prior_reading {
e2902068 1541 # Return the predecessor via the corresponding path.
8e1394aa 1542 my $self = shift;
3a2ebbf4 1543 my $answer = $self->_find_linked_reading( 'prior', @_ );
1544 return $self->reading( $answer );
8e1394aa 1545}
1546
4a8828f0 1547sub _find_linked_reading {
861c3e27 1548 my( $self, $direction, $node, $path ) = @_;
1549
1550 # Get a backup if we are dealing with a layered witness
1551 my $alt_path;
1552 my $aclabel = $self->ac_label;
1553 if( $path && $path =~ /^(.*)\Q$aclabel\E$/ ) {
1554 $alt_path = $1;
1555 }
1556
e2902068 1557 my @linked_paths = $direction eq 'next'
3a2ebbf4 1558 ? $self->sequence->edges_from( $node )
1559 : $self->sequence->edges_to( $node );
e2902068 1560 return undef unless scalar( @linked_paths );
8e1394aa 1561
e2902068 1562 # We have to find the linked path that contains all of the
1563 # witnesses supplied in $path.
1564 my( @path_wits, @alt_path_wits );
4e5a7b2c 1565 @path_wits = sort( $self->_witnesses_of_label( $path ) ) if $path;
1566 @alt_path_wits = sort( $self->_witnesses_of_label( $alt_path ) ) if $alt_path;
e2902068 1567 my $base_le;
1568 my $alt_le;
1569 foreach my $le ( @linked_paths ) {
3a2ebbf4 1570 if( $self->sequence->has_edge_attribute( @$le, $self->baselabel ) ) {
910a0a6d 1571 $base_le = $le;
910a0a6d 1572 }
508fd430 1573 my @le_wits = sort $self->path_witnesses( $le );
3a2ebbf4 1574 if( _is_within( \@path_wits, \@le_wits ) ) {
1575 # This is the right path.
1576 return $direction eq 'next' ? $le->[1] : $le->[0];
1577 } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) {
1578 $alt_le = $le;
1579 }
8e1394aa 1580 }
e2902068 1581 # Got this far? Return the alternate path if it exists.
3a2ebbf4 1582 return $direction eq 'next' ? $alt_le->[1] : $alt_le->[0]
910a0a6d 1583 if $alt_le;
e2902068 1584
1585 # Got this far? Return the base path if it exists.
3a2ebbf4 1586 return $direction eq 'next' ? $base_le->[1] : $base_le->[0]
910a0a6d 1587 if $base_le;
e2902068 1588
1589 # Got this far? We have no appropriate path.
2c669bca 1590 warn "Could not find $direction node from " . $node->id
910a0a6d 1591 . " along path $path";
8e1394aa 1592 return undef;
1593}
1594
4a8828f0 1595# Some set logic.
1596sub _is_within {
1597 my( $set1, $set2 ) = @_;
7854e12e 1598 my $ret = @$set1; # will be 0, i.e. false, if set1 is empty
4a8828f0 1599 foreach my $el ( @$set1 ) {
910a0a6d 1600 $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
4a8828f0 1601 }
1602 return $ret;
1603}
1604
4e5a7b2c 1605# Return the string that joins together a list of witnesses for
1606# display on a single path.
1607sub _witnesses_of_label {
1608 my( $self, $label ) = @_;
1609 my $regex = $self->wit_list_separator;
1610 my @answer = split( /\Q$regex\E/, $label );
1611 return @answer;
b0b4421a 1612}
1613
d4b75f44 1614=head2 common_readings
1615
1616Returns the list of common readings in the graph (i.e. those readings that are
1617shared by all non-lacunose witnesses.)
1618
1619=cut
1620
1621sub common_readings {
1622 my $self = shift;
1623 my @common = grep { $_->is_common } $self->readings;
1624 return @common;
1625}
1626
fae52efd 1627=head2 path_text( $sigil, [, $start, $end ] )
b0b4421a 1628
1629Returns the text of a witness (plus its backup, if we are using a layer)
1630as stored in the collation. The text is returned as a string, where the
1631individual readings are joined with spaces and the meta-readings (e.g.
1632lacunae) are omitted. Optional specification of $start and $end allows
1633the generation of a subset of the witness text.
4e5a7b2c 1634
b0b4421a 1635=cut
1636
1637sub path_text {
861c3e27 1638 my( $self, $wit, $start, $end ) = @_;
b0b4421a 1639 $start = $self->start unless $start;
1640 $end = $self->end unless $end;
861c3e27 1641 my @path = grep { !$_->is_meta } $self->reading_sequence( $start, $end, $wit );
629e27b0 1642 my $pathtext = '';
1643 my $last;
1644 foreach my $r ( @path ) {
6ad2ce78 1645 unless ( $r->join_prior || !$last || $last->join_next ) {
1646 $pathtext .= ' ';
1647 }
1648 $pathtext .= $r->text;
629e27b0 1649 $last = $r;
1650 }
1651 return $pathtext;
b0b4421a 1652}
4e5a7b2c 1653
1654=head1 INITIALIZATION METHODS
1655
1656These are mostly for use by parsers.
1657
1658=head2 make_witness_path( $witness )
1659
1660Link the array of readings contained in $witness->path (and in
1661$witness->uncorrected_path if it exists) into collation paths.
1662Clear out the arrays when finished.
de51424a 1663
4e5a7b2c 1664=head2 make_witness_paths
1665
1666Call make_witness_path for all witnesses in the tradition.
1667
1668=cut
930ff666 1669
7e450e44 1670# For use when a collation is constructed from a base text and an apparatus.
1671# We have the sequences of readings and just need to add path edges.
1f7aa795 1672# When we are done, clear out the witness path attributes, as they are no
1673# longer needed.
1674# TODO Find a way to replace the witness path attributes with encapsulated functions?
e2902068 1675
6a222840 1676sub make_witness_paths {
1677 my( $self ) = @_;
910a0a6d 1678 foreach my $wit ( $self->tradition->witnesses ) {
6771a1b1 1679 # say STDERR "Making path for " . $wit->sigil;
910a0a6d 1680 $self->make_witness_path( $wit );
7854e12e 1681 }
7854e12e 1682}
1683
6a222840 1684sub make_witness_path {
7854e12e 1685 my( $self, $wit ) = @_;
1686 my @chain = @{$wit->path};
15d2d3df 1687 my $sig = $wit->sigil;
fae52efd 1688 # Add start and end if necessary
1689 unshift( @chain, $self->start ) unless $chain[0] eq $self->start;
1690 push( @chain, $self->end ) unless $chain[-1] eq $self->end;
7854e12e 1691 foreach my $idx ( 0 .. $#chain-1 ) {
910a0a6d 1692 $self->add_path( $chain[$idx], $chain[$idx+1], $sig );
7854e12e 1693 }
1f7aa795 1694 if( $wit->is_layered ) {
d9e873d0 1695 @chain = @{$wit->uncorrected_path};
fae52efd 1696 unshift( @chain, $self->start ) unless $chain[0] eq $self->start;
1697 push( @chain, $self->end ) unless $chain[-1] eq $self->end;
d9e873d0 1698 foreach my $idx( 0 .. $#chain-1 ) {
1699 my $source = $chain[$idx];
1700 my $target = $chain[$idx+1];
1701 $self->add_path( $source, $target, $sig.$self->ac_label )
1702 unless $self->has_path( $source, $target, $sig );
1703 }
15d2d3df 1704 }
1f7aa795 1705 $wit->clear_path;
1706 $wit->clear_uncorrected_path;
e2902068 1707}
1708
4e5a7b2c 1709=head2 calculate_ranks
1710
1711Calculate the reading ranks (that is, their aligned positions relative
1712to each other) for the graph. This can only be called on linear collations.
1713
b365fbae 1714=begin testing
1715
1716use Text::Tradition;
1717
1718my $cxfile = 't/data/Collatex-16.xml';
1719my $t = Text::Tradition->new(
1720 'name' => 'inline',
1721 'input' => 'CollateX',
1722 'file' => $cxfile,
1723 );
1724my $c = $t->collation;
1725
1726# Make an svg
bfcbcecb 1727my $table = $c->alignment_table;
1728ok( $c->has_cached_table, "Alignment table was cached" );
1729is( $c->alignment_table, $table, "Cached table returned upon second call" );
b365fbae 1730$c->calculate_ranks;
bfcbcecb 1731is( $c->alignment_table, $table, "Cached table retained with no rank change" );
864ee4bf 1732$c->add_relationship( 'n13', 'n23', { type => 'repetition' } );
1733is( $c->alignment_table, $table, "Alignment table unchanged after non-colo relationship add" );
1734$c->add_relationship( 'n24', 'n23', { type => 'spelling' } );
1735isnt( $c->alignment_table, $table, "Alignment table changed after colo relationship add" );
b365fbae 1736
1737=end testing
1738
4e5a7b2c 1739=cut
1740
910a0a6d 1741sub calculate_ranks {
1742 my $self = shift;
b365fbae 1743 # Save the existing ranks, in case we need to invalidate the cached SVG.
1744 my %existing_ranks;
ac4d7ac2 1745 map { $existing_ranks{$_} = $_->rank } $self->readings;
359944f7 1746
1747 # Do the rankings based on the relationship equivalence graph, starting
1748 # with the start node.
56772e8c 1749 my ( $node_ranks, $rank_nodes ) = $self->relations->equivalence_ranks();
1750
910a0a6d 1751 # Transfer our rankings from the topological graph to the real one.
1752 foreach my $r ( $self->readings ) {
cecbe56d 1753 if( defined $node_ranks->{$self->equivalence( $r->id )} ) {
359944f7 1754 $r->rank( $node_ranks->{$self->equivalence( $r->id )} );
67da8d6c 1755 } else {
63778331 1756 # Die. Find the last rank we calculated.
359944f7 1757 my @all_defined = sort { ( $node_ranks->{$self->equivalence( $a->id )}||-1 )
1758 <=> ( $node_ranks->{$self->equivalence( $b->id )}||-1 ) }
63778331 1759 $self->readings;
1760 my $last = pop @all_defined;
1761 throw( "Ranks not calculated after $last - do you have a cycle in the graph?" );
67da8d6c 1762 }
de51424a 1763 }
bfcbcecb 1764 # Do we need to invalidate the cached data?
be3af600 1765 if( $self->has_cached_table ) {
b365fbae 1766 foreach my $r ( $self->readings ) {
7c293912 1767 next if defined( $existing_ranks{$r} )
1768 && $existing_ranks{$r} == $r->rank;
c1915ab9 1769 # Something has changed, so clear the cache
bfcbcecb 1770 $self->_clear_cache;
c1915ab9 1771 # ...and recalculate the common readings.
1772 $self->calculate_common_readings();
b365fbae 1773 last;
1774 }
1775 }
c1915ab9 1776 # The graph calculation information is now up to date.
1777 $self->_graphcalc_done(1);
8e1394aa 1778}
3a1f2523 1779
c1915ab9 1780sub _clear_cache {
1781 my $self = shift;
c1915ab9 1782 $self->wipe_table if $self->has_cached_table;
1783}
1784
1785
4e5a7b2c 1786=head2 flatten_ranks
1787
1788A convenience method for parsing collation data. Searches the graph for readings
1789with the same text at the same rank, and merges any that are found.
1790
1791=cut
1792
0e476982 1793sub flatten_ranks {
4ef65ab4 1794 my ( $self, %args ) = shift;
0e476982 1795 my %unique_rank_rdg;
bf6e338d 1796 my $changed;
4ef65ab4 1797 foreach my $p ( $self->identical_readings( %args ) ) {
1798 # say STDERR "Combining readings at same rank: @$p";
1799 $changed = 1;
1800 $self->merge_readings( @$p );
1801 # TODO see if this now makes a common point.
7a0956c1 1802 }
1803 # If we merged readings, the ranks are still fine but the alignment
1804 # table is wrong. Wipe it.
1805 $self->wipe_table() if $changed;
1806}
1807
1808=head2 identical_readings
1809=head2 identical_readings( start => $startnode, end => $endnode )
1810=head2 identical_readings( startrank => $startrank, endrank => $endrank )
1811
1812Goes through the graph identifying all pairs of readings that appear to be
1813identical, and therefore able to be merged into a single reading. Returns the
1814relevant identical pairs. Can be restricted to run over only a part of the
1815graph, specified either by node or by rank.
1816
1817=cut
1818
1819sub identical_readings {
1820 my ( $self, %args ) = @_;
1821 # Find where we should start and end.
1822 my $startrank = $args{startrank} || 0;
1823 if( $args{start} ) {
1824 throw( "Starting reading has no rank" ) unless $self->reading( $args{start} )
1825 && $self->reading( $args{start} )->has_rank;
1826 $startrank = $self->reading( $args{start} )->rank;
1827 }
1828 my $endrank = $args{endrank} || $self->end->rank;
1829 if( $args{end} ) {
1830 throw( "Ending reading has no rank" ) unless $self->reading( $args{end} )
1831 && $self->reading( $args{end} )->has_rank;
3c234eb6 1832 $endrank = $self->reading( $args{end} )->rank;
7a0956c1 1833 }
1834
1835 # Make sure the ranks are correct.
1836 unless( $self->_graphcalc_done ) {
1837 $self->calculate_ranks;
1838 }
1839 # Go through the readings looking for duplicates.
1840 my %unique_rank_rdg;
1841 my @pairs;
0e476982 1842 foreach my $rdg ( $self->readings ) {
1843 next unless $rdg->has_rank;
7a0956c1 1844 my $rk = $rdg->rank;
1845 next if $rk > $endrank || $rk < $startrank;
1846 my $key = $rk . "||" . $rdg->text;
0e476982 1847 if( exists $unique_rank_rdg{$key} ) {
07e6765f 1848 # Make sure they don't have different grammatical forms
1849 my $ur = $unique_rank_rdg{$key};
a445ce40 1850 if( $rdg->is_identical( $ur ) ) {
7a0956c1 1851 push( @pairs, [ $ur, $rdg ] );
07e6765f 1852 }
0e476982 1853 } else {
1854 $unique_rank_rdg{$key} = $rdg;
1855 }
7a0956c1 1856 }
1857
1858 return @pairs;
0e476982 1859}
4633f9e4 1860
1861
d4b75f44 1862=head2 calculate_common_readings
1863
1864Goes through the graph identifying the readings that appear in every witness
1865(apart from those with lacunae at that spot.) Marks them as common and returns
1866the list.
1867
1868=begin testing
1869
1870use Text::Tradition;
1871
1872my $cxfile = 't/data/Collatex-16.xml';
1873my $t = Text::Tradition->new(
1874 'name' => 'inline',
1875 'input' => 'CollateX',
1876 'file' => $cxfile,
1877 );
1878my $c = $t->collation;
1879
1880my @common = $c->calculate_common_readings();
1881is( scalar @common, 8, "Found correct number of common readings" );
1882my @marked = sort $c->common_readings();
1883is( scalar @common, 8, "All common readings got marked as such" );
679f17e1 1884my @expected = qw/ n1 n11 n16 n19 n20 n5 n6 n7 /;
d4b75f44 1885is_deeply( \@marked, \@expected, "Found correct list of common readings" );
1886
1887=end testing
1888
1889=cut
1890
1891sub calculate_common_readings {
1892 my $self = shift;
1893 my @common;
c1915ab9 1894 map { $_->is_common( 0 ) } $self->readings;
1895 # Implicitly calls calculate_ranks
1dd07bda 1896 my $table = $self->alignment_table;
d4b75f44 1897 foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
7f52eac8 1898 my @row = map { $_->{'tokens'}->[$idx]
1899 ? $_->{'tokens'}->[$idx]->{'t'} : '' }
1900 @{$table->{'alignment'}};
d4b75f44 1901 my %hash;
1902 foreach my $r ( @row ) {
1903 if( $r ) {
1904 $hash{$r->id} = $r unless $r->is_meta;
1905 } else {
1906 $hash{'UNDEF'} = $r;
1907 }
1908 }
1909 if( keys %hash == 1 && !exists $hash{'UNDEF'} ) {
1910 my( $r ) = values %hash;
1911 $r->is_common( 1 );
1912 push( @common, $r );
1913 }
1914 }
1915 return @common;
1916}
1917
861c3e27 1918=head2 text_from_paths
1919
1920Calculate the text array for all witnesses from the path, for later consistency
1921checking. Only to be used if there is no non-graph-based way to know the
1922original texts.
1923
1924=cut
1925
1926sub text_from_paths {
1927 my $self = shift;
1928 foreach my $wit ( $self->tradition->witnesses ) {
5164a6f0 1929 my @readings = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
1930 my @text;
1931 foreach my $r ( @readings ) {
1932 next if $r->is_meta;
1933 push( @text, $r->text );
1934 }
861c3e27 1935 $wit->text( \@text );
1936 if( $wit->is_layered ) {
5164a6f0 1937 my @ucrdgs = $self->reading_sequence( $self->start, $self->end,
1938 $wit->sigil.$self->ac_label );
1939 my @uctext;
1940 foreach my $r ( @ucrdgs ) {
1941 next if $r->is_meta;
1942 push( @uctext, $r->text );
1943 }
1944 $wit->layertext( \@uctext );
861c3e27 1945 }
1946 }
1947}
0e476982 1948
4e5a7b2c 1949=head1 UTILITY FUNCTIONS
1950
1951=head2 common_predecessor( $reading_a, $reading_b )
8e1394aa 1952
4e5a7b2c 1953Find the last reading that occurs in sequence before both the given readings.
414cc046 1954At the very least this should be $self->start.
4e5a7b2c 1955
1956=head2 common_successor( $reading_a, $reading_b )
1957
1958Find the first reading that occurs in sequence after both the given readings.
414cc046 1959At the very least this should be $self->end.
4e5a7b2c 1960
22222af9 1961=begin testing
1962
1963use Text::Tradition;
1964
1965my $cxfile = 't/data/Collatex-16.xml';
1966my $t = Text::Tradition->new(
1967 'name' => 'inline',
1968 'input' => 'CollateX',
1969 'file' => $cxfile,
1970 );
1971my $c = $t->collation;
1972
679f17e1 1973is( $c->common_predecessor( 'n24', 'n23' )->id,
22222af9 1974 'n20', "Found correct common predecessor" );
679f17e1 1975is( $c->common_successor( 'n24', 'n23' )->id,
10e4b1ac 1976 '__END__', "Found correct common successor" );
22222af9 1977
4e5a7b2c 1978is( $c->common_predecessor( 'n19', 'n17' )->id,
22222af9 1979 'n16', "Found correct common predecessor for readings on same path" );
679f17e1 1980is( $c->common_successor( 'n21', 'n10' )->id,
10e4b1ac 1981 '__END__', "Found correct common successor for readings on same path" );
22222af9 1982
1983=end testing
1984
1985=cut
1986
1987## Return the closest reading that is a predecessor of both the given readings.
1988sub common_predecessor {
1989 my $self = shift;
4e5a7b2c 1990 my( $r1, $r2 ) = $self->_objectify_args( @_ );
027d819c 1991 return $self->_common_in_path( $r1, $r2, 'predecessors' );
22222af9 1992}
1993
1994sub common_successor {
1995 my $self = shift;
4e5a7b2c 1996 my( $r1, $r2 ) = $self->_objectify_args( @_ );
027d819c 1997 return $self->_common_in_path( $r1, $r2, 'successors' );
22222af9 1998}
1999
414cc046 2000
2001# TODO think about how to do this without ranks...
027d819c 2002sub _common_in_path {
22222af9 2003 my( $self, $r1, $r2, $dir ) = @_;
414cc046 2004 my $iter = $self->end->rank;
22222af9 2005 my @candidates;
414cc046 2006 my @last_r1 = ( $r1 );
2007 my @last_r2 = ( $r2 );
2008 # my %all_seen = ( $r1 => 'r1', $r2 => 'r2' );
22222af9 2009 my %all_seen;
6771a1b1 2010 # say STDERR "Finding common $dir for $r1, $r2";
22222af9 2011 while( !@candidates ) {
414cc046 2012 last unless $iter--; # Avoid looping infinitely
2013 # Iterate separately down the graph from r1 and r2
2014 my( @new_lc1, @new_lc2 );
2015 foreach my $lc ( @last_r1 ) {
2016 foreach my $p ( $lc->$dir ) {
2017 if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r1' ) {
6771a1b1 2018 # say STDERR "Path candidate $p from $lc";
414cc046 2019 push( @candidates, $p );
002e3600 2020 } elsif( !$all_seen{$p->id} ) {
414cc046 2021 $all_seen{$p->id} = 'r1';
2022 push( @new_lc1, $p );
2023 }
2024 }
2025 }
2026 foreach my $lc ( @last_r2 ) {
22222af9 2027 foreach my $p ( $lc->$dir ) {
414cc046 2028 if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r2' ) {
6771a1b1 2029 # say STDERR "Path candidate $p from $lc";
22222af9 2030 push( @candidates, $p );
002e3600 2031 } elsif( !$all_seen{$p->id} ) {
414cc046 2032 $all_seen{$p->id} = 'r2';
2033 push( @new_lc2, $p );
22222af9 2034 }
2035 }
2036 }
414cc046 2037 @last_r1 = @new_lc1;
2038 @last_r2 = @new_lc2;
22222af9 2039 }
2040 my @answer = sort { $a->rank <=> $b->rank } @candidates;
2041 return $dir eq 'predecessors' ? pop( @answer ) : shift ( @answer );
2042}
2043
63778331 2044sub throw {
2045 Text::Tradition::Error->throw(
2046 'ident' => 'Collation error',
2047 'message' => $_[0],
2048 );
2049}
2050
dd3b58b0 2051no Moose;
2052__PACKAGE__->meta->make_immutable;
e867486f 2053
a445ce40 2054=head1 BUGS/TODO
2055
2056=over
2057
2058=item * Rework XML serialization in a more modular way
2059
2060=back
2061
027d819c 2062=head1 LICENSE
e867486f 2063
027d819c 2064This package is free software and is provided "as is" without express
2065or implied warranty. You can redistribute it and/or modify it under
2066the same terms as Perl itself.
e867486f 2067
027d819c 2068=head1 AUTHOR
e867486f 2069
027d819c 2070Tara L Andrews E<lt>aurum@cpan.orgE<gt>