fix change detection bug
[scpubgit/stemmatology.git] / lib / Text / Tradition / Collation.pm
CommitLineData
dd3b58b0 1package Text::Tradition::Collation;
d047cd52 2
910a0a6d 3use Encode qw( decode_utf8 );
4use File::Temp;
bfcbcecb 5use File::Which;
c9bf3dbf 6use Graph;
8e1394aa 7use IPC::Run qw( run binary );
910a0a6d 8use Text::CSV_XS;
b15511bf 9use Text::Tradition::Collation::Reading;
22222af9 10use Text::Tradition::Collation::RelationshipStore;
63778331 11use Text::Tradition::Error;
df6d9812 12use XML::LibXML;
a344ffcf 13use XML::LibXML::XPathContext;
dd3b58b0 14use Moose;
15
3a2ebbf4 16has 'sequence' => (
d047cd52 17 is => 'ro',
3a2ebbf4 18 isa => 'Graph',
19 default => sub { Graph->new() },
d047cd52 20 handles => {
3a2ebbf4 21 paths => 'edges',
d047cd52 22 },
d047cd52 23 );
3a2ebbf4 24
25has 'relations' => (
26 is => 'ro',
22222af9 27 isa => 'Text::Tradition::Collation::RelationshipStore',
28 handles => {
29 relationships => 'relationships',
30 related_readings => 'related_readings',
202ccb18 31 get_relationship => 'get_relationship',
ee801e17 32 del_relationship => 'del_relationship',
22222af9 33 },
34 writer => '_set_relations',
3a2ebbf4 35 );
dd3b58b0 36
3a2ebbf4 37has 'tradition' => (
38 is => 'ro',
d047cd52 39 isa => 'Text::Tradition',
8d9a1cd8 40 weak_ref => 1,
d047cd52 41 );
dd3b58b0 42
3a2ebbf4 43has 'readings' => (
44 isa => 'HashRef[Text::Tradition::Collation::Reading]',
45 traits => ['Hash'],
46 handles => {
47 reading => 'get',
48 _add_reading => 'set',
49 del_reading => 'delete',
50 has_reading => 'exists',
51 readings => 'values',
52 },
53 default => sub { {} },
54 );
910a0a6d 55
4a8828f0 56has 'wit_list_separator' => (
7854e12e 57 is => 'rw',
58 isa => 'Str',
59 default => ', ',
60 );
61
62has 'baselabel' => (
63 is => 'rw',
64 isa => 'Str',
65 default => 'base text',
66 );
4a8828f0 67
15d2d3df 68has 'linear' => (
69 is => 'rw',
70 isa => 'Bool',
71 default => 1,
72 );
c84275ff 73
ef9d481f 74has 'ac_label' => (
75 is => 'rw',
76 isa => 'Str',
77 default => ' (a.c.)',
78 );
3a2ebbf4 79
80has 'start' => (
81 is => 'ro',
82 isa => 'Text::Tradition::Collation::Reading',
83 writer => '_set_start',
84 weak_ref => 1,
85 );
86
87has 'end' => (
88 is => 'ro',
89 isa => 'Text::Tradition::Collation::Reading',
90 writer => '_set_end',
91 weak_ref => 1,
92 );
b365fbae 93
94has 'cached_svg' => (
95 is => 'rw',
96 isa => 'Str',
97 predicate => 'has_cached_svg',
98 clearer => 'wipe_svg',
99 );
1dd07bda 100
101has 'cached_table' => (
102 is => 'rw',
103 isa => 'HashRef',
104 predicate => 'has_cached_table',
105 clearer => 'wipe_table',
106 );
c1915ab9 107
108has '_graphcalc_done' => (
109 is => 'rw',
110 isa => 'Bool',
111 default => undef,
112 );
1f563ac3 113
4e5a7b2c 114=head1 NAME
115
116Text::Tradition::Collation - a software model for a text collation
117
118=head1 SYNOPSIS
119
120 use Text::Tradition;
121 my $t = Text::Tradition->new(
122 'name' => 'this is a text',
123 'input' => 'TEI',
124 'file' => '/path/to/tei_parallel_seg_file.xml' );
125
126 my $c = $t->collation;
127 my @readings = $c->readings;
128 my @paths = $c->paths;
129 my @relationships = $c->relationships;
130
131 my $svg_variant_graph = $t->collation->as_svg();
132
133=head1 DESCRIPTION
134
135Text::Tradition is a library for representation and analysis of collated
136texts, particularly medieval ones. The Collation is the central feature of
137a Tradition, where the text, its sequence of readings, and its relationships
138between readings are actually kept.
139
140=head1 CONSTRUCTOR
141
142=head2 new
143
144The constructor. Takes a hash or hashref of the following arguments:
145
146=over
147
148=item * tradition - The Text::Tradition object to which the collation
149belongs. Required.
150
151=item * linear - Whether the collation should be linear; that is, whether
152transposed readings should be treated as two linked readings rather than one,
153and therefore whether the collation graph is acyclic. Defaults to true.
154
4e5a7b2c 155=item * baselabel - The default label for the path taken by a base text
156(if any). Defaults to 'base text'.
157
158=item * wit_list_separator - The string to join a list of witnesses for
159purposes of making labels in display graphs. Defaults to ', '.
160
161=item * ac_label - The extra label to tack onto a witness sigil when
162representing another layer of path for the given witness - that is, when
163a text has more than one possible reading due to scribal corrections or
164the like. Defaults to ' (a.c.)'.
165
166=back
167
168=head1 ACCESSORS
169
170=head2 tradition
171
172=head2 linear
173
4e5a7b2c 174=head2 wit_list_separator
175
176=head2 baselabel
177
178=head2 ac_label
179
180Simple accessors for collation attributes.
181
182=head2 start
183
184The meta-reading at the start of every witness path.
185
186=head2 end
187
188The meta-reading at the end of every witness path.
189
190=head2 readings
191
192Returns all Reading objects in the graph.
193
194=head2 reading( $id )
195
196Returns the Reading object corresponding to the given ID.
197
198=head2 add_reading( $reading_args )
199
200Adds a new reading object to the collation.
201See L<Text::Tradition::Collation::Reading> for the available arguments.
202
203=head2 del_reading( $object_or_id )
204
205Removes the given reading from the collation, implicitly removing its
206paths and relationships.
207
208=head2 merge_readings( $main, $second )
209
210Merges the $second reading into the $main one.
211The arguments may be either readings or reading IDs.
212
213=head2 has_reading( $id )
214
215Predicate to see whether a given reading ID is in the graph.
216
217=head2 reading_witnesses( $object_or_id )
218
219Returns a list of sigils whose witnesses contain the reading.
220
221=head2 paths
222
223Returns all reading paths within the document - that is, all edges in the
224collation graph. Each path is an arrayref of [ $source, $target ] reading IDs.
225
226=head2 add_path( $source, $target, $sigil )
227
228Links the given readings in the collation in sequence, under the given witness
229sigil. The readings may be specified by object or ID.
230
231=head2 del_path( $source, $target, $sigil )
232
233Links the given readings in the collation in sequence, under the given witness
234sigil. The readings may be specified by object or ID.
235
236=head2 has_path( $source, $target );
237
238Returns true if the two readings are linked in sequence in any witness.
239The readings may be specified by object or ID.
240
241=head2 relationships
242
243Returns all Relationship objects in the collation.
244
245=head2 add_relationship( $reading, $other_reading, $options )
246
247Adds a new relationship of the type given in $options between the two readings,
248which may be specified by object or ID. Returns a value of ( $status, @vectors)
249where $status is true on success, and @vectors is a list of relationship edges
250that were ultimately added.
251See L<Text::Tradition::Collation::Relationship> for the available options.
252
253=cut
dd3b58b0 254
d047cd52 255sub BUILD {
3a2ebbf4 256 my $self = shift;
22222af9 257 $self->_set_relations( Text::Tradition::Collation::RelationshipStore->new( 'collation' => $self ) );
3a2ebbf4 258 $self->_set_start( $self->add_reading( { 'collation' => $self, 'is_start' => 1 } ) );
259 $self->_set_end( $self->add_reading( { 'collation' => $self, 'is_end' => 1 } ) );
d047cd52 260}
784877d9 261
3a2ebbf4 262### Reading construct/destruct functions
263
264sub add_reading {
265 my( $self, $reading ) = @_;
266 unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
267 my %args = %$reading;
268 $reading = Text::Tradition::Collation::Reading->new(
269 'collation' => $self,
270 %args );
271 }
272 # First check to see if a reading with this ID exists.
273 if( $self->reading( $reading->id ) ) {
63778331 274 throw( "Collation already has a reading with id " . $reading->id );
3a2ebbf4 275 }
c1915ab9 276 $self->_graphcalc_done(0);
3a2ebbf4 277 $self->_add_reading( $reading->id => $reading );
278 # Once the reading has been added, put it in both graphs.
279 $self->sequence->add_vertex( $reading->id );
22222af9 280 $self->relations->add_reading( $reading->id );
3a2ebbf4 281 return $reading;
eca16057 282};
283
3a2ebbf4 284around del_reading => sub {
285 my $orig = shift;
286 my $self = shift;
287 my $arg = shift;
288
289 if( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
290 $arg = $arg->id;
291 }
3a2ebbf4 292 # Remove the reading from the graphs.
c1915ab9 293 $self->_graphcalc_done(0);
3a2ebbf4 294 $self->sequence->delete_vertex( $arg );
22222af9 295 $self->relations->delete_reading( $arg );
3a2ebbf4 296
297 # Carry on.
298 $self->$orig( $arg );
299};
7854e12e 300
3a2ebbf4 301# merge_readings( $main, $to_be_deleted );
7854e12e 302
3a2ebbf4 303sub merge_readings {
304 my $self = shift;
305
306 # We only need the IDs for adding paths to the graph, not the reading
307 # objects themselves.
49d4f2ac 308 my( $kept, $deleted, $combine_char ) = $self->_stringify_args( @_ );
c1915ab9 309 $self->_graphcalc_done(0);
3a2ebbf4 310
311 # The kept reading should inherit the paths and the relationships
312 # of the deleted reading.
313 foreach my $path ( $self->sequence->edges_at( $deleted ) ) {
314 my @vector = ( $kept );
315 push( @vector, $path->[1] ) if $path->[0] eq $deleted;
316 unshift( @vector, $path->[0] ) if $path->[1] eq $deleted;
49d4f2ac 317 next if $vector[0] eq $vector[1]; # Don't add a self loop
3a2ebbf4 318 my %wits = %{$self->sequence->get_edge_attributes( @$path )};
319 $self->sequence->add_edge( @vector );
320 my $fwits = $self->sequence->get_edge_attributes( @vector );
321 @wits{keys %$fwits} = values %$fwits;
322 $self->sequence->set_edge_attributes( @vector, \%wits );
323 }
22222af9 324 $self->relations->merge_readings( $kept, $deleted, $combine_char );
3a2ebbf4 325
326 # Do the deletion deed.
49d4f2ac 327 if( $combine_char ) {
328 my $kept_obj = $self->reading( $kept );
329 my $new_text = join( $combine_char, $kept_obj->text,
330 $self->reading( $deleted )->text );
331 $kept_obj->alter_text( $new_text );
332 }
3a2ebbf4 333 $self->del_reading( $deleted );
334}
7854e12e 335
3265b0ce 336
3a2ebbf4 337# Helper function for manipulating the graph.
338sub _stringify_args {
339 my( $self, $first, $second, $arg ) = @_;
340 $first = $first->id
341 if ref( $first ) eq 'Text::Tradition::Collation::Reading';
342 $second = $second->id
343 if ref( $second ) eq 'Text::Tradition::Collation::Reading';
344 return( $first, $second, $arg );
345}
df6d9812 346
4e5a7b2c 347# Helper function for manipulating the graph.
348sub _objectify_args {
349 my( $self, $first, $second, $arg ) = @_;
350 $first = $self->reading( $first )
351 unless ref( $first ) eq 'Text::Tradition::Collation::Reading';
352 $second = $self->reading( $second )
353 unless ref( $second ) eq 'Text::Tradition::Collation::Reading';
354 return( $first, $second, $arg );
355}
3a2ebbf4 356### Path logic
357
358sub add_path {
359 my $self = shift;
360
361 # We only need the IDs for adding paths to the graph, not the reading
362 # objects themselves.
363 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
364
c1915ab9 365 $self->_graphcalc_done(0);
3a2ebbf4 366 # Connect the readings
367 $self->sequence->add_edge( $source, $target );
368 # Note the witness in question
369 $self->sequence->set_edge_attribute( $source, $target, $wit, 1 );
b15511bf 370};
371
3a2ebbf4 372sub del_path {
373 my $self = shift;
49d4f2ac 374 my @args;
375 if( ref( $_[0] ) eq 'ARRAY' ) {
376 my $e = shift @_;
377 @args = ( @$e, @_ );
378 } else {
379 @args = @_;
380 }
3a2ebbf4 381
382 # We only need the IDs for adding paths to the graph, not the reading
383 # objects themselves.
49d4f2ac 384 my( $source, $target, $wit ) = $self->_stringify_args( @args );
3a2ebbf4 385
c1915ab9 386 $self->_graphcalc_done(0);
3a2ebbf4 387 if( $self->sequence->has_edge_attribute( $source, $target, $wit ) ) {
49d4f2ac 388 $self->sequence->delete_edge_attribute( $source, $target, $wit );
3a2ebbf4 389 }
390 unless( keys %{$self->sequence->get_edge_attributes( $source, $target )} ) {
391 $self->sequence->delete_edge( $source, $target );
392 }
784877d9 393}
394
3a2ebbf4 395
15d2d3df 396# Extra graph-alike utility
397sub has_path {
3a2ebbf4 398 my $self = shift;
399 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
400 return undef unless $self->sequence->has_edge( $source, $target );
401 return $self->sequence->has_edge_attribute( $source, $target, $wit );
b15511bf 402}
403
4e5a7b2c 404=head2 clear_witness( @sigil_list )
3a2ebbf4 405
4e5a7b2c 406Clear the given witnesses out of the collation entirely, removing references
407to them in paths, and removing readings that belong only to them. Should only
408be called via $tradition->del_witness.
3a2ebbf4 409
410=cut
411
4e5a7b2c 412sub clear_witness {
413 my( $self, @sigils ) = @_;
414
c1915ab9 415 $self->_graphcalc_done(0);
4e5a7b2c 416 # Clear the witness(es) out of the paths
417 foreach my $e ( $self->paths ) {
418 foreach my $sig ( @sigils ) {
419 $self->del_path( $e, $sig );
420 }
421 }
422
423 # Clear out the newly unused readings
424 foreach my $r ( $self->readings ) {
425 unless( $self->reading_witnesses( $r ) ) {
426 $self->del_reading( $r );
427 }
428 }
429}
3a2ebbf4 430
431sub add_relationship {
432 my $self = shift;
22222af9 433 my( $source, $target, $opts ) = $self->_stringify_args( @_ );
63778331 434 my( @vectors ) = $self->relations->add_relationship( $source,
64ae6270 435 $self->reading( $source ), $target, $self->reading( $target ), $opts );
c1915ab9 436 $self->_graphcalc_done(0);
63778331 437 return @vectors;
22222af9 438}
ef9d481f 439
ca6e6095 440around qw/ get_relationship del_relationship / => sub {
441 my $orig = shift;
442 my $self = shift;
443 my @args = @_;
444 if( @args == 1 && ref( $args[0] ) eq 'ARRAY' ) {
445 @args = @{$_[0]};
446 }
447 my( $source, $target ) = $self->_stringify_args( @args );
448 $self->$orig( $source, $target );
449};
450
22222af9 451=head2 reading_witnesses( $reading )
910a0a6d 452
22222af9 453Return a list of sigils corresponding to the witnesses in which the reading appears.
3265b0ce 454
22222af9 455=cut
1d310495 456
1d310495 457sub reading_witnesses {
458 my( $self, $reading ) = @_;
459 # We need only check either the incoming or the outgoing edges; I have
96dc90ec 460 # arbitrarily chosen "incoming". Thus, special-case the start node.
461 if( $reading eq $self->start ) {
462 return map { $_->sigil } $self->tradition->witnesses;
463 }
1d310495 464 my %all_witnesses;
465 foreach my $e ( $self->sequence->edges_to( $reading ) ) {
466 my $wits = $self->sequence->get_edge_attributes( @$e );
467 @all_witnesses{ keys %$wits } = 1;
468 }
c12bb878 469 my $acstr = $self->ac_label;
470 foreach my $acwit ( grep { $_ =~ s/^(.*)\Q$acstr\E$/$1/ } keys %all_witnesses ) {
471 delete $all_witnesses{$acwit.$acstr} if exists $all_witnesses{$acwit};
472 }
1d310495 473 return keys %all_witnesses;
910a0a6d 474}
475
4e5a7b2c 476=head1 OUTPUT METHODS
8e1394aa 477
0ecb975c 478=head2 as_svg( \%options )
8e1394aa 479
0068967c 480Returns an SVG string that represents the graph, via as_dot and graphviz.
bfcbcecb 481See as_dot for a list of options. Must have GraphViz (dot) installed to run.
8e1394aa 482
483=cut
484
485sub as_svg {
0ecb975c 486 my( $self, $opts ) = @_;
bfcbcecb 487 throw( "Need GraphViz installed to output SVG" )
488 unless File::Which::which( 'dot' );
e247aad1 489 my $want_subgraph = exists $opts->{'from'} || exists $opts->{'to'};
c1915ab9 490 $self->calculate_ranks() unless $self->_graphcalc_done;
e247aad1 491 if( !$self->has_cached_svg || $opts->{'recalc'} || $want_subgraph ) {
b365fbae 492 my @cmd = qw/dot -Tsvg/;
493 my( $svg, $err );
494 my $dotfile = File::Temp->new();
495 ## USE FOR DEBUGGING
496 # $dotfile->unlink_on_destroy(0);
497 binmode $dotfile, ':utf8';
498 print $dotfile $self->as_dot( $opts );
499 push( @cmd, $dotfile->filename );
500 run( \@cmd, ">", binary(), \$svg );
e247aad1 501 $svg = decode_utf8( $svg );
502 $self->cached_svg( $svg ) unless $want_subgraph;
503 return $svg;
504 } else {
505 return $self->cached_svg;
b365fbae 506 }
8e1394aa 507}
508
b22576c6 509
0ecb975c 510=head2 as_dot( \%options )
b22576c6 511
0ecb975c 512Returns a string that is the collation graph expressed in dot
513(i.e. GraphViz) format. Options include:
b22576c6 514
0ecb975c 515=over 4
b22576c6 516
0ecb975c 517=item * from
b22576c6 518
0ecb975c 519=item * to
df6d9812 520
0ecb975c 521=item * color_common
522
523=back
df6d9812 524
525=cut
526
527sub as_dot {
0ecb975c 528 my( $self, $opts ) = @_;
529 my $startrank = $opts->{'from'} if $opts;
530 my $endrank = $opts->{'to'} if $opts;
531 my $color_common = $opts->{'color_common'} if $opts;
b365fbae 532 my $STRAIGHTENHACK = !$startrank && !$endrank && $self->end->rank
533 && $self->end->rank > 100;
534
b22576c6 535 # Check the arguments
536 if( $startrank ) {
537 return if $endrank && $startrank > $endrank;
538 return if $startrank > $self->end->rank;
539 }
540 if( defined $endrank ) {
541 return if $endrank < 0;
f1b3b33a 542 $endrank = undef if $endrank == $self->end->rank;
b22576c6 543 }
544
67da8d6c 545 my $graph_name = $self->tradition->name;
546 $graph_name =~ s/[^\w\s]//g;
547 $graph_name = join( '_', split( /\s+/, $graph_name ) );
f13b5582 548
549 my %graph_attrs = (
550 'rankdir' => 'LR',
551 'bgcolor' => 'none',
552 );
553 my %node_attrs = (
b8990398 554 'fontsize' => 14,
f13b5582 555 'fillcolor' => 'white',
556 'style' => 'filled',
557 'shape' => 'ellipse'
558 );
559 my %edge_attrs = (
560 'arrowhead' => 'open',
561 'color' => '#000000',
562 'fontcolor' => '#000000',
563 );
564
67da8d6c 565 my $dot = sprintf( "digraph %s {\n", $graph_name );
f13b5582 566 $dot .= "\tgraph " . _dot_attr_string( \%graph_attrs ) . ";\n";
567 $dot .= "\tnode " . _dot_attr_string( \%node_attrs ) . ";\n";
df6d9812 568
b22576c6 569 # Output substitute start/end readings if necessary
570 if( $startrank ) {
571 $dot .= "\t\"#SUBSTART#\" [ label=\"...\" ];\n";
572 }
573 if( $endrank ) {
574 $dot .= "\t\"#SUBEND#\" [ label=\"...\" ];\n";
575 }
b365fbae 576 if( $STRAIGHTENHACK ) {
577 ## HACK part 1
578 $dot .= "\tsubgraph { rank=same \"#START#\" \"#SILENT#\" }\n";
579 $dot .= "\t\"#SILENT#\" [ shape=diamond,color=white,penwidth=0,label=\"\" ];"
580 }
b22576c6 581 my %used; # Keep track of the readings that actually appear in the graph
30ddc24c 582 # Sort the readings by rank if we have ranks; this speeds layout.
583 my @all_readings = $self->end->has_rank
584 ? sort { $a->rank <=> $b->rank } $self->readings
585 : $self->readings;
4633f9e4 586 # TODO Refrain from outputting lacuna nodes - just grey out the edges.
30ddc24c 587 foreach my $reading ( @all_readings ) {
b22576c6 588 # Only output readings within our rank range.
589 next if $startrank && $reading->rank < $startrank;
590 next if $endrank && $reading->rank > $endrank;
591 $used{$reading->id} = 1;
910a0a6d 592 # Need not output nodes without separate labels
3a2ebbf4 593 next if $reading->id eq $reading->text;
d4b75f44 594 my $rattrs;
30f0df34 595 my $label = $reading->text;
629e27b0 596 $label .= '-' if $reading->join_next;
597 $label = "-$label" if $reading->join_prior;
8f9cab7b 598 $label =~ s/\"/\\\"/g;
d4b75f44 599 $rattrs->{'label'} = $label;
0ecb975c 600 $rattrs->{'fillcolor'} = '#b3f36d' if $reading->is_common && $color_common;
d4b75f44 601 $dot .= sprintf( "\t\"%s\" %s;\n", $reading->id, _dot_attr_string( $rattrs ) );
df6d9812 602 }
3a2ebbf4 603
30ddc24c 604 # Add the real edges. Need to weight one edge per rank jump, in a
605 # continuous line.
b365fbae 606 # my $weighted = $self->_add_edge_weights;
b22576c6 607 my @edges = $self->paths;
3bdec618 608 my( %substart, %subend );
b22576c6 609 foreach my $edge ( @edges ) {
610 # Do we need to output this edge?
508fd430 611 if( $used{$edge->[0]} && $used{$edge->[1]} ) {
027d819c 612 my $label = $self->_path_display_label( $self->path_witnesses( $edge ) );
f13b5582 613 my $variables = { %edge_attrs, 'label' => $label };
30ddc24c 614
b22576c6 615 # Account for the rank gap if necessary
30ddc24c 616 my $rank0 = $self->reading( $edge->[0] )->rank
617 if $self->reading( $edge->[0] )->has_rank;
618 my $rank1 = $self->reading( $edge->[1] )->rank
619 if $self->reading( $edge->[1] )->has_rank;
620 if( defined $rank0 && defined $rank1 && $rank1 - $rank0 > 1 ) {
621 $variables->{'minlen'} = $rank1 - $rank0;
622 }
623
624 # Add the calculated edge weights
b365fbae 625 # if( exists $weighted->{$edge->[0]}
e247aad1 626 # && $weighted->{$edge->[0]} eq $edge->[1] ) {
627 # # $variables->{'color'} = 'red';
628 # $variables->{'weight'} = 3.0;
629 # }
30ddc24c 630
508fd430 631 # EXPERIMENTAL: make edge width reflect no. of witnesses
632 my $extrawidth = scalar( $self->path_witnesses( $edge ) ) * 0.2;
633 $variables->{'penwidth'} = $extrawidth + 0.8; # gives 1 for a single wit
634
f13b5582 635 my $varopts = _dot_attr_string( $variables );
636 $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
637 $edge->[0], $edge->[1], $varopts );
3bdec618 638 } elsif( $used{$edge->[0]} ) {
639 $subend{$edge->[0]} = 1;
640 } elsif( $used{$edge->[1]} ) {
641 $substart{$edge->[1]} = 1;
b22576c6 642 }
df6d9812 643 }
3bdec618 644 # Add substitute start and end edges if necessary
645 foreach my $node ( keys %substart ) {
027d819c 646 my $witstr = $self->_path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) );
f13b5582 647 my $variables = { %edge_attrs, 'label' => $witstr };
648 my $varopts = _dot_attr_string( $variables );
649 $dot .= "\t\"#SUBSTART#\" -> \"$node\" $varopts;";
3bdec618 650 }
651 foreach my $node ( keys %subend ) {
027d819c 652 my $witstr = $self->_path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) );
f13b5582 653 my $variables = { %edge_attrs, 'label' => $witstr };
654 my $varopts = _dot_attr_string( $variables );
655 $dot .= "\t\"$node\" -> \"#SUBEND#\" $varopts;";
3bdec618 656 }
b365fbae 657 # HACK part 2
658 if( $STRAIGHTENHACK ) {
659 $dot .= "\t\"#END#\" -> \"#SILENT#\" [ color=white,penwidth=0 ];\n";
660 }
30ddc24c 661
df6d9812 662 $dot .= "}\n";
663 return $dot;
664}
665
f13b5582 666sub _dot_attr_string {
667 my( $hash ) = @_;
668 my @attrs;
669 foreach my $k ( sort keys %$hash ) {
670 my $v = $hash->{$k};
671 push( @attrs, $k.'="'.$v.'"' );
672 }
673 return( '[ ' . join( ', ', @attrs ) . ' ]' );
674}
675
30ddc24c 676sub _add_edge_weights {
677 my $self = shift;
678 # Walk the graph from START to END, choosing the successor node with
679 # the largest number of witness paths each time.
680 my $weighted = {};
681 my $curr = $self->start->id;
008fc8a6 682 my $ranked = $self->end->has_rank;
30ddc24c 683 while( $curr ne $self->end->id ) {
008fc8a6 684 my $rank = $ranked ? $self->reading( $curr )->rank : 0;
30ddc24c 685 my @succ = sort { $self->path_witnesses( $curr, $a )
686 <=> $self->path_witnesses( $curr, $b ) }
687 $self->sequence->successors( $curr );
688 my $next = pop @succ;
008fc8a6 689 my $nextrank = $ranked ? $self->reading( $next )->rank : 0;
30ddc24c 690 # Try to avoid lacunae in the weighted path.
008fc8a6 691 while( @succ &&
692 ( $self->reading( $next )->is_lacuna ||
693 $nextrank - $rank > 1 ) ){
30ddc24c 694 $next = pop @succ;
695 }
696 $weighted->{$curr} = $next;
697 $curr = $next;
698 }
699 return $weighted;
700}
701
027d819c 702=head2 path_witnesses( $edge )
703
704Returns the list of sigils whose witnesses are associated with the given edge.
705The edge can be passed as either an array or an arrayref of ( $source, $target ).
706
707=cut
708
3a2ebbf4 709sub path_witnesses {
710 my( $self, @edge ) = @_;
711 # If edge is an arrayref, cope.
712 if( @edge == 1 && ref( $edge[0] ) eq 'ARRAY' ) {
713 my $e = shift @edge;
714 @edge = @$e;
715 }
716 my @wits = keys %{$self->sequence->get_edge_attributes( @edge )};
508fd430 717 return @wits;
3a2ebbf4 718}
719
027d819c 720sub _path_display_label {
508fd430 721 my $self = shift;
722 my @wits = sort @_;
8f9cab7b 723 my $maj = scalar( $self->tradition->witnesses ) * 0.6;
724 if( scalar @wits > $maj ) {
f13b5582 725 # TODO break out a.c. wits
8f9cab7b 726 return 'majority';
727 } else {
728 return join( ', ', @wits );
729 }
730}
1dd07bda 731
732=head2 witnesses_at_rank
733
734Returns a list of witnesses that are not lacunose, for a given rank.
735
736=cut
737
738sub witnesses_at_rank {
739 my( $self, $rank ) = @_;
740}
8f9cab7b 741
4e5a7b2c 742=head2 as_graphml
8e1394aa 743
4e5a7b2c 744Returns a GraphML representation of the collation. The GraphML will contain
745two graphs. The first expresses the attributes of the readings and the witness
746paths that link them; the second expresses the relationships that link the
747readings. This is the native transfer format for a tradition.
8e1394aa 748
56eefa04 749=begin testing
750
751use Text::Tradition;
752
753my $READINGS = 311;
754my $PATHS = 361;
755
756my $datafile = 't/data/florilegium_tei_ps.xml';
757my $tradition = Text::Tradition->new( 'input' => 'TEI',
758 'name' => 'test0',
759 'file' => $datafile,
760 'linear' => 1 );
761
762ok( $tradition, "Got a tradition object" );
763is( scalar $tradition->witnesses, 13, "Found all witnesses" );
764ok( $tradition->collation, "Tradition has a collation" );
765
766my $c = $tradition->collation;
767is( scalar $c->readings, $READINGS, "Collation has all readings" );
768is( scalar $c->paths, $PATHS, "Collation has all paths" );
769is( scalar $c->relationships, 0, "Collation has all relationships" );
770
771# Add a few relationships
772$c->add_relationship( 'w123', 'w125', { 'type' => 'collated' } );
773$c->add_relationship( 'w193', 'w196', { 'type' => 'collated' } );
774$c->add_relationship( 'w257', 'w262', { 'type' => 'transposition' } );
775
776# Now write it to GraphML and parse it again.
777
778my $graphml = $c->as_graphml;
779my $st = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml );
780is( scalar $st->collation->readings, $READINGS, "Reparsed collation has all readings" );
781is( scalar $st->collation->paths, $PATHS, "Reparsed collation has all paths" );
782is( scalar $st->collation->relationships, 3, "Reparsed collation has new relationships" );
783
784=end testing
785
8e1394aa 786=cut
787
788sub as_graphml {
3a2ebbf4 789 my( $self ) = @_;
3d14b48e 790 $self->calculate_ranks unless $self->_graphcalc_done;
791
8e1394aa 792 # Some namespaces
793 my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns';
794 my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance';
795 my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' .
910a0a6d 796 'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
8e1394aa 797
798 # Create the document and root node
799 my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
800 my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
801 $graphml->setDocumentElement( $root );
802 $root->setNamespace( $xsi_ns, 'xsi', 0 );
803 $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
804
e309421a 805 # Add the data keys for the graph
806 my %graph_data_keys;
807 my $gdi = 0;
1d310495 808 my @graph_attributes = qw/ version wit_list_separator baselabel linear ac_label /;
e309421a 809 foreach my $datum ( @graph_attributes ) {
810 $graph_data_keys{$datum} = 'dg'.$gdi++;
811 my $key = $root->addNewChild( $graphml_ns, 'key' );
812 $key->setAttribute( 'attr.name', $datum );
813 $key->setAttribute( 'attr.type', $key eq 'linear' ? 'boolean' : 'string' );
814 $key->setAttribute( 'for', 'graph' );
815 $key->setAttribute( 'id', $graph_data_keys{$datum} );
816 }
f6066bac 817
8e1394aa 818 # Add the data keys for nodes
ef9d481f 819 my %node_data_keys;
820 my $ndi = 0;
3a2ebbf4 821 my %node_data = (
822 id => 'string',
255875b8 823 text => 'string',
3a2ebbf4 824 rank => 'string',
825 is_start => 'boolean',
826 is_end => 'boolean',
827 is_lacuna => 'boolean',
629e27b0 828 is_common => 'boolean',
829 join_prior => 'boolean',
830 join_next => 'boolean',
3a2ebbf4 831 );
832 foreach my $datum ( keys %node_data ) {
910a0a6d 833 $node_data_keys{$datum} = 'dn'.$ndi++;
834 my $key = $root->addNewChild( $graphml_ns, 'key' );
835 $key->setAttribute( 'attr.name', $datum );
3a2ebbf4 836 $key->setAttribute( 'attr.type', $node_data{$datum} );
910a0a6d 837 $key->setAttribute( 'for', 'node' );
838 $key->setAttribute( 'id', $node_data_keys{$datum} );
8e1394aa 839 }
840
df6d9812 841 # Add the data keys for edges, i.e. witnesses
ef9d481f 842 my $edi = 0;
843 my %edge_data_keys;
3a2ebbf4 844 my %edge_data = (
f523c7a8 845 class => 'string', # Class, deprecated soon
3a2ebbf4 846 witness => 'string', # ID/label for a path
847 relationship => 'string', # ID/label for a relationship
848 extra => 'boolean', # Path key
c84275ff 849 scope => 'string', # Relationship key
3d14b48e 850 annotation => 'string', # Relationship key
3a2ebbf4 851 non_correctable => 'boolean', # Relationship key
852 non_independent => 'boolean', # Relationship key
853 );
854 foreach my $datum ( keys %edge_data ) {
855 $edge_data_keys{$datum} = 'de'.$edi++;
910a0a6d 856 my $key = $root->addNewChild( $graphml_ns, 'key' );
3a2ebbf4 857 $key->setAttribute( 'attr.name', $datum );
858 $key->setAttribute( 'attr.type', $edge_data{$datum} );
910a0a6d 859 $key->setAttribute( 'for', 'edge' );
3a2ebbf4 860 $key->setAttribute( 'id', $edge_data_keys{$datum} );
8e1394aa 861 }
3a2ebbf4 862
22222af9 863 # Add the collation graph itself
2c669bca 864 my $sgraph = $root->addNewChild( $graphml_ns, 'graph' );
865 $sgraph->setAttribute( 'edgedefault', 'directed' );
866 $sgraph->setAttribute( 'id', $self->tradition->name );
867 $sgraph->setAttribute( 'parse.edgeids', 'canonical' );
868 $sgraph->setAttribute( 'parse.edges', scalar($self->paths) );
869 $sgraph->setAttribute( 'parse.nodeids', 'canonical' );
870 $sgraph->setAttribute( 'parse.nodes', scalar($self->readings) );
871 $sgraph->setAttribute( 'parse.order', 'nodesfirst' );
22222af9 872
e309421a 873 # Collation attribute data
874 foreach my $datum ( @graph_attributes ) {
2c669bca 875 my $value = $datum eq 'version' ? '3.0' : $self->$datum;
876 _add_graphml_data( $sgraph, $graph_data_keys{$datum}, $value );
e309421a 877 }
8e1394aa 878
879 my $node_ctr = 0;
880 my %node_hash;
22222af9 881 # Add our readings to the graph
3a2ebbf4 882 foreach my $n ( sort { $a->id cmp $b->id } $self->readings ) {
2c669bca 883 # Add to the main graph
884 my $node_el = $sgraph->addNewChild( $graphml_ns, 'node' );
910a0a6d 885 my $node_xmlid = 'n' . $node_ctr++;
3a2ebbf4 886 $node_hash{ $n->id } = $node_xmlid;
910a0a6d 887 $node_el->setAttribute( 'id', $node_xmlid );
255875b8 888 foreach my $d ( keys %node_data ) {
889 my $nval = $n->$d;
890 _add_graphml_data( $node_el, $node_data_keys{$d}, $nval )
891 if defined $nval;
892 }
b15511bf 893 }
894
2c669bca 895 # Add the path edges to the sequence graph
df6d9812 896 my $edge_ctr = 0;
3a2ebbf4 897 foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->sequence->edges() ) {
898 # We add an edge in the graphml for every witness in $e.
508fd430 899 foreach my $wit ( sort $self->path_witnesses( $e ) ) {
3a2ebbf4 900 my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
901 $node_hash{ $e->[0] },
902 $node_hash{ $e->[1] } );
2c669bca 903 my $edge_el = $sgraph->addNewChild( $graphml_ns, 'edge' );
3a2ebbf4 904 $edge_el->setAttribute( 'source', $from );
905 $edge_el->setAttribute( 'target', $to );
906 $edge_el->setAttribute( 'id', $id );
3a2ebbf4 907
908 # It's a witness path, so add the witness
909 my $base = $wit;
910 my $key = $edge_data_keys{'witness'};
911 # Is this an ante-corr witness?
912 my $aclabel = $self->ac_label;
913 if( $wit =~ /^(.*)\Q$aclabel\E$/ ) {
914 # Keep the base witness
915 $base = $1;
916 # ...and record that this is an 'extra' reading path
917 _add_graphml_data( $edge_el, $edge_data_keys{'extra'}, $aclabel );
918 }
919 _add_graphml_data( $edge_el, $edge_data_keys{'witness'}, $base );
f523c7a8 920 _add_graphml_data( $edge_el, $edge_data_keys{'class'}, 'path' );
3a2ebbf4 921 }
922 }
923
22222af9 924 # Add the relationship graph to the XML
027d819c 925 $self->relations->_as_graphml( $graphml_ns, $root, \%node_hash,
2626f709 926 $node_data_keys{'id'}, \%edge_data_keys );
8e1394aa 927
94c00c71 928 # Save and return the thing
929 my $result = decode_utf8( $graphml->toString(1) );
94c00c71 930 return $result;
df6d9812 931}
932
b15511bf 933sub _add_graphml_data {
934 my( $el, $key, $value ) = @_;
b15511bf 935 return unless defined $value;
c9bf3dbf 936 my $data_el = $el->addNewChild( $el->namespaceURI, 'data' );
b15511bf 937 $data_el->setAttribute( 'key', $key );
938 $data_el->appendText( $value );
8e1394aa 939}
940
4e5a7b2c 941=head2 as_csv
910a0a6d 942
943Returns a CSV alignment table representation of the collation graph, one
2c669bca 944row per witness (or witness uncorrected.)
910a0a6d 945
946=cut
947
948sub as_csv {
3a2ebbf4 949 my( $self ) = @_;
1dd07bda 950 my $table = $self->alignment_table;
910a0a6d 951 my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } );
952 my @result;
2c669bca 953 # Make the header row
954 $csv->combine( map { $_->{'witness'} } @{$table->{'alignment'}} );
955 push( @result, decode_utf8( $csv->string ) );
956 # Make the rest of the rows
957 foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
566f4595 958 my @rowobjs = map { $_->{'tokens'}->[$idx] } @{$table->{'alignment'}};
1dd07bda 959 my @row = map { $_ ? $_->{'t'}->text : $_ } @rowobjs;
2c669bca 960 $csv->combine( @row );
910a0a6d 961 push( @result, decode_utf8( $csv->string ) );
962 }
3a2ebbf4 963 return join( "\n", @result );
910a0a6d 964}
965
1dd07bda 966=head2 alignment_table( $use_refs, $include_witnesses )
2c669bca 967
566f4595 968Return a reference to an alignment table, in a slightly enhanced CollateX
969format which looks like this:
970
971 $table = { alignment => [ { witness => "SIGIL",
4e5a7b2c 972 tokens => [ { t => "TEXT" }, ... ] },
566f4595 973 { witness => "SIG2",
4e5a7b2c 974 tokens => [ { t => "TEXT" }, ... ] },
566f4595 975 ... ],
976 length => TEXTLEN };
977
978If $use_refs is set to 1, the reading object is returned in the table
979instead of READINGTEXT; if not, the text of the reading is returned.
4e5a7b2c 980
981If $include_witnesses is set to a hashref, only the witnesses whose sigil
566f4595 982keys have a true hash value will be included.
2c669bca 983
984=cut
9f3ba6f7 985
1dd07bda 986sub alignment_table {
987 my( $self ) = @_;
c1915ab9 988 $self->calculate_ranks() unless $self->_graphcalc_done;
1dd07bda 989 return $self->cached_table if $self->has_cached_table;
990
0ecb975c 991 # Make sure we can do this
992 throw( "Need a linear graph in order to make an alignment table" )
993 unless $self->linear;
994 $self->calculate_ranks unless $self->end->has_rank;
995
2c669bca 996 my $table = { 'alignment' => [], 'length' => $self->end->rank - 1 };
3a2ebbf4 997 my @all_pos = ( 1 .. $self->end->rank - 1 );
68454b71 998 foreach my $wit ( sort { $a->sigil cmp $b->sigil } $self->tradition->witnesses ) {
eca16057 999 # print STDERR "Making witness row(s) for " . $wit->sigil . "\n";
1f7aa795 1000 my @wit_path = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
1dd07bda 1001 my @row = _make_witness_row( \@wit_path, \@all_pos );
2c669bca 1002 push( @{$table->{'alignment'}},
1003 { 'witness' => $wit->sigil, 'tokens' => \@row } );
1f7aa795 1004 if( $wit->is_layered ) {
1005 my @wit_ac_path = $self->reading_sequence( $self->start, $self->end,
861c3e27 1006 $wit->sigil.$self->ac_label );
1dd07bda 1007 my @ac_row = _make_witness_row( \@wit_ac_path, \@all_pos );
2c669bca 1008 push( @{$table->{'alignment'}},
1009 { 'witness' => $wit->sigil.$self->ac_label, 'tokens' => \@ac_row } );
910a0a6d 1010 }
1011 }
1dd07bda 1012 $self->cached_table( $table );
1013 return $table;
910a0a6d 1014}
1015
1016sub _make_witness_row {
1dd07bda 1017 my( $path, $positions ) = @_;
910a0a6d 1018 my %char_hash;
1019 map { $char_hash{$_} = undef } @$positions;
2c669bca 1020 my $debug = 0;
910a0a6d 1021 foreach my $rdg ( @$path ) {
eca16057 1022 my $rtext = $rdg->text;
1023 $rtext = '#LACUNA#' if $rdg->is_lacuna;
2c669bca 1024 print STDERR "rank " . $rdg->rank . "\n" if $debug;
3a2ebbf4 1025 # print STDERR "No rank for " . $rdg->id . "\n" unless defined $rdg->rank;
1dd07bda 1026 $char_hash{$rdg->rank} = { 't' => $rdg };
910a0a6d 1027 }
1028 my @row = map { $char_hash{$_} } @$positions;
eca16057 1029 # Fill in lacuna markers for undef spots in the row
1030 my $last_el = shift @row;
1031 my @filled_row = ( $last_el );
1032 foreach my $el ( @row ) {
0e476982 1033 # If we are using node reference, make the lacuna node appear many times
1034 # in the table. If not, use the lacuna tag.
1dd07bda 1035 if( $last_el && $last_el->{'t'}->is_lacuna && !defined $el ) {
1036 $el = $last_el;
eca16057 1037 }
1038 push( @filled_row, $el );
1039 $last_el = $el;
1040 }
1041 return @filled_row;
910a0a6d 1042}
1043
4e5a7b2c 1044=head1 NAVIGATION METHODS
910a0a6d 1045
4e5a7b2c 1046=head2 reading_sequence( $first, $last, $sigil, $backup )
e2902068 1047
1048Returns the ordered list of readings, starting with $first and ending
4e5a7b2c 1049with $last, for the witness given in $sigil. If a $backup sigil is
1050specified (e.g. when walking a layered witness), it will be used wherever
1051no $sigil path exists. If there is a base text reading, that will be
1052used wherever no path exists for $sigil or $backup.
e2902068 1053
1054=cut
1055
910a0a6d 1056# TODO Think about returning some lazy-eval iterator.
b0b4421a 1057# TODO Get rid of backup; we should know from what witness is whether we need it.
910a0a6d 1058
e2902068 1059sub reading_sequence {
861c3e27 1060 my( $self, $start, $end, $witness ) = @_;
e2902068 1061
930ff666 1062 $witness = $self->baselabel unless $witness;
e2902068 1063 my @readings = ( $start );
1064 my %seen;
1065 my $n = $start;
3a2ebbf4 1066 while( $n && $n->id ne $end->id ) {
1067 if( exists( $seen{$n->id} ) ) {
63778331 1068 throw( "Detected loop for $witness at " . $n->id );
910a0a6d 1069 }
3a2ebbf4 1070 $seen{$n->id} = 1;
910a0a6d 1071
861c3e27 1072 my $next = $self->next_reading( $n, $witness );
44771cf2 1073 unless( $next ) {
63778331 1074 throw( "Did not find any path for $witness from reading " . $n->id );
44771cf2 1075 }
910a0a6d 1076 push( @readings, $next );
1077 $n = $next;
e2902068 1078 }
1079 # Check that the last reading is our end reading.
1080 my $last = $readings[$#readings];
63778331 1081 throw( "Last reading found from " . $start->text .
1082 " for witness $witness is not the end!" ) # TODO do we get this far?
3a2ebbf4 1083 unless $last->id eq $end->id;
e2902068 1084
1085 return @readings;
1086}
1087
4e5a7b2c 1088=head2 next_reading( $reading, $sigil );
8e1394aa 1089
4a8828f0 1090Returns the reading that follows the given reading along the given witness
930ff666 1091path.
8e1394aa 1092
1093=cut
1094
4a8828f0 1095sub next_reading {
e2902068 1096 # Return the successor via the corresponding path.
8e1394aa 1097 my $self = shift;
3a2ebbf4 1098 my $answer = $self->_find_linked_reading( 'next', @_ );
2c669bca 1099 return undef unless $answer;
3a2ebbf4 1100 return $self->reading( $answer );
8e1394aa 1101}
1102
4e5a7b2c 1103=head2 prior_reading( $reading, $sigil )
8e1394aa 1104
4a8828f0 1105Returns the reading that precedes the given reading along the given witness
930ff666 1106path.
8e1394aa 1107
1108=cut
1109
4a8828f0 1110sub prior_reading {
e2902068 1111 # Return the predecessor via the corresponding path.
8e1394aa 1112 my $self = shift;
3a2ebbf4 1113 my $answer = $self->_find_linked_reading( 'prior', @_ );
1114 return $self->reading( $answer );
8e1394aa 1115}
1116
4a8828f0 1117sub _find_linked_reading {
861c3e27 1118 my( $self, $direction, $node, $path ) = @_;
1119
1120 # Get a backup if we are dealing with a layered witness
1121 my $alt_path;
1122 my $aclabel = $self->ac_label;
1123 if( $path && $path =~ /^(.*)\Q$aclabel\E$/ ) {
1124 $alt_path = $1;
1125 }
1126
e2902068 1127 my @linked_paths = $direction eq 'next'
3a2ebbf4 1128 ? $self->sequence->edges_from( $node )
1129 : $self->sequence->edges_to( $node );
e2902068 1130 return undef unless scalar( @linked_paths );
8e1394aa 1131
e2902068 1132 # We have to find the linked path that contains all of the
1133 # witnesses supplied in $path.
1134 my( @path_wits, @alt_path_wits );
4e5a7b2c 1135 @path_wits = sort( $self->_witnesses_of_label( $path ) ) if $path;
1136 @alt_path_wits = sort( $self->_witnesses_of_label( $alt_path ) ) if $alt_path;
e2902068 1137 my $base_le;
1138 my $alt_le;
1139 foreach my $le ( @linked_paths ) {
3a2ebbf4 1140 if( $self->sequence->has_edge_attribute( @$le, $self->baselabel ) ) {
910a0a6d 1141 $base_le = $le;
910a0a6d 1142 }
508fd430 1143 my @le_wits = sort $self->path_witnesses( $le );
3a2ebbf4 1144 if( _is_within( \@path_wits, \@le_wits ) ) {
1145 # This is the right path.
1146 return $direction eq 'next' ? $le->[1] : $le->[0];
1147 } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) {
1148 $alt_le = $le;
1149 }
8e1394aa 1150 }
e2902068 1151 # Got this far? Return the alternate path if it exists.
3a2ebbf4 1152 return $direction eq 'next' ? $alt_le->[1] : $alt_le->[0]
910a0a6d 1153 if $alt_le;
e2902068 1154
1155 # Got this far? Return the base path if it exists.
3a2ebbf4 1156 return $direction eq 'next' ? $base_le->[1] : $base_le->[0]
910a0a6d 1157 if $base_le;
e2902068 1158
1159 # Got this far? We have no appropriate path.
2c669bca 1160 warn "Could not find $direction node from " . $node->id
910a0a6d 1161 . " along path $path";
8e1394aa 1162 return undef;
1163}
1164
4a8828f0 1165# Some set logic.
1166sub _is_within {
1167 my( $set1, $set2 ) = @_;
7854e12e 1168 my $ret = @$set1; # will be 0, i.e. false, if set1 is empty
4a8828f0 1169 foreach my $el ( @$set1 ) {
910a0a6d 1170 $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
4a8828f0 1171 }
1172 return $ret;
1173}
1174
4e5a7b2c 1175# Return the string that joins together a list of witnesses for
1176# display on a single path.
1177sub _witnesses_of_label {
1178 my( $self, $label ) = @_;
1179 my $regex = $self->wit_list_separator;
1180 my @answer = split( /\Q$regex\E/, $label );
1181 return @answer;
b0b4421a 1182}
1183
d4b75f44 1184=head2 common_readings
1185
1186Returns the list of common readings in the graph (i.e. those readings that are
1187shared by all non-lacunose witnesses.)
1188
1189=cut
1190
1191sub common_readings {
1192 my $self = shift;
1193 my @common = grep { $_->is_common } $self->readings;
1194 return @common;
1195}
1196
b0b4421a 1197=head2 path_text( $sigil, $mainsigil [, $start, $end ] )
1198
1199Returns the text of a witness (plus its backup, if we are using a layer)
1200as stored in the collation. The text is returned as a string, where the
1201individual readings are joined with spaces and the meta-readings (e.g.
1202lacunae) are omitted. Optional specification of $start and $end allows
1203the generation of a subset of the witness text.
4e5a7b2c 1204
b0b4421a 1205=cut
1206
1207sub path_text {
861c3e27 1208 my( $self, $wit, $start, $end ) = @_;
b0b4421a 1209 $start = $self->start unless $start;
1210 $end = $self->end unless $end;
861c3e27 1211 my @path = grep { !$_->is_meta } $self->reading_sequence( $start, $end, $wit );
629e27b0 1212 my $pathtext = '';
1213 my $last;
1214 foreach my $r ( @path ) {
1215 if( $r->join_prior || !$last || $last->join_next ) {
1216 $pathtext .= $r->text;
1217 } else {
1218 $pathtext .= ' ' . $r->text;
1219 }
1220 $last = $r;
1221 }
1222 return $pathtext;
b0b4421a 1223}
4e5a7b2c 1224
1225=head1 INITIALIZATION METHODS
1226
1227These are mostly for use by parsers.
1228
1229=head2 make_witness_path( $witness )
1230
1231Link the array of readings contained in $witness->path (and in
1232$witness->uncorrected_path if it exists) into collation paths.
1233Clear out the arrays when finished.
de51424a 1234
4e5a7b2c 1235=head2 make_witness_paths
1236
1237Call make_witness_path for all witnesses in the tradition.
1238
1239=cut
930ff666 1240
7e450e44 1241# For use when a collation is constructed from a base text and an apparatus.
1242# We have the sequences of readings and just need to add path edges.
1f7aa795 1243# When we are done, clear out the witness path attributes, as they are no
1244# longer needed.
1245# TODO Find a way to replace the witness path attributes with encapsulated functions?
e2902068 1246
6a222840 1247sub make_witness_paths {
1248 my( $self ) = @_;
910a0a6d 1249 foreach my $wit ( $self->tradition->witnesses ) {
0068967c 1250 # print STDERR "Making path for " . $wit->sigil . "\n";
910a0a6d 1251 $self->make_witness_path( $wit );
7854e12e 1252 }
7854e12e 1253}
1254
6a222840 1255sub make_witness_path {
7854e12e 1256 my( $self, $wit ) = @_;
1257 my @chain = @{$wit->path};
15d2d3df 1258 my $sig = $wit->sigil;
7854e12e 1259 foreach my $idx ( 0 .. $#chain-1 ) {
910a0a6d 1260 $self->add_path( $chain[$idx], $chain[$idx+1], $sig );
7854e12e 1261 }
1f7aa795 1262 if( $wit->is_layered ) {
d9e873d0 1263 @chain = @{$wit->uncorrected_path};
1264 foreach my $idx( 0 .. $#chain-1 ) {
1265 my $source = $chain[$idx];
1266 my $target = $chain[$idx+1];
1267 $self->add_path( $source, $target, $sig.$self->ac_label )
1268 unless $self->has_path( $source, $target, $sig );
1269 }
15d2d3df 1270 }
1f7aa795 1271 $wit->clear_path;
1272 $wit->clear_uncorrected_path;
e2902068 1273}
1274
4e5a7b2c 1275=head2 calculate_ranks
1276
1277Calculate the reading ranks (that is, their aligned positions relative
1278to each other) for the graph. This can only be called on linear collations.
1279
b365fbae 1280=begin testing
1281
1282use Text::Tradition;
1283
1284my $cxfile = 't/data/Collatex-16.xml';
1285my $t = Text::Tradition->new(
1286 'name' => 'inline',
1287 'input' => 'CollateX',
1288 'file' => $cxfile,
1289 );
1290my $c = $t->collation;
1291
1292# Make an svg
bfcbcecb 1293my $table = $c->alignment_table;
1294ok( $c->has_cached_table, "Alignment table was cached" );
1295is( $c->alignment_table, $table, "Cached table returned upon second call" );
b365fbae 1296$c->calculate_ranks;
bfcbcecb 1297is( $c->alignment_table, $table, "Cached table retained with no rank change" );
b365fbae 1298$c->add_relationship( 'n9', 'n23', { 'type' => 'spelling' } );
bfcbcecb 1299isnt( $c->alignment_table, $table, "Alignment table changed after relationship add" );
b365fbae 1300
1301=end testing
1302
4e5a7b2c 1303=cut
1304
910a0a6d 1305sub calculate_ranks {
1306 my $self = shift;
b365fbae 1307 # Save the existing ranks, in case we need to invalidate the cached SVG.
1308 my %existing_ranks;
910a0a6d 1309 # Walk a version of the graph where every node linked by a relationship
1310 # edge is fundamentally the same node, and do a topological ranking on
1311 # the nodes in this graph.
c9bf3dbf 1312 my $topo_graph = Graph->new();
910a0a6d 1313 my %rel_containers;
1314 my $rel_ctr = 0;
1315 # Add the nodes
1316 foreach my $r ( $self->readings ) {
3a2ebbf4 1317 next if exists $rel_containers{$r->id};
910a0a6d 1318 my @rels = $r->related_readings( 'colocated' );
1319 if( @rels ) {
1320 # Make a relationship container.
1321 push( @rels, $r );
c9bf3dbf 1322 my $rn = 'rel_container_' . $rel_ctr++;
1323 $topo_graph->add_vertex( $rn );
910a0a6d 1324 foreach( @rels ) {
3a2ebbf4 1325 $rel_containers{$_->id} = $rn;
910a0a6d 1326 }
1327 } else {
1328 # Add a new node to mirror the old node.
3a2ebbf4 1329 $rel_containers{$r->id} = $r->id;
1330 $topo_graph->add_vertex( $r->id );
910a0a6d 1331 }
4a8828f0 1332 }
3a1f2523 1333
3a2ebbf4 1334 # Add the edges.
910a0a6d 1335 foreach my $r ( $self->readings ) {
b365fbae 1336 $existing_ranks{$r} = $r->rank;
3a2ebbf4 1337 foreach my $n ( $self->sequence->successors( $r->id ) ) {
1338 my( $tfrom, $tto ) = ( $rel_containers{$r->id},
1339 $rel_containers{$n} );
4e5a7b2c 1340 # $DB::single = 1 unless $tfrom && $tto;
3a2ebbf4 1341 $topo_graph->add_edge( $tfrom, $tto );
910a0a6d 1342 }
1343 }
1344
1345 # Now do the rankings, starting with the start node.
3a2ebbf4 1346 my $topo_start = $rel_containers{$self->start->id};
c9bf3dbf 1347 my $node_ranks = { $topo_start => 0 };
910a0a6d 1348 my @curr_origin = ( $topo_start );
1349 # A little iterative function.
1350 while( @curr_origin ) {
c9bf3dbf 1351 @curr_origin = _assign_rank( $topo_graph, $node_ranks, @curr_origin );
910a0a6d 1352 }
1353 # Transfer our rankings from the topological graph to the real one.
1354 foreach my $r ( $self->readings ) {
3a2ebbf4 1355 if( defined $node_ranks->{$rel_containers{$r->id}} ) {
1356 $r->rank( $node_ranks->{$rel_containers{$r->id}} );
67da8d6c 1357 } else {
63778331 1358 # Die. Find the last rank we calculated.
1359 my @all_defined = sort { $node_ranks->{$rel_containers{$a->id}}
1360 <=> $node_ranks->{$rel_containers{$b->id}} }
1361 $self->readings;
1362 my $last = pop @all_defined;
1363 throw( "Ranks not calculated after $last - do you have a cycle in the graph?" );
67da8d6c 1364 }
de51424a 1365 }
bfcbcecb 1366 # Do we need to invalidate the cached data?
1367 if( $self->has_cached_svg || $self->has_cached_table ) {
b365fbae 1368 foreach my $r ( $self->readings ) {
7c293912 1369 next if defined( $existing_ranks{$r} )
1370 && $existing_ranks{$r} == $r->rank;
c1915ab9 1371 # Something has changed, so clear the cache
bfcbcecb 1372 $self->_clear_cache;
c1915ab9 1373 # ...and recalculate the common readings.
1374 $self->calculate_common_readings();
b365fbae 1375 last;
1376 }
1377 }
c1915ab9 1378 # The graph calculation information is now up to date.
1379 $self->_graphcalc_done(1);
8e1394aa 1380}
3a1f2523 1381
910a0a6d 1382sub _assign_rank {
c9bf3dbf 1383 my( $graph, $node_ranks, @current_nodes ) = @_;
910a0a6d 1384 # Look at each of the children of @current_nodes. If all the child's
1385 # parents have a rank, assign it the highest rank + 1 and add it to
c9bf3dbf 1386 # @next_nodes. Otherwise skip it; we will return when the highest-ranked
1387 # parent gets a rank.
910a0a6d 1388 my @next_nodes;
1389 foreach my $c ( @current_nodes ) {
c9bf3dbf 1390 warn "Current reading $c has no rank!"
1391 unless exists $node_ranks->{$c};
1392 # print STDERR "Looking at child of node $c, rank "
1393 # . $node_ranks->{$c} . "\n";
1394 foreach my $child ( $graph->successors( $c ) ) {
1395 next if exists $node_ranks->{$child};
910a0a6d 1396 my $highest_rank = -1;
1397 my $skip = 0;
c9bf3dbf 1398 foreach my $parent ( $graph->predecessors( $child ) ) {
1399 if( exists $node_ranks->{$parent} ) {
1400 $highest_rank = $node_ranks->{$parent}
1401 if $highest_rank <= $node_ranks->{$parent};
910a0a6d 1402 } else {
1403 $skip = 1;
1404 last;
1405 }
1406 }
1407 next if $skip;
c9bf3dbf 1408 my $c_rank = $highest_rank + 1;
1409 # print STDERR "Assigning rank $c_rank to node $child \n";
1410 $node_ranks->{$child} = $c_rank;
910a0a6d 1411 push( @next_nodes, $child );
1412 }
1413 }
1414 return @next_nodes;
4cdd82f1 1415}
910a0a6d 1416
c1915ab9 1417sub _clear_cache {
1418 my $self = shift;
1419 $self->wipe_svg if $self->has_cached_svg;
1420 $self->wipe_table if $self->has_cached_table;
1421}
1422
1423
4e5a7b2c 1424=head2 flatten_ranks
1425
1426A convenience method for parsing collation data. Searches the graph for readings
1427with the same text at the same rank, and merges any that are found.
1428
1429=cut
1430
0e476982 1431sub flatten_ranks {
1432 my $self = shift;
1433 my %unique_rank_rdg;
1434 foreach my $rdg ( $self->readings ) {
1435 next unless $rdg->has_rank;
1436 my $key = $rdg->rank . "||" . $rdg->text;
1437 if( exists $unique_rank_rdg{$key} ) {
1438 # Combine!
56eefa04 1439 # print STDERR "Combining readings at same rank: $key\n";
0e476982 1440 $self->merge_readings( $unique_rank_rdg{$key}, $rdg );
0ecb975c 1441 # TODO see if this now makes a common point.
0e476982 1442 } else {
1443 $unique_rank_rdg{$key} = $rdg;
1444 }
1445 }
1446}
4633f9e4 1447
1448
d4b75f44 1449=head2 calculate_common_readings
1450
1451Goes through the graph identifying the readings that appear in every witness
1452(apart from those with lacunae at that spot.) Marks them as common and returns
1453the list.
1454
1455=begin testing
1456
1457use Text::Tradition;
1458
1459my $cxfile = 't/data/Collatex-16.xml';
1460my $t = Text::Tradition->new(
1461 'name' => 'inline',
1462 'input' => 'CollateX',
1463 'file' => $cxfile,
1464 );
1465my $c = $t->collation;
1466
1467my @common = $c->calculate_common_readings();
1468is( scalar @common, 8, "Found correct number of common readings" );
1469my @marked = sort $c->common_readings();
1470is( scalar @common, 8, "All common readings got marked as such" );
1471my @expected = qw/ n1 n12 n16 n19 n20 n5 n6 n7 /;
1472is_deeply( \@marked, \@expected, "Found correct list of common readings" );
1473
1474=end testing
1475
1476=cut
1477
1478sub calculate_common_readings {
1479 my $self = shift;
1480 my @common;
c1915ab9 1481 map { $_->is_common( 0 ) } $self->readings;
1482 # Implicitly calls calculate_ranks
1dd07bda 1483 my $table = $self->alignment_table;
d4b75f44 1484 foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
7f52eac8 1485 my @row = map { $_->{'tokens'}->[$idx]
1486 ? $_->{'tokens'}->[$idx]->{'t'} : '' }
1487 @{$table->{'alignment'}};
d4b75f44 1488 my %hash;
1489 foreach my $r ( @row ) {
1490 if( $r ) {
1491 $hash{$r->id} = $r unless $r->is_meta;
1492 } else {
1493 $hash{'UNDEF'} = $r;
1494 }
1495 }
1496 if( keys %hash == 1 && !exists $hash{'UNDEF'} ) {
1497 my( $r ) = values %hash;
1498 $r->is_common( 1 );
1499 push( @common, $r );
1500 }
1501 }
1502 return @common;
1503}
1504
861c3e27 1505=head2 text_from_paths
1506
1507Calculate the text array for all witnesses from the path, for later consistency
1508checking. Only to be used if there is no non-graph-based way to know the
1509original texts.
1510
1511=cut
1512
1513sub text_from_paths {
1514 my $self = shift;
1515 foreach my $wit ( $self->tradition->witnesses ) {
1516 my @text = split( /\s+/,
1517 $self->reading_sequence( $self->start, $self->end, $wit->sigil ) );
1518 $wit->text( \@text );
1519 if( $wit->is_layered ) {
1520 my @uctext = split( /\s+/,
1521 $self->reading_sequence( $self->start, $self->end,
1522 $wit->sigil.$self->ac_label ) );
1523 $wit->text( \@uctext );
1524 }
1525 }
1526}
0e476982 1527
4e5a7b2c 1528=head1 UTILITY FUNCTIONS
1529
1530=head2 common_predecessor( $reading_a, $reading_b )
8e1394aa 1531
4e5a7b2c 1532Find the last reading that occurs in sequence before both the given readings.
1533
1534=head2 common_successor( $reading_a, $reading_b )
1535
1536Find the first reading that occurs in sequence after both the given readings.
1537
22222af9 1538=begin testing
1539
1540use Text::Tradition;
1541
1542my $cxfile = 't/data/Collatex-16.xml';
1543my $t = Text::Tradition->new(
1544 'name' => 'inline',
1545 'input' => 'CollateX',
1546 'file' => $cxfile,
1547 );
1548my $c = $t->collation;
1549
4e5a7b2c 1550is( $c->common_predecessor( 'n9', 'n23' )->id,
22222af9 1551 'n20', "Found correct common predecessor" );
4e5a7b2c 1552is( $c->common_successor( 'n9', 'n23' )->id,
22222af9 1553 '#END#', "Found correct common successor" );
1554
4e5a7b2c 1555is( $c->common_predecessor( 'n19', 'n17' )->id,
22222af9 1556 'n16', "Found correct common predecessor for readings on same path" );
4e5a7b2c 1557is( $c->common_successor( 'n21', 'n26' )->id,
22222af9 1558 '#END#', "Found correct common successor for readings on same path" );
1559
1560=end testing
1561
1562=cut
1563
1564## Return the closest reading that is a predecessor of both the given readings.
1565sub common_predecessor {
1566 my $self = shift;
4e5a7b2c 1567 my( $r1, $r2 ) = $self->_objectify_args( @_ );
027d819c 1568 return $self->_common_in_path( $r1, $r2, 'predecessors' );
22222af9 1569}
1570
1571sub common_successor {
1572 my $self = shift;
4e5a7b2c 1573 my( $r1, $r2 ) = $self->_objectify_args( @_ );
027d819c 1574 return $self->_common_in_path( $r1, $r2, 'successors' );
22222af9 1575}
1576
027d819c 1577sub _common_in_path {
22222af9 1578 my( $self, $r1, $r2, $dir ) = @_;
1579 my $iter = $r1->rank > $r2->rank ? $r1->rank : $r2->rank;
1580 $iter = $self->end->rank - $iter if $dir eq 'successors';
1581 my @candidates;
1582 my @last_checked = ( $r1, $r2 );
1583 my %all_seen;
1584 while( !@candidates ) {
1585 my @new_lc;
1586 foreach my $lc ( @last_checked ) {
1587 foreach my $p ( $lc->$dir ) {
1588 if( $all_seen{$p->id} ) {
1589 push( @candidates, $p );
1590 } else {
1591 $all_seen{$p->id} = 1;
1592 push( @new_lc, $p );
1593 }
1594 }
1595 }
1596 @last_checked = @new_lc;
1597 }
1598 my @answer = sort { $a->rank <=> $b->rank } @candidates;
1599 return $dir eq 'predecessors' ? pop( @answer ) : shift ( @answer );
1600}
1601
63778331 1602sub throw {
1603 Text::Tradition::Error->throw(
1604 'ident' => 'Collation error',
1605 'message' => $_[0],
1606 );
1607}
1608
dd3b58b0 1609no Moose;
1610__PACKAGE__->meta->make_immutable;
e867486f 1611
027d819c 1612=head1 LICENSE
e867486f 1613
027d819c 1614This package is free software and is provided "as is" without express
1615or implied warranty. You can redistribute it and/or modify it under
1616the same terms as Perl itself.
e867486f 1617
027d819c 1618=head1 AUTHOR
e867486f 1619
027d819c 1620Tara L Andrews E<lt>aurum@cpan.orgE<gt>