try to be smarter about recalculating rank and common readings
[scpubgit/stemmatology.git] / lib / Text / Tradition / Collation.pm
CommitLineData
dd3b58b0 1package Text::Tradition::Collation;
d047cd52 2
910a0a6d 3use Encode qw( decode_utf8 );
4use File::Temp;
bfcbcecb 5use File::Which;
c9bf3dbf 6use Graph;
8e1394aa 7use IPC::Run qw( run binary );
910a0a6d 8use Text::CSV_XS;
b15511bf 9use Text::Tradition::Collation::Reading;
22222af9 10use Text::Tradition::Collation::RelationshipStore;
63778331 11use Text::Tradition::Error;
df6d9812 12use XML::LibXML;
a344ffcf 13use XML::LibXML::XPathContext;
dd3b58b0 14use Moose;
15
3a2ebbf4 16has 'sequence' => (
d047cd52 17 is => 'ro',
3a2ebbf4 18 isa => 'Graph',
19 default => sub { Graph->new() },
d047cd52 20 handles => {
3a2ebbf4 21 paths => 'edges',
d047cd52 22 },
d047cd52 23 );
3a2ebbf4 24
25has 'relations' => (
26 is => 'ro',
22222af9 27 isa => 'Text::Tradition::Collation::RelationshipStore',
28 handles => {
29 relationships => 'relationships',
30 related_readings => 'related_readings',
ee801e17 31 del_relationship => 'del_relationship',
22222af9 32 },
33 writer => '_set_relations',
3a2ebbf4 34 );
dd3b58b0 35
3a2ebbf4 36has 'tradition' => (
37 is => 'ro',
d047cd52 38 isa => 'Text::Tradition',
8d9a1cd8 39 weak_ref => 1,
d047cd52 40 );
dd3b58b0 41
3a2ebbf4 42has 'readings' => (
43 isa => 'HashRef[Text::Tradition::Collation::Reading]',
44 traits => ['Hash'],
45 handles => {
46 reading => 'get',
47 _add_reading => 'set',
48 del_reading => 'delete',
49 has_reading => 'exists',
50 readings => 'values',
51 },
52 default => sub { {} },
53 );
910a0a6d 54
4a8828f0 55has 'wit_list_separator' => (
7854e12e 56 is => 'rw',
57 isa => 'Str',
58 default => ', ',
59 );
60
61has 'baselabel' => (
62 is => 'rw',
63 isa => 'Str',
64 default => 'base text',
65 );
4a8828f0 66
15d2d3df 67has 'linear' => (
68 is => 'rw',
69 isa => 'Bool',
70 default => 1,
71 );
c84275ff 72
ef9d481f 73has 'ac_label' => (
74 is => 'rw',
75 isa => 'Str',
76 default => ' (a.c.)',
77 );
3a2ebbf4 78
79has 'start' => (
80 is => 'ro',
81 isa => 'Text::Tradition::Collation::Reading',
82 writer => '_set_start',
83 weak_ref => 1,
84 );
85
86has 'end' => (
87 is => 'ro',
88 isa => 'Text::Tradition::Collation::Reading',
89 writer => '_set_end',
90 weak_ref => 1,
91 );
b365fbae 92
93has 'cached_svg' => (
94 is => 'rw',
95 isa => 'Str',
96 predicate => 'has_cached_svg',
97 clearer => 'wipe_svg',
98 );
1dd07bda 99
100has 'cached_table' => (
101 is => 'rw',
102 isa => 'HashRef',
103 predicate => 'has_cached_table',
104 clearer => 'wipe_table',
105 );
c1915ab9 106
107has '_graphcalc_done' => (
108 is => 'rw',
109 isa => 'Bool',
110 default => undef,
111 );
1f563ac3 112
4e5a7b2c 113=head1 NAME
114
115Text::Tradition::Collation - a software model for a text collation
116
117=head1 SYNOPSIS
118
119 use Text::Tradition;
120 my $t = Text::Tradition->new(
121 'name' => 'this is a text',
122 'input' => 'TEI',
123 'file' => '/path/to/tei_parallel_seg_file.xml' );
124
125 my $c = $t->collation;
126 my @readings = $c->readings;
127 my @paths = $c->paths;
128 my @relationships = $c->relationships;
129
130 my $svg_variant_graph = $t->collation->as_svg();
131
132=head1 DESCRIPTION
133
134Text::Tradition is a library for representation and analysis of collated
135texts, particularly medieval ones. The Collation is the central feature of
136a Tradition, where the text, its sequence of readings, and its relationships
137between readings are actually kept.
138
139=head1 CONSTRUCTOR
140
141=head2 new
142
143The constructor. Takes a hash or hashref of the following arguments:
144
145=over
146
147=item * tradition - The Text::Tradition object to which the collation
148belongs. Required.
149
150=item * linear - Whether the collation should be linear; that is, whether
151transposed readings should be treated as two linked readings rather than one,
152and therefore whether the collation graph is acyclic. Defaults to true.
153
4e5a7b2c 154=item * baselabel - The default label for the path taken by a base text
155(if any). Defaults to 'base text'.
156
157=item * wit_list_separator - The string to join a list of witnesses for
158purposes of making labels in display graphs. Defaults to ', '.
159
160=item * ac_label - The extra label to tack onto a witness sigil when
161representing another layer of path for the given witness - that is, when
162a text has more than one possible reading due to scribal corrections or
163the like. Defaults to ' (a.c.)'.
164
165=back
166
167=head1 ACCESSORS
168
169=head2 tradition
170
171=head2 linear
172
4e5a7b2c 173=head2 wit_list_separator
174
175=head2 baselabel
176
177=head2 ac_label
178
179Simple accessors for collation attributes.
180
181=head2 start
182
183The meta-reading at the start of every witness path.
184
185=head2 end
186
187The meta-reading at the end of every witness path.
188
189=head2 readings
190
191Returns all Reading objects in the graph.
192
193=head2 reading( $id )
194
195Returns the Reading object corresponding to the given ID.
196
197=head2 add_reading( $reading_args )
198
199Adds a new reading object to the collation.
200See L<Text::Tradition::Collation::Reading> for the available arguments.
201
202=head2 del_reading( $object_or_id )
203
204Removes the given reading from the collation, implicitly removing its
205paths and relationships.
206
207=head2 merge_readings( $main, $second )
208
209Merges the $second reading into the $main one.
210The arguments may be either readings or reading IDs.
211
212=head2 has_reading( $id )
213
214Predicate to see whether a given reading ID is in the graph.
215
216=head2 reading_witnesses( $object_or_id )
217
218Returns a list of sigils whose witnesses contain the reading.
219
220=head2 paths
221
222Returns all reading paths within the document - that is, all edges in the
223collation graph. Each path is an arrayref of [ $source, $target ] reading IDs.
224
225=head2 add_path( $source, $target, $sigil )
226
227Links the given readings in the collation in sequence, under the given witness
228sigil. The readings may be specified by object or ID.
229
230=head2 del_path( $source, $target, $sigil )
231
232Links the given readings in the collation in sequence, under the given witness
233sigil. The readings may be specified by object or ID.
234
235=head2 has_path( $source, $target );
236
237Returns true if the two readings are linked in sequence in any witness.
238The readings may be specified by object or ID.
239
240=head2 relationships
241
242Returns all Relationship objects in the collation.
243
244=head2 add_relationship( $reading, $other_reading, $options )
245
246Adds a new relationship of the type given in $options between the two readings,
247which may be specified by object or ID. Returns a value of ( $status, @vectors)
248where $status is true on success, and @vectors is a list of relationship edges
249that were ultimately added.
250See L<Text::Tradition::Collation::Relationship> for the available options.
251
252=cut
dd3b58b0 253
d047cd52 254sub BUILD {
3a2ebbf4 255 my $self = shift;
22222af9 256 $self->_set_relations( Text::Tradition::Collation::RelationshipStore->new( 'collation' => $self ) );
3a2ebbf4 257 $self->_set_start( $self->add_reading( { 'collation' => $self, 'is_start' => 1 } ) );
258 $self->_set_end( $self->add_reading( { 'collation' => $self, 'is_end' => 1 } ) );
d047cd52 259}
784877d9 260
3a2ebbf4 261### Reading construct/destruct functions
262
263sub add_reading {
264 my( $self, $reading ) = @_;
265 unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
266 my %args = %$reading;
267 $reading = Text::Tradition::Collation::Reading->new(
268 'collation' => $self,
269 %args );
270 }
271 # First check to see if a reading with this ID exists.
272 if( $self->reading( $reading->id ) ) {
63778331 273 throw( "Collation already has a reading with id " . $reading->id );
3a2ebbf4 274 }
c1915ab9 275 $self->_graphcalc_done(0);
3a2ebbf4 276 $self->_add_reading( $reading->id => $reading );
277 # Once the reading has been added, put it in both graphs.
278 $self->sequence->add_vertex( $reading->id );
22222af9 279 $self->relations->add_reading( $reading->id );
3a2ebbf4 280 return $reading;
eca16057 281};
282
3a2ebbf4 283around del_reading => sub {
284 my $orig = shift;
285 my $self = shift;
286 my $arg = shift;
287
288 if( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
289 $arg = $arg->id;
290 }
3a2ebbf4 291 # Remove the reading from the graphs.
c1915ab9 292 $self->_graphcalc_done(0);
3a2ebbf4 293 $self->sequence->delete_vertex( $arg );
22222af9 294 $self->relations->delete_reading( $arg );
3a2ebbf4 295
296 # Carry on.
297 $self->$orig( $arg );
298};
7854e12e 299
3a2ebbf4 300# merge_readings( $main, $to_be_deleted );
7854e12e 301
3a2ebbf4 302sub merge_readings {
303 my $self = shift;
304
305 # We only need the IDs for adding paths to the graph, not the reading
306 # objects themselves.
49d4f2ac 307 my( $kept, $deleted, $combine_char ) = $self->_stringify_args( @_ );
c1915ab9 308 $self->_graphcalc_done(0);
3a2ebbf4 309
310 # The kept reading should inherit the paths and the relationships
311 # of the deleted reading.
312 foreach my $path ( $self->sequence->edges_at( $deleted ) ) {
313 my @vector = ( $kept );
314 push( @vector, $path->[1] ) if $path->[0] eq $deleted;
315 unshift( @vector, $path->[0] ) if $path->[1] eq $deleted;
49d4f2ac 316 next if $vector[0] eq $vector[1]; # Don't add a self loop
3a2ebbf4 317 my %wits = %{$self->sequence->get_edge_attributes( @$path )};
318 $self->sequence->add_edge( @vector );
319 my $fwits = $self->sequence->get_edge_attributes( @vector );
320 @wits{keys %$fwits} = values %$fwits;
321 $self->sequence->set_edge_attributes( @vector, \%wits );
322 }
22222af9 323 $self->relations->merge_readings( $kept, $deleted, $combine_char );
3a2ebbf4 324
325 # Do the deletion deed.
49d4f2ac 326 if( $combine_char ) {
327 my $kept_obj = $self->reading( $kept );
328 my $new_text = join( $combine_char, $kept_obj->text,
329 $self->reading( $deleted )->text );
330 $kept_obj->alter_text( $new_text );
331 }
3a2ebbf4 332 $self->del_reading( $deleted );
333}
7854e12e 334
3265b0ce 335
3a2ebbf4 336# Helper function for manipulating the graph.
337sub _stringify_args {
338 my( $self, $first, $second, $arg ) = @_;
339 $first = $first->id
340 if ref( $first ) eq 'Text::Tradition::Collation::Reading';
341 $second = $second->id
342 if ref( $second ) eq 'Text::Tradition::Collation::Reading';
343 return( $first, $second, $arg );
344}
df6d9812 345
4e5a7b2c 346# Helper function for manipulating the graph.
347sub _objectify_args {
348 my( $self, $first, $second, $arg ) = @_;
349 $first = $self->reading( $first )
350 unless ref( $first ) eq 'Text::Tradition::Collation::Reading';
351 $second = $self->reading( $second )
352 unless ref( $second ) eq 'Text::Tradition::Collation::Reading';
353 return( $first, $second, $arg );
354}
3a2ebbf4 355### Path logic
356
357sub add_path {
358 my $self = shift;
359
360 # We only need the IDs for adding paths to the graph, not the reading
361 # objects themselves.
362 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
363
c1915ab9 364 $self->_graphcalc_done(0);
3a2ebbf4 365 # Connect the readings
366 $self->sequence->add_edge( $source, $target );
367 # Note the witness in question
368 $self->sequence->set_edge_attribute( $source, $target, $wit, 1 );
b15511bf 369};
370
3a2ebbf4 371sub del_path {
372 my $self = shift;
49d4f2ac 373 my @args;
374 if( ref( $_[0] ) eq 'ARRAY' ) {
375 my $e = shift @_;
376 @args = ( @$e, @_ );
377 } else {
378 @args = @_;
379 }
3a2ebbf4 380
381 # We only need the IDs for adding paths to the graph, not the reading
382 # objects themselves.
49d4f2ac 383 my( $source, $target, $wit ) = $self->_stringify_args( @args );
3a2ebbf4 384
c1915ab9 385 $self->_graphcalc_done(0);
3a2ebbf4 386 if( $self->sequence->has_edge_attribute( $source, $target, $wit ) ) {
49d4f2ac 387 $self->sequence->delete_edge_attribute( $source, $target, $wit );
3a2ebbf4 388 }
389 unless( keys %{$self->sequence->get_edge_attributes( $source, $target )} ) {
390 $self->sequence->delete_edge( $source, $target );
391 }
784877d9 392}
393
3a2ebbf4 394
15d2d3df 395# Extra graph-alike utility
396sub has_path {
3a2ebbf4 397 my $self = shift;
398 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
399 return undef unless $self->sequence->has_edge( $source, $target );
400 return $self->sequence->has_edge_attribute( $source, $target, $wit );
b15511bf 401}
402
4e5a7b2c 403=head2 clear_witness( @sigil_list )
3a2ebbf4 404
4e5a7b2c 405Clear the given witnesses out of the collation entirely, removing references
406to them in paths, and removing readings that belong only to them. Should only
407be called via $tradition->del_witness.
3a2ebbf4 408
409=cut
410
4e5a7b2c 411sub clear_witness {
412 my( $self, @sigils ) = @_;
413
c1915ab9 414 $self->_graphcalc_done(0);
4e5a7b2c 415 # Clear the witness(es) out of the paths
416 foreach my $e ( $self->paths ) {
417 foreach my $sig ( @sigils ) {
418 $self->del_path( $e, $sig );
419 }
420 }
421
422 # Clear out the newly unused readings
423 foreach my $r ( $self->readings ) {
424 unless( $self->reading_witnesses( $r ) ) {
425 $self->del_reading( $r );
426 }
427 }
428}
3a2ebbf4 429
430sub add_relationship {
431 my $self = shift;
22222af9 432 my( $source, $target, $opts ) = $self->_stringify_args( @_ );
63778331 433 my( @vectors ) = $self->relations->add_relationship( $source,
64ae6270 434 $self->reading( $source ), $target, $self->reading( $target ), $opts );
c1915ab9 435 $self->_graphcalc_done(0);
63778331 436 return @vectors;
22222af9 437}
ef9d481f 438
22222af9 439=head2 reading_witnesses( $reading )
910a0a6d 440
22222af9 441Return a list of sigils corresponding to the witnesses in which the reading appears.
3265b0ce 442
22222af9 443=cut
1d310495 444
1d310495 445sub reading_witnesses {
446 my( $self, $reading ) = @_;
447 # We need only check either the incoming or the outgoing edges; I have
96dc90ec 448 # arbitrarily chosen "incoming". Thus, special-case the start node.
449 if( $reading eq $self->start ) {
450 return map { $_->sigil } $self->tradition->witnesses;
451 }
1d310495 452 my %all_witnesses;
453 foreach my $e ( $self->sequence->edges_to( $reading ) ) {
454 my $wits = $self->sequence->get_edge_attributes( @$e );
455 @all_witnesses{ keys %$wits } = 1;
456 }
c12bb878 457 my $acstr = $self->ac_label;
458 foreach my $acwit ( grep { $_ =~ s/^(.*)\Q$acstr\E$/$1/ } keys %all_witnesses ) {
459 delete $all_witnesses{$acwit.$acstr} if exists $all_witnesses{$acwit};
460 }
1d310495 461 return keys %all_witnesses;
910a0a6d 462}
463
4e5a7b2c 464=head1 OUTPUT METHODS
8e1394aa 465
0ecb975c 466=head2 as_svg( \%options )
8e1394aa 467
0068967c 468Returns an SVG string that represents the graph, via as_dot and graphviz.
bfcbcecb 469See as_dot for a list of options. Must have GraphViz (dot) installed to run.
8e1394aa 470
471=cut
472
473sub as_svg {
0ecb975c 474 my( $self, $opts ) = @_;
bfcbcecb 475 throw( "Need GraphViz installed to output SVG" )
476 unless File::Which::which( 'dot' );
e247aad1 477 my $want_subgraph = exists $opts->{'from'} || exists $opts->{'to'};
c1915ab9 478 $self->calculate_ranks() unless $self->_graphcalc_done;
e247aad1 479 if( !$self->has_cached_svg || $opts->{'recalc'} || $want_subgraph ) {
b365fbae 480 my @cmd = qw/dot -Tsvg/;
481 my( $svg, $err );
482 my $dotfile = File::Temp->new();
483 ## USE FOR DEBUGGING
484 # $dotfile->unlink_on_destroy(0);
485 binmode $dotfile, ':utf8';
486 print $dotfile $self->as_dot( $opts );
487 push( @cmd, $dotfile->filename );
488 run( \@cmd, ">", binary(), \$svg );
e247aad1 489 $svg = decode_utf8( $svg );
490 $self->cached_svg( $svg ) unless $want_subgraph;
491 return $svg;
492 } else {
493 return $self->cached_svg;
b365fbae 494 }
8e1394aa 495}
496
b22576c6 497
0ecb975c 498=head2 as_dot( \%options )
b22576c6 499
0ecb975c 500Returns a string that is the collation graph expressed in dot
501(i.e. GraphViz) format. Options include:
b22576c6 502
0ecb975c 503=over 4
b22576c6 504
0ecb975c 505=item * from
b22576c6 506
0ecb975c 507=item * to
df6d9812 508
0ecb975c 509=item * color_common
510
511=back
df6d9812 512
513=cut
514
515sub as_dot {
0ecb975c 516 my( $self, $opts ) = @_;
517 my $startrank = $opts->{'from'} if $opts;
518 my $endrank = $opts->{'to'} if $opts;
519 my $color_common = $opts->{'color_common'} if $opts;
b365fbae 520 my $STRAIGHTENHACK = !$startrank && !$endrank && $self->end->rank
521 && $self->end->rank > 100;
522
b22576c6 523 # Check the arguments
524 if( $startrank ) {
525 return if $endrank && $startrank > $endrank;
526 return if $startrank > $self->end->rank;
527 }
528 if( defined $endrank ) {
529 return if $endrank < 0;
f1b3b33a 530 $endrank = undef if $endrank == $self->end->rank;
b22576c6 531 }
532
67da8d6c 533 my $graph_name = $self->tradition->name;
534 $graph_name =~ s/[^\w\s]//g;
535 $graph_name = join( '_', split( /\s+/, $graph_name ) );
f13b5582 536
537 my %graph_attrs = (
538 'rankdir' => 'LR',
539 'bgcolor' => 'none',
540 );
541 my %node_attrs = (
b8990398 542 'fontsize' => 14,
f13b5582 543 'fillcolor' => 'white',
544 'style' => 'filled',
545 'shape' => 'ellipse'
546 );
547 my %edge_attrs = (
548 'arrowhead' => 'open',
549 'color' => '#000000',
550 'fontcolor' => '#000000',
551 );
552
67da8d6c 553 my $dot = sprintf( "digraph %s {\n", $graph_name );
f13b5582 554 $dot .= "\tgraph " . _dot_attr_string( \%graph_attrs ) . ";\n";
555 $dot .= "\tnode " . _dot_attr_string( \%node_attrs ) . ";\n";
df6d9812 556
b22576c6 557 # Output substitute start/end readings if necessary
558 if( $startrank ) {
559 $dot .= "\t\"#SUBSTART#\" [ label=\"...\" ];\n";
560 }
561 if( $endrank ) {
562 $dot .= "\t\"#SUBEND#\" [ label=\"...\" ];\n";
563 }
b365fbae 564 if( $STRAIGHTENHACK ) {
565 ## HACK part 1
566 $dot .= "\tsubgraph { rank=same \"#START#\" \"#SILENT#\" }\n";
567 $dot .= "\t\"#SILENT#\" [ shape=diamond,color=white,penwidth=0,label=\"\" ];"
568 }
b22576c6 569 my %used; # Keep track of the readings that actually appear in the graph
30ddc24c 570 # Sort the readings by rank if we have ranks; this speeds layout.
571 my @all_readings = $self->end->has_rank
572 ? sort { $a->rank <=> $b->rank } $self->readings
573 : $self->readings;
4633f9e4 574 # TODO Refrain from outputting lacuna nodes - just grey out the edges.
30ddc24c 575 foreach my $reading ( @all_readings ) {
b22576c6 576 # Only output readings within our rank range.
577 next if $startrank && $reading->rank < $startrank;
578 next if $endrank && $reading->rank > $endrank;
579 $used{$reading->id} = 1;
910a0a6d 580 # Need not output nodes without separate labels
3a2ebbf4 581 next if $reading->id eq $reading->text;
d4b75f44 582 my $rattrs;
30f0df34 583 my $label = $reading->text;
8f9cab7b 584 $label =~ s/\"/\\\"/g;
d4b75f44 585 $rattrs->{'label'} = $label;
0ecb975c 586 $rattrs->{'fillcolor'} = '#b3f36d' if $reading->is_common && $color_common;
d4b75f44 587 $dot .= sprintf( "\t\"%s\" %s;\n", $reading->id, _dot_attr_string( $rattrs ) );
df6d9812 588 }
3a2ebbf4 589
30ddc24c 590 # Add the real edges. Need to weight one edge per rank jump, in a
591 # continuous line.
b365fbae 592 # my $weighted = $self->_add_edge_weights;
b22576c6 593 my @edges = $self->paths;
3bdec618 594 my( %substart, %subend );
b22576c6 595 foreach my $edge ( @edges ) {
596 # Do we need to output this edge?
508fd430 597 if( $used{$edge->[0]} && $used{$edge->[1]} ) {
027d819c 598 my $label = $self->_path_display_label( $self->path_witnesses( $edge ) );
f13b5582 599 my $variables = { %edge_attrs, 'label' => $label };
30ddc24c 600
b22576c6 601 # Account for the rank gap if necessary
30ddc24c 602 my $rank0 = $self->reading( $edge->[0] )->rank
603 if $self->reading( $edge->[0] )->has_rank;
604 my $rank1 = $self->reading( $edge->[1] )->rank
605 if $self->reading( $edge->[1] )->has_rank;
606 if( defined $rank0 && defined $rank1 && $rank1 - $rank0 > 1 ) {
607 $variables->{'minlen'} = $rank1 - $rank0;
608 }
609
610 # Add the calculated edge weights
b365fbae 611 # if( exists $weighted->{$edge->[0]}
e247aad1 612 # && $weighted->{$edge->[0]} eq $edge->[1] ) {
613 # # $variables->{'color'} = 'red';
614 # $variables->{'weight'} = 3.0;
615 # }
30ddc24c 616
508fd430 617 # EXPERIMENTAL: make edge width reflect no. of witnesses
618 my $extrawidth = scalar( $self->path_witnesses( $edge ) ) * 0.2;
619 $variables->{'penwidth'} = $extrawidth + 0.8; # gives 1 for a single wit
620
f13b5582 621 my $varopts = _dot_attr_string( $variables );
622 $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
623 $edge->[0], $edge->[1], $varopts );
3bdec618 624 } elsif( $used{$edge->[0]} ) {
625 $subend{$edge->[0]} = 1;
626 } elsif( $used{$edge->[1]} ) {
627 $substart{$edge->[1]} = 1;
b22576c6 628 }
df6d9812 629 }
3bdec618 630 # Add substitute start and end edges if necessary
631 foreach my $node ( keys %substart ) {
027d819c 632 my $witstr = $self->_path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) );
f13b5582 633 my $variables = { %edge_attrs, 'label' => $witstr };
634 my $varopts = _dot_attr_string( $variables );
635 $dot .= "\t\"#SUBSTART#\" -> \"$node\" $varopts;";
3bdec618 636 }
637 foreach my $node ( keys %subend ) {
027d819c 638 my $witstr = $self->_path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) );
f13b5582 639 my $variables = { %edge_attrs, 'label' => $witstr };
640 my $varopts = _dot_attr_string( $variables );
641 $dot .= "\t\"$node\" -> \"#SUBEND#\" $varopts;";
3bdec618 642 }
b365fbae 643 # HACK part 2
644 if( $STRAIGHTENHACK ) {
645 $dot .= "\t\"#END#\" -> \"#SILENT#\" [ color=white,penwidth=0 ];\n";
646 }
30ddc24c 647
df6d9812 648 $dot .= "}\n";
649 return $dot;
650}
651
f13b5582 652sub _dot_attr_string {
653 my( $hash ) = @_;
654 my @attrs;
655 foreach my $k ( sort keys %$hash ) {
656 my $v = $hash->{$k};
657 push( @attrs, $k.'="'.$v.'"' );
658 }
659 return( '[ ' . join( ', ', @attrs ) . ' ]' );
660}
661
30ddc24c 662sub _add_edge_weights {
663 my $self = shift;
664 # Walk the graph from START to END, choosing the successor node with
665 # the largest number of witness paths each time.
666 my $weighted = {};
667 my $curr = $self->start->id;
008fc8a6 668 my $ranked = $self->end->has_rank;
30ddc24c 669 while( $curr ne $self->end->id ) {
008fc8a6 670 my $rank = $ranked ? $self->reading( $curr )->rank : 0;
30ddc24c 671 my @succ = sort { $self->path_witnesses( $curr, $a )
672 <=> $self->path_witnesses( $curr, $b ) }
673 $self->sequence->successors( $curr );
674 my $next = pop @succ;
008fc8a6 675 my $nextrank = $ranked ? $self->reading( $next )->rank : 0;
30ddc24c 676 # Try to avoid lacunae in the weighted path.
008fc8a6 677 while( @succ &&
678 ( $self->reading( $next )->is_lacuna ||
679 $nextrank - $rank > 1 ) ){
30ddc24c 680 $next = pop @succ;
681 }
682 $weighted->{$curr} = $next;
683 $curr = $next;
684 }
685 return $weighted;
686}
687
027d819c 688=head2 path_witnesses( $edge )
689
690Returns the list of sigils whose witnesses are associated with the given edge.
691The edge can be passed as either an array or an arrayref of ( $source, $target ).
692
693=cut
694
3a2ebbf4 695sub path_witnesses {
696 my( $self, @edge ) = @_;
697 # If edge is an arrayref, cope.
698 if( @edge == 1 && ref( $edge[0] ) eq 'ARRAY' ) {
699 my $e = shift @edge;
700 @edge = @$e;
701 }
702 my @wits = keys %{$self->sequence->get_edge_attributes( @edge )};
508fd430 703 return @wits;
3a2ebbf4 704}
705
027d819c 706sub _path_display_label {
508fd430 707 my $self = shift;
708 my @wits = sort @_;
8f9cab7b 709 my $maj = scalar( $self->tradition->witnesses ) * 0.6;
710 if( scalar @wits > $maj ) {
f13b5582 711 # TODO break out a.c. wits
8f9cab7b 712 return 'majority';
713 } else {
714 return join( ', ', @wits );
715 }
716}
1dd07bda 717
718=head2 witnesses_at_rank
719
720Returns a list of witnesses that are not lacunose, for a given rank.
721
722=cut
723
724sub witnesses_at_rank {
725 my( $self, $rank ) = @_;
726}
8f9cab7b 727
4e5a7b2c 728=head2 as_graphml
8e1394aa 729
4e5a7b2c 730Returns a GraphML representation of the collation. The GraphML will contain
731two graphs. The first expresses the attributes of the readings and the witness
732paths that link them; the second expresses the relationships that link the
733readings. This is the native transfer format for a tradition.
8e1394aa 734
56eefa04 735=begin testing
736
737use Text::Tradition;
738
739my $READINGS = 311;
740my $PATHS = 361;
741
742my $datafile = 't/data/florilegium_tei_ps.xml';
743my $tradition = Text::Tradition->new( 'input' => 'TEI',
744 'name' => 'test0',
745 'file' => $datafile,
746 'linear' => 1 );
747
748ok( $tradition, "Got a tradition object" );
749is( scalar $tradition->witnesses, 13, "Found all witnesses" );
750ok( $tradition->collation, "Tradition has a collation" );
751
752my $c = $tradition->collation;
753is( scalar $c->readings, $READINGS, "Collation has all readings" );
754is( scalar $c->paths, $PATHS, "Collation has all paths" );
755is( scalar $c->relationships, 0, "Collation has all relationships" );
756
757# Add a few relationships
758$c->add_relationship( 'w123', 'w125', { 'type' => 'collated' } );
759$c->add_relationship( 'w193', 'w196', { 'type' => 'collated' } );
760$c->add_relationship( 'w257', 'w262', { 'type' => 'transposition' } );
761
762# Now write it to GraphML and parse it again.
763
764my $graphml = $c->as_graphml;
765my $st = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml );
766is( scalar $st->collation->readings, $READINGS, "Reparsed collation has all readings" );
767is( scalar $st->collation->paths, $PATHS, "Reparsed collation has all paths" );
768is( scalar $st->collation->relationships, 3, "Reparsed collation has new relationships" );
769
770=end testing
771
8e1394aa 772=cut
773
774sub as_graphml {
3a2ebbf4 775 my( $self ) = @_;
8e1394aa 776
777 # Some namespaces
778 my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns';
779 my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance';
780 my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' .
910a0a6d 781 'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
8e1394aa 782
783 # Create the document and root node
784 my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
785 my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
786 $graphml->setDocumentElement( $root );
787 $root->setNamespace( $xsi_ns, 'xsi', 0 );
788 $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
789
e309421a 790 # Add the data keys for the graph
791 my %graph_data_keys;
792 my $gdi = 0;
1d310495 793 my @graph_attributes = qw/ version wit_list_separator baselabel linear ac_label /;
e309421a 794 foreach my $datum ( @graph_attributes ) {
795 $graph_data_keys{$datum} = 'dg'.$gdi++;
796 my $key = $root->addNewChild( $graphml_ns, 'key' );
797 $key->setAttribute( 'attr.name', $datum );
798 $key->setAttribute( 'attr.type', $key eq 'linear' ? 'boolean' : 'string' );
799 $key->setAttribute( 'for', 'graph' );
800 $key->setAttribute( 'id', $graph_data_keys{$datum} );
801 }
f6066bac 802
8e1394aa 803 # Add the data keys for nodes
ef9d481f 804 my %node_data_keys;
805 my $ndi = 0;
3a2ebbf4 806 my %node_data = (
807 id => 'string',
255875b8 808 text => 'string',
3a2ebbf4 809 rank => 'string',
810 is_start => 'boolean',
811 is_end => 'boolean',
812 is_lacuna => 'boolean',
813 );
814 foreach my $datum ( keys %node_data ) {
910a0a6d 815 $node_data_keys{$datum} = 'dn'.$ndi++;
816 my $key = $root->addNewChild( $graphml_ns, 'key' );
817 $key->setAttribute( 'attr.name', $datum );
3a2ebbf4 818 $key->setAttribute( 'attr.type', $node_data{$datum} );
910a0a6d 819 $key->setAttribute( 'for', 'node' );
820 $key->setAttribute( 'id', $node_data_keys{$datum} );
8e1394aa 821 }
822
df6d9812 823 # Add the data keys for edges, i.e. witnesses
ef9d481f 824 my $edi = 0;
825 my %edge_data_keys;
3a2ebbf4 826 my %edge_data = (
f523c7a8 827 class => 'string', # Class, deprecated soon
3a2ebbf4 828 witness => 'string', # ID/label for a path
829 relationship => 'string', # ID/label for a relationship
830 extra => 'boolean', # Path key
c84275ff 831 scope => 'string', # Relationship key
3a2ebbf4 832 non_correctable => 'boolean', # Relationship key
833 non_independent => 'boolean', # Relationship key
834 );
835 foreach my $datum ( keys %edge_data ) {
836 $edge_data_keys{$datum} = 'de'.$edi++;
910a0a6d 837 my $key = $root->addNewChild( $graphml_ns, 'key' );
3a2ebbf4 838 $key->setAttribute( 'attr.name', $datum );
839 $key->setAttribute( 'attr.type', $edge_data{$datum} );
910a0a6d 840 $key->setAttribute( 'for', 'edge' );
3a2ebbf4 841 $key->setAttribute( 'id', $edge_data_keys{$datum} );
8e1394aa 842 }
3a2ebbf4 843
22222af9 844 # Add the collation graph itself
2c669bca 845 my $sgraph = $root->addNewChild( $graphml_ns, 'graph' );
846 $sgraph->setAttribute( 'edgedefault', 'directed' );
847 $sgraph->setAttribute( 'id', $self->tradition->name );
848 $sgraph->setAttribute( 'parse.edgeids', 'canonical' );
849 $sgraph->setAttribute( 'parse.edges', scalar($self->paths) );
850 $sgraph->setAttribute( 'parse.nodeids', 'canonical' );
851 $sgraph->setAttribute( 'parse.nodes', scalar($self->readings) );
852 $sgraph->setAttribute( 'parse.order', 'nodesfirst' );
22222af9 853
e309421a 854 # Collation attribute data
855 foreach my $datum ( @graph_attributes ) {
2c669bca 856 my $value = $datum eq 'version' ? '3.0' : $self->$datum;
857 _add_graphml_data( $sgraph, $graph_data_keys{$datum}, $value );
e309421a 858 }
8e1394aa 859
860 my $node_ctr = 0;
861 my %node_hash;
22222af9 862 # Add our readings to the graph
3a2ebbf4 863 foreach my $n ( sort { $a->id cmp $b->id } $self->readings ) {
2c669bca 864 # Add to the main graph
865 my $node_el = $sgraph->addNewChild( $graphml_ns, 'node' );
910a0a6d 866 my $node_xmlid = 'n' . $node_ctr++;
3a2ebbf4 867 $node_hash{ $n->id } = $node_xmlid;
910a0a6d 868 $node_el->setAttribute( 'id', $node_xmlid );
255875b8 869 foreach my $d ( keys %node_data ) {
870 my $nval = $n->$d;
871 _add_graphml_data( $node_el, $node_data_keys{$d}, $nval )
872 if defined $nval;
873 }
b15511bf 874 }
875
2c669bca 876 # Add the path edges to the sequence graph
df6d9812 877 my $edge_ctr = 0;
3a2ebbf4 878 foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->sequence->edges() ) {
879 # We add an edge in the graphml for every witness in $e.
508fd430 880 foreach my $wit ( sort $self->path_witnesses( $e ) ) {
3a2ebbf4 881 my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
882 $node_hash{ $e->[0] },
883 $node_hash{ $e->[1] } );
2c669bca 884 my $edge_el = $sgraph->addNewChild( $graphml_ns, 'edge' );
3a2ebbf4 885 $edge_el->setAttribute( 'source', $from );
886 $edge_el->setAttribute( 'target', $to );
887 $edge_el->setAttribute( 'id', $id );
3a2ebbf4 888
889 # It's a witness path, so add the witness
890 my $base = $wit;
891 my $key = $edge_data_keys{'witness'};
892 # Is this an ante-corr witness?
893 my $aclabel = $self->ac_label;
894 if( $wit =~ /^(.*)\Q$aclabel\E$/ ) {
895 # Keep the base witness
896 $base = $1;
897 # ...and record that this is an 'extra' reading path
898 _add_graphml_data( $edge_el, $edge_data_keys{'extra'}, $aclabel );
899 }
900 _add_graphml_data( $edge_el, $edge_data_keys{'witness'}, $base );
f523c7a8 901 _add_graphml_data( $edge_el, $edge_data_keys{'class'}, 'path' );
3a2ebbf4 902 }
903 }
904
22222af9 905 # Add the relationship graph to the XML
027d819c 906 $self->relations->_as_graphml( $graphml_ns, $root, \%node_hash,
2626f709 907 $node_data_keys{'id'}, \%edge_data_keys );
8e1394aa 908
94c00c71 909 # Save and return the thing
910 my $result = decode_utf8( $graphml->toString(1) );
94c00c71 911 return $result;
df6d9812 912}
913
b15511bf 914sub _add_graphml_data {
915 my( $el, $key, $value ) = @_;
b15511bf 916 return unless defined $value;
c9bf3dbf 917 my $data_el = $el->addNewChild( $el->namespaceURI, 'data' );
b15511bf 918 $data_el->setAttribute( 'key', $key );
919 $data_el->appendText( $value );
8e1394aa 920}
921
4e5a7b2c 922=head2 as_csv
910a0a6d 923
924Returns a CSV alignment table representation of the collation graph, one
2c669bca 925row per witness (or witness uncorrected.)
910a0a6d 926
927=cut
928
929sub as_csv {
3a2ebbf4 930 my( $self ) = @_;
1dd07bda 931 my $table = $self->alignment_table;
910a0a6d 932 my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } );
933 my @result;
2c669bca 934 # Make the header row
935 $csv->combine( map { $_->{'witness'} } @{$table->{'alignment'}} );
936 push( @result, decode_utf8( $csv->string ) );
937 # Make the rest of the rows
938 foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
566f4595 939 my @rowobjs = map { $_->{'tokens'}->[$idx] } @{$table->{'alignment'}};
1dd07bda 940 my @row = map { $_ ? $_->{'t'}->text : $_ } @rowobjs;
2c669bca 941 $csv->combine( @row );
910a0a6d 942 push( @result, decode_utf8( $csv->string ) );
943 }
3a2ebbf4 944 return join( "\n", @result );
910a0a6d 945}
946
1dd07bda 947=head2 alignment_table( $use_refs, $include_witnesses )
2c669bca 948
566f4595 949Return a reference to an alignment table, in a slightly enhanced CollateX
950format which looks like this:
951
952 $table = { alignment => [ { witness => "SIGIL",
4e5a7b2c 953 tokens => [ { t => "TEXT" }, ... ] },
566f4595 954 { witness => "SIG2",
4e5a7b2c 955 tokens => [ { t => "TEXT" }, ... ] },
566f4595 956 ... ],
957 length => TEXTLEN };
958
959If $use_refs is set to 1, the reading object is returned in the table
960instead of READINGTEXT; if not, the text of the reading is returned.
4e5a7b2c 961
962If $include_witnesses is set to a hashref, only the witnesses whose sigil
566f4595 963keys have a true hash value will be included.
2c669bca 964
965=cut
9f3ba6f7 966
1dd07bda 967sub alignment_table {
968 my( $self ) = @_;
c1915ab9 969 $self->calculate_ranks() unless $self->_graphcalc_done;
1dd07bda 970 return $self->cached_table if $self->has_cached_table;
971
0ecb975c 972 # Make sure we can do this
973 throw( "Need a linear graph in order to make an alignment table" )
974 unless $self->linear;
975 $self->calculate_ranks unless $self->end->has_rank;
976
2c669bca 977 my $table = { 'alignment' => [], 'length' => $self->end->rank - 1 };
3a2ebbf4 978 my @all_pos = ( 1 .. $self->end->rank - 1 );
68454b71 979 foreach my $wit ( sort { $a->sigil cmp $b->sigil } $self->tradition->witnesses ) {
eca16057 980 # print STDERR "Making witness row(s) for " . $wit->sigil . "\n";
1f7aa795 981 my @wit_path = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
1dd07bda 982 my @row = _make_witness_row( \@wit_path, \@all_pos );
2c669bca 983 push( @{$table->{'alignment'}},
984 { 'witness' => $wit->sigil, 'tokens' => \@row } );
1f7aa795 985 if( $wit->is_layered ) {
986 my @wit_ac_path = $self->reading_sequence( $self->start, $self->end,
861c3e27 987 $wit->sigil.$self->ac_label );
1dd07bda 988 my @ac_row = _make_witness_row( \@wit_ac_path, \@all_pos );
2c669bca 989 push( @{$table->{'alignment'}},
990 { 'witness' => $wit->sigil.$self->ac_label, 'tokens' => \@ac_row } );
910a0a6d 991 }
992 }
1dd07bda 993 $self->cached_table( $table );
994 return $table;
910a0a6d 995}
996
997sub _make_witness_row {
1dd07bda 998 my( $path, $positions ) = @_;
910a0a6d 999 my %char_hash;
1000 map { $char_hash{$_} = undef } @$positions;
2c669bca 1001 my $debug = 0;
910a0a6d 1002 foreach my $rdg ( @$path ) {
eca16057 1003 my $rtext = $rdg->text;
1004 $rtext = '#LACUNA#' if $rdg->is_lacuna;
2c669bca 1005 print STDERR "rank " . $rdg->rank . "\n" if $debug;
3a2ebbf4 1006 # print STDERR "No rank for " . $rdg->id . "\n" unless defined $rdg->rank;
1dd07bda 1007 $char_hash{$rdg->rank} = { 't' => $rdg };
910a0a6d 1008 }
1009 my @row = map { $char_hash{$_} } @$positions;
eca16057 1010 # Fill in lacuna markers for undef spots in the row
1011 my $last_el = shift @row;
1012 my @filled_row = ( $last_el );
1013 foreach my $el ( @row ) {
0e476982 1014 # If we are using node reference, make the lacuna node appear many times
1015 # in the table. If not, use the lacuna tag.
1dd07bda 1016 if( $last_el && $last_el->{'t'}->is_lacuna && !defined $el ) {
1017 $el = $last_el;
eca16057 1018 }
1019 push( @filled_row, $el );
1020 $last_el = $el;
1021 }
1022 return @filled_row;
910a0a6d 1023}
1024
4e5a7b2c 1025=head1 NAVIGATION METHODS
910a0a6d 1026
4e5a7b2c 1027=head2 reading_sequence( $first, $last, $sigil, $backup )
e2902068 1028
1029Returns the ordered list of readings, starting with $first and ending
4e5a7b2c 1030with $last, for the witness given in $sigil. If a $backup sigil is
1031specified (e.g. when walking a layered witness), it will be used wherever
1032no $sigil path exists. If there is a base text reading, that will be
1033used wherever no path exists for $sigil or $backup.
e2902068 1034
1035=cut
1036
910a0a6d 1037# TODO Think about returning some lazy-eval iterator.
b0b4421a 1038# TODO Get rid of backup; we should know from what witness is whether we need it.
910a0a6d 1039
e2902068 1040sub reading_sequence {
861c3e27 1041 my( $self, $start, $end, $witness ) = @_;
e2902068 1042
930ff666 1043 $witness = $self->baselabel unless $witness;
e2902068 1044 my @readings = ( $start );
1045 my %seen;
1046 my $n = $start;
3a2ebbf4 1047 while( $n && $n->id ne $end->id ) {
1048 if( exists( $seen{$n->id} ) ) {
63778331 1049 throw( "Detected loop for $witness at " . $n->id );
910a0a6d 1050 }
3a2ebbf4 1051 $seen{$n->id} = 1;
910a0a6d 1052
861c3e27 1053 my $next = $self->next_reading( $n, $witness );
44771cf2 1054 unless( $next ) {
63778331 1055 throw( "Did not find any path for $witness from reading " . $n->id );
44771cf2 1056 }
910a0a6d 1057 push( @readings, $next );
1058 $n = $next;
e2902068 1059 }
1060 # Check that the last reading is our end reading.
1061 my $last = $readings[$#readings];
63778331 1062 throw( "Last reading found from " . $start->text .
1063 " for witness $witness is not the end!" ) # TODO do we get this far?
3a2ebbf4 1064 unless $last->id eq $end->id;
e2902068 1065
1066 return @readings;
1067}
1068
4e5a7b2c 1069=head2 next_reading( $reading, $sigil );
8e1394aa 1070
4a8828f0 1071Returns the reading that follows the given reading along the given witness
930ff666 1072path.
8e1394aa 1073
1074=cut
1075
4a8828f0 1076sub next_reading {
e2902068 1077 # Return the successor via the corresponding path.
8e1394aa 1078 my $self = shift;
3a2ebbf4 1079 my $answer = $self->_find_linked_reading( 'next', @_ );
2c669bca 1080 return undef unless $answer;
3a2ebbf4 1081 return $self->reading( $answer );
8e1394aa 1082}
1083
4e5a7b2c 1084=head2 prior_reading( $reading, $sigil )
8e1394aa 1085
4a8828f0 1086Returns the reading that precedes the given reading along the given witness
930ff666 1087path.
8e1394aa 1088
1089=cut
1090
4a8828f0 1091sub prior_reading {
e2902068 1092 # Return the predecessor via the corresponding path.
8e1394aa 1093 my $self = shift;
3a2ebbf4 1094 my $answer = $self->_find_linked_reading( 'prior', @_ );
1095 return $self->reading( $answer );
8e1394aa 1096}
1097
4a8828f0 1098sub _find_linked_reading {
861c3e27 1099 my( $self, $direction, $node, $path ) = @_;
1100
1101 # Get a backup if we are dealing with a layered witness
1102 my $alt_path;
1103 my $aclabel = $self->ac_label;
1104 if( $path && $path =~ /^(.*)\Q$aclabel\E$/ ) {
1105 $alt_path = $1;
1106 }
1107
e2902068 1108 my @linked_paths = $direction eq 'next'
3a2ebbf4 1109 ? $self->sequence->edges_from( $node )
1110 : $self->sequence->edges_to( $node );
e2902068 1111 return undef unless scalar( @linked_paths );
8e1394aa 1112
e2902068 1113 # We have to find the linked path that contains all of the
1114 # witnesses supplied in $path.
1115 my( @path_wits, @alt_path_wits );
4e5a7b2c 1116 @path_wits = sort( $self->_witnesses_of_label( $path ) ) if $path;
1117 @alt_path_wits = sort( $self->_witnesses_of_label( $alt_path ) ) if $alt_path;
e2902068 1118 my $base_le;
1119 my $alt_le;
1120 foreach my $le ( @linked_paths ) {
3a2ebbf4 1121 if( $self->sequence->has_edge_attribute( @$le, $self->baselabel ) ) {
910a0a6d 1122 $base_le = $le;
910a0a6d 1123 }
508fd430 1124 my @le_wits = sort $self->path_witnesses( $le );
3a2ebbf4 1125 if( _is_within( \@path_wits, \@le_wits ) ) {
1126 # This is the right path.
1127 return $direction eq 'next' ? $le->[1] : $le->[0];
1128 } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) {
1129 $alt_le = $le;
1130 }
8e1394aa 1131 }
e2902068 1132 # Got this far? Return the alternate path if it exists.
3a2ebbf4 1133 return $direction eq 'next' ? $alt_le->[1] : $alt_le->[0]
910a0a6d 1134 if $alt_le;
e2902068 1135
1136 # Got this far? Return the base path if it exists.
3a2ebbf4 1137 return $direction eq 'next' ? $base_le->[1] : $base_le->[0]
910a0a6d 1138 if $base_le;
e2902068 1139
1140 # Got this far? We have no appropriate path.
2c669bca 1141 warn "Could not find $direction node from " . $node->id
910a0a6d 1142 . " along path $path";
8e1394aa 1143 return undef;
1144}
1145
4a8828f0 1146# Some set logic.
1147sub _is_within {
1148 my( $set1, $set2 ) = @_;
7854e12e 1149 my $ret = @$set1; # will be 0, i.e. false, if set1 is empty
4a8828f0 1150 foreach my $el ( @$set1 ) {
910a0a6d 1151 $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
4a8828f0 1152 }
1153 return $ret;
1154}
1155
4e5a7b2c 1156# Return the string that joins together a list of witnesses for
1157# display on a single path.
1158sub _witnesses_of_label {
1159 my( $self, $label ) = @_;
1160 my $regex = $self->wit_list_separator;
1161 my @answer = split( /\Q$regex\E/, $label );
1162 return @answer;
b0b4421a 1163}
1164
d4b75f44 1165=head2 common_readings
1166
1167Returns the list of common readings in the graph (i.e. those readings that are
1168shared by all non-lacunose witnesses.)
1169
1170=cut
1171
1172sub common_readings {
1173 my $self = shift;
1174 my @common = grep { $_->is_common } $self->readings;
1175 return @common;
1176}
1177
b0b4421a 1178=head2 path_text( $sigil, $mainsigil [, $start, $end ] )
1179
1180Returns the text of a witness (plus its backup, if we are using a layer)
1181as stored in the collation. The text is returned as a string, where the
1182individual readings are joined with spaces and the meta-readings (e.g.
1183lacunae) are omitted. Optional specification of $start and $end allows
1184the generation of a subset of the witness text.
4e5a7b2c 1185
b0b4421a 1186=cut
1187
1188sub path_text {
861c3e27 1189 my( $self, $wit, $start, $end ) = @_;
b0b4421a 1190 $start = $self->start unless $start;
1191 $end = $self->end unless $end;
861c3e27 1192 my @path = grep { !$_->is_meta } $self->reading_sequence( $start, $end, $wit );
b0b4421a 1193 return join( ' ', map { $_->text } @path );
1194}
4e5a7b2c 1195
1196=head1 INITIALIZATION METHODS
1197
1198These are mostly for use by parsers.
1199
1200=head2 make_witness_path( $witness )
1201
1202Link the array of readings contained in $witness->path (and in
1203$witness->uncorrected_path if it exists) into collation paths.
1204Clear out the arrays when finished.
de51424a 1205
4e5a7b2c 1206=head2 make_witness_paths
1207
1208Call make_witness_path for all witnesses in the tradition.
1209
1210=cut
930ff666 1211
7e450e44 1212# For use when a collation is constructed from a base text and an apparatus.
1213# We have the sequences of readings and just need to add path edges.
1f7aa795 1214# When we are done, clear out the witness path attributes, as they are no
1215# longer needed.
1216# TODO Find a way to replace the witness path attributes with encapsulated functions?
e2902068 1217
6a222840 1218sub make_witness_paths {
1219 my( $self ) = @_;
910a0a6d 1220 foreach my $wit ( $self->tradition->witnesses ) {
0068967c 1221 # print STDERR "Making path for " . $wit->sigil . "\n";
910a0a6d 1222 $self->make_witness_path( $wit );
7854e12e 1223 }
7854e12e 1224}
1225
6a222840 1226sub make_witness_path {
7854e12e 1227 my( $self, $wit ) = @_;
1228 my @chain = @{$wit->path};
15d2d3df 1229 my $sig = $wit->sigil;
7854e12e 1230 foreach my $idx ( 0 .. $#chain-1 ) {
910a0a6d 1231 $self->add_path( $chain[$idx], $chain[$idx+1], $sig );
7854e12e 1232 }
1f7aa795 1233 if( $wit->is_layered ) {
d9e873d0 1234 @chain = @{$wit->uncorrected_path};
1235 foreach my $idx( 0 .. $#chain-1 ) {
1236 my $source = $chain[$idx];
1237 my $target = $chain[$idx+1];
1238 $self->add_path( $source, $target, $sig.$self->ac_label )
1239 unless $self->has_path( $source, $target, $sig );
1240 }
15d2d3df 1241 }
1f7aa795 1242 $wit->clear_path;
1243 $wit->clear_uncorrected_path;
e2902068 1244}
1245
4e5a7b2c 1246=head2 calculate_ranks
1247
1248Calculate the reading ranks (that is, their aligned positions relative
1249to each other) for the graph. This can only be called on linear collations.
1250
b365fbae 1251=begin testing
1252
1253use Text::Tradition;
1254
1255my $cxfile = 't/data/Collatex-16.xml';
1256my $t = Text::Tradition->new(
1257 'name' => 'inline',
1258 'input' => 'CollateX',
1259 'file' => $cxfile,
1260 );
1261my $c = $t->collation;
1262
1263# Make an svg
bfcbcecb 1264my $table = $c->alignment_table;
1265ok( $c->has_cached_table, "Alignment table was cached" );
1266is( $c->alignment_table, $table, "Cached table returned upon second call" );
b365fbae 1267$c->calculate_ranks;
bfcbcecb 1268is( $c->alignment_table, $table, "Cached table retained with no rank change" );
b365fbae 1269$c->add_relationship( 'n9', 'n23', { 'type' => 'spelling' } );
bfcbcecb 1270isnt( $c->alignment_table, $table, "Alignment table changed after relationship add" );
b365fbae 1271
1272=end testing
1273
4e5a7b2c 1274=cut
1275
910a0a6d 1276sub calculate_ranks {
1277 my $self = shift;
b365fbae 1278 # Save the existing ranks, in case we need to invalidate the cached SVG.
1279 my %existing_ranks;
910a0a6d 1280 # Walk a version of the graph where every node linked by a relationship
1281 # edge is fundamentally the same node, and do a topological ranking on
1282 # the nodes in this graph.
c9bf3dbf 1283 my $topo_graph = Graph->new();
910a0a6d 1284 my %rel_containers;
1285 my $rel_ctr = 0;
1286 # Add the nodes
1287 foreach my $r ( $self->readings ) {
3a2ebbf4 1288 next if exists $rel_containers{$r->id};
910a0a6d 1289 my @rels = $r->related_readings( 'colocated' );
1290 if( @rels ) {
1291 # Make a relationship container.
1292 push( @rels, $r );
c9bf3dbf 1293 my $rn = 'rel_container_' . $rel_ctr++;
1294 $topo_graph->add_vertex( $rn );
910a0a6d 1295 foreach( @rels ) {
3a2ebbf4 1296 $rel_containers{$_->id} = $rn;
910a0a6d 1297 }
1298 } else {
1299 # Add a new node to mirror the old node.
3a2ebbf4 1300 $rel_containers{$r->id} = $r->id;
1301 $topo_graph->add_vertex( $r->id );
910a0a6d 1302 }
4a8828f0 1303 }
3a1f2523 1304
3a2ebbf4 1305 # Add the edges.
910a0a6d 1306 foreach my $r ( $self->readings ) {
b365fbae 1307 $existing_ranks{$r} = $r->rank;
3a2ebbf4 1308 foreach my $n ( $self->sequence->successors( $r->id ) ) {
1309 my( $tfrom, $tto ) = ( $rel_containers{$r->id},
1310 $rel_containers{$n} );
4e5a7b2c 1311 # $DB::single = 1 unless $tfrom && $tto;
3a2ebbf4 1312 $topo_graph->add_edge( $tfrom, $tto );
910a0a6d 1313 }
1314 }
1315
1316 # Now do the rankings, starting with the start node.
3a2ebbf4 1317 my $topo_start = $rel_containers{$self->start->id};
c9bf3dbf 1318 my $node_ranks = { $topo_start => 0 };
910a0a6d 1319 my @curr_origin = ( $topo_start );
1320 # A little iterative function.
1321 while( @curr_origin ) {
c9bf3dbf 1322 @curr_origin = _assign_rank( $topo_graph, $node_ranks, @curr_origin );
910a0a6d 1323 }
1324 # Transfer our rankings from the topological graph to the real one.
1325 foreach my $r ( $self->readings ) {
3a2ebbf4 1326 if( defined $node_ranks->{$rel_containers{$r->id}} ) {
1327 $r->rank( $node_ranks->{$rel_containers{$r->id}} );
67da8d6c 1328 } else {
63778331 1329 # Die. Find the last rank we calculated.
1330 my @all_defined = sort { $node_ranks->{$rel_containers{$a->id}}
1331 <=> $node_ranks->{$rel_containers{$b->id}} }
1332 $self->readings;
1333 my $last = pop @all_defined;
1334 throw( "Ranks not calculated after $last - do you have a cycle in the graph?" );
67da8d6c 1335 }
de51424a 1336 }
bfcbcecb 1337 # Do we need to invalidate the cached data?
1338 if( $self->has_cached_svg || $self->has_cached_table ) {
b365fbae 1339 foreach my $r ( $self->readings ) {
1340 next if $existing_ranks{$r} == $r->rank;
c1915ab9 1341 # Something has changed, so clear the cache
bfcbcecb 1342 $self->_clear_cache;
c1915ab9 1343 # ...and recalculate the common readings.
1344 $self->calculate_common_readings();
b365fbae 1345 last;
1346 }
1347 }
c1915ab9 1348 # The graph calculation information is now up to date.
1349 $self->_graphcalc_done(1);
8e1394aa 1350}
3a1f2523 1351
910a0a6d 1352sub _assign_rank {
c9bf3dbf 1353 my( $graph, $node_ranks, @current_nodes ) = @_;
910a0a6d 1354 # Look at each of the children of @current_nodes. If all the child's
1355 # parents have a rank, assign it the highest rank + 1 and add it to
c9bf3dbf 1356 # @next_nodes. Otherwise skip it; we will return when the highest-ranked
1357 # parent gets a rank.
910a0a6d 1358 my @next_nodes;
1359 foreach my $c ( @current_nodes ) {
c9bf3dbf 1360 warn "Current reading $c has no rank!"
1361 unless exists $node_ranks->{$c};
1362 # print STDERR "Looking at child of node $c, rank "
1363 # . $node_ranks->{$c} . "\n";
1364 foreach my $child ( $graph->successors( $c ) ) {
1365 next if exists $node_ranks->{$child};
910a0a6d 1366 my $highest_rank = -1;
1367 my $skip = 0;
c9bf3dbf 1368 foreach my $parent ( $graph->predecessors( $child ) ) {
1369 if( exists $node_ranks->{$parent} ) {
1370 $highest_rank = $node_ranks->{$parent}
1371 if $highest_rank <= $node_ranks->{$parent};
910a0a6d 1372 } else {
1373 $skip = 1;
1374 last;
1375 }
1376 }
1377 next if $skip;
c9bf3dbf 1378 my $c_rank = $highest_rank + 1;
1379 # print STDERR "Assigning rank $c_rank to node $child \n";
1380 $node_ranks->{$child} = $c_rank;
910a0a6d 1381 push( @next_nodes, $child );
1382 }
1383 }
1384 return @next_nodes;
4cdd82f1 1385}
910a0a6d 1386
c1915ab9 1387sub _clear_cache {
1388 my $self = shift;
1389 $self->wipe_svg if $self->has_cached_svg;
1390 $self->wipe_table if $self->has_cached_table;
1391}
1392
1393
4e5a7b2c 1394=head2 flatten_ranks
1395
1396A convenience method for parsing collation data. Searches the graph for readings
1397with the same text at the same rank, and merges any that are found.
1398
1399=cut
1400
0e476982 1401sub flatten_ranks {
1402 my $self = shift;
1403 my %unique_rank_rdg;
1404 foreach my $rdg ( $self->readings ) {
1405 next unless $rdg->has_rank;
1406 my $key = $rdg->rank . "||" . $rdg->text;
1407 if( exists $unique_rank_rdg{$key} ) {
1408 # Combine!
56eefa04 1409 # print STDERR "Combining readings at same rank: $key\n";
0e476982 1410 $self->merge_readings( $unique_rank_rdg{$key}, $rdg );
0ecb975c 1411 # TODO see if this now makes a common point.
0e476982 1412 } else {
1413 $unique_rank_rdg{$key} = $rdg;
1414 }
1415 }
1416}
1417
4633f9e4 1418=head2 remove_collations
1419
1420Another convenience method for parsing. Removes all 'collation' relationships
1421that were defined in order to get the reading ranks to be correct.
1422
1423=begin testing
1424
1425use Text::Tradition;
1426
1427my $cxfile = 't/data/Collatex-16.xml';
1428my $t = Text::Tradition->new(
1429 'name' => 'inline',
1430 'input' => 'CollateX',
1431 'file' => $cxfile,
1432 );
1433my $c = $t->collation;
1434
1435isnt( $c->reading('n23')->rank, $c->reading('n9')->rank, "Rank skew exists" );
1436$c->add_relationship( 'n23', 'n9', { 'type' => 'collated', 'scope' => 'local' } );
1437is( scalar $c->relationships, 4, "Found all expected relationships" );
1438$c->remove_collations;
1439is( scalar $c->relationships, 3, "Collated relationships now gone" );
1440is( $c->reading('n23')->rank, $c->reading('n9')->rank, "Aligned ranks were preserved" );
1441
1442=end testing
1443
1444=cut
1445
1446sub remove_collations {
1447 my $self = shift;
1448 foreach my $reledge ( $self->relationships ) {
1449 my $relobj = $self->relations->get_relationship( $reledge );
1450 if( $relobj && $relobj->type eq 'collated' ) {
1451 $self->relations->delete_relationship( $reledge );
1452 }
1453 }
1454}
1455
1456
d4b75f44 1457=head2 calculate_common_readings
1458
1459Goes through the graph identifying the readings that appear in every witness
1460(apart from those with lacunae at that spot.) Marks them as common and returns
1461the list.
1462
1463=begin testing
1464
1465use Text::Tradition;
1466
1467my $cxfile = 't/data/Collatex-16.xml';
1468my $t = Text::Tradition->new(
1469 'name' => 'inline',
1470 'input' => 'CollateX',
1471 'file' => $cxfile,
1472 );
1473my $c = $t->collation;
1474
1475my @common = $c->calculate_common_readings();
1476is( scalar @common, 8, "Found correct number of common readings" );
1477my @marked = sort $c->common_readings();
1478is( scalar @common, 8, "All common readings got marked as such" );
1479my @expected = qw/ n1 n12 n16 n19 n20 n5 n6 n7 /;
1480is_deeply( \@marked, \@expected, "Found correct list of common readings" );
1481
1482=end testing
1483
1484=cut
1485
1486sub calculate_common_readings {
1487 my $self = shift;
1488 my @common;
c1915ab9 1489 map { $_->is_common( 0 ) } $self->readings;
1490 # Implicitly calls calculate_ranks
1dd07bda 1491 my $table = $self->alignment_table;
d4b75f44 1492 foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
7f52eac8 1493 my @row = map { $_->{'tokens'}->[$idx]
1494 ? $_->{'tokens'}->[$idx]->{'t'} : '' }
1495 @{$table->{'alignment'}};
d4b75f44 1496 my %hash;
1497 foreach my $r ( @row ) {
1498 if( $r ) {
1499 $hash{$r->id} = $r unless $r->is_meta;
1500 } else {
1501 $hash{'UNDEF'} = $r;
1502 }
1503 }
1504 if( keys %hash == 1 && !exists $hash{'UNDEF'} ) {
1505 my( $r ) = values %hash;
1506 $r->is_common( 1 );
1507 push( @common, $r );
1508 }
1509 }
1510 return @common;
1511}
1512
861c3e27 1513=head2 text_from_paths
1514
1515Calculate the text array for all witnesses from the path, for later consistency
1516checking. Only to be used if there is no non-graph-based way to know the
1517original texts.
1518
1519=cut
1520
1521sub text_from_paths {
1522 my $self = shift;
1523 foreach my $wit ( $self->tradition->witnesses ) {
1524 my @text = split( /\s+/,
1525 $self->reading_sequence( $self->start, $self->end, $wit->sigil ) );
1526 $wit->text( \@text );
1527 if( $wit->is_layered ) {
1528 my @uctext = split( /\s+/,
1529 $self->reading_sequence( $self->start, $self->end,
1530 $wit->sigil.$self->ac_label ) );
1531 $wit->text( \@uctext );
1532 }
1533 }
1534}
0e476982 1535
4e5a7b2c 1536=head1 UTILITY FUNCTIONS
1537
1538=head2 common_predecessor( $reading_a, $reading_b )
8e1394aa 1539
4e5a7b2c 1540Find the last reading that occurs in sequence before both the given readings.
1541
1542=head2 common_successor( $reading_a, $reading_b )
1543
1544Find the first reading that occurs in sequence after both the given readings.
1545
22222af9 1546=begin testing
1547
1548use Text::Tradition;
1549
1550my $cxfile = 't/data/Collatex-16.xml';
1551my $t = Text::Tradition->new(
1552 'name' => 'inline',
1553 'input' => 'CollateX',
1554 'file' => $cxfile,
1555 );
1556my $c = $t->collation;
1557
4e5a7b2c 1558is( $c->common_predecessor( 'n9', 'n23' )->id,
22222af9 1559 'n20', "Found correct common predecessor" );
4e5a7b2c 1560is( $c->common_successor( 'n9', 'n23' )->id,
22222af9 1561 '#END#', "Found correct common successor" );
1562
4e5a7b2c 1563is( $c->common_predecessor( 'n19', 'n17' )->id,
22222af9 1564 'n16', "Found correct common predecessor for readings on same path" );
4e5a7b2c 1565is( $c->common_successor( 'n21', 'n26' )->id,
22222af9 1566 '#END#', "Found correct common successor for readings on same path" );
1567
1568=end testing
1569
1570=cut
1571
1572## Return the closest reading that is a predecessor of both the given readings.
1573sub common_predecessor {
1574 my $self = shift;
4e5a7b2c 1575 my( $r1, $r2 ) = $self->_objectify_args( @_ );
027d819c 1576 return $self->_common_in_path( $r1, $r2, 'predecessors' );
22222af9 1577}
1578
1579sub common_successor {
1580 my $self = shift;
4e5a7b2c 1581 my( $r1, $r2 ) = $self->_objectify_args( @_ );
027d819c 1582 return $self->_common_in_path( $r1, $r2, 'successors' );
22222af9 1583}
1584
027d819c 1585sub _common_in_path {
22222af9 1586 my( $self, $r1, $r2, $dir ) = @_;
1587 my $iter = $r1->rank > $r2->rank ? $r1->rank : $r2->rank;
1588 $iter = $self->end->rank - $iter if $dir eq 'successors';
1589 my @candidates;
1590 my @last_checked = ( $r1, $r2 );
1591 my %all_seen;
1592 while( !@candidates ) {
1593 my @new_lc;
1594 foreach my $lc ( @last_checked ) {
1595 foreach my $p ( $lc->$dir ) {
1596 if( $all_seen{$p->id} ) {
1597 push( @candidates, $p );
1598 } else {
1599 $all_seen{$p->id} = 1;
1600 push( @new_lc, $p );
1601 }
1602 }
1603 }
1604 @last_checked = @new_lc;
1605 }
1606 my @answer = sort { $a->rank <=> $b->rank } @candidates;
1607 return $dir eq 'predecessors' ? pop( @answer ) : shift ( @answer );
1608}
1609
63778331 1610sub throw {
1611 Text::Tradition::Error->throw(
1612 'ident' => 'Collation error',
1613 'message' => $_[0],
1614 );
1615}
1616
dd3b58b0 1617no Moose;
1618__PACKAGE__->meta->make_immutable;
e867486f 1619
027d819c 1620=head1 LICENSE
e867486f 1621
027d819c 1622This package is free software and is provided "as is" without express
1623or implied warranty. You can redistribute it and/or modify it under
1624the same terms as Perl itself.
e867486f 1625
027d819c 1626=head1 AUTHOR
e867486f 1627
027d819c 1628Tara L Andrews E<lt>aurum@cpan.orgE<gt>