remove redundant a.c. witnesses from list
[scpubgit/stemmatology.git] / lib / Text / Tradition / Collation.pm
CommitLineData
dd3b58b0 1package Text::Tradition::Collation;
d047cd52 2
910a0a6d 3use Encode qw( decode_utf8 );
4use File::Temp;
c9bf3dbf 5use Graph;
8e1394aa 6use IPC::Run qw( run binary );
910a0a6d 7use Text::CSV_XS;
b15511bf 8use Text::Tradition::Collation::Reading;
22222af9 9use Text::Tradition::Collation::RelationshipStore;
63778331 10use Text::Tradition::Error;
df6d9812 11use XML::LibXML;
a344ffcf 12use XML::LibXML::XPathContext;
dd3b58b0 13use Moose;
14
3a2ebbf4 15has 'sequence' => (
d047cd52 16 is => 'ro',
3a2ebbf4 17 isa => 'Graph',
18 default => sub { Graph->new() },
d047cd52 19 handles => {
3a2ebbf4 20 paths => 'edges',
d047cd52 21 },
d047cd52 22 );
3a2ebbf4 23
24has 'relations' => (
25 is => 'ro',
22222af9 26 isa => 'Text::Tradition::Collation::RelationshipStore',
27 handles => {
28 relationships => 'relationships',
29 related_readings => 'related_readings',
ee801e17 30 del_relationship => 'del_relationship',
22222af9 31 },
32 writer => '_set_relations',
3a2ebbf4 33 );
dd3b58b0 34
3a2ebbf4 35has 'tradition' => (
36 is => 'ro',
d047cd52 37 isa => 'Text::Tradition',
8d9a1cd8 38 weak_ref => 1,
d047cd52 39 );
dd3b58b0 40
3a2ebbf4 41has 'readings' => (
42 isa => 'HashRef[Text::Tradition::Collation::Reading]',
43 traits => ['Hash'],
44 handles => {
45 reading => 'get',
46 _add_reading => 'set',
47 del_reading => 'delete',
48 has_reading => 'exists',
49 readings => 'values',
50 },
51 default => sub { {} },
52 );
910a0a6d 53
4a8828f0 54has 'wit_list_separator' => (
7854e12e 55 is => 'rw',
56 isa => 'Str',
57 default => ', ',
58 );
59
60has 'baselabel' => (
61 is => 'rw',
62 isa => 'Str',
63 default => 'base text',
64 );
4a8828f0 65
15d2d3df 66has 'linear' => (
67 is => 'rw',
68 isa => 'Bool',
69 default => 1,
70 );
c84275ff 71
ef9d481f 72has 'ac_label' => (
73 is => 'rw',
74 isa => 'Str',
75 default => ' (a.c.)',
76 );
3a2ebbf4 77
78has 'start' => (
79 is => 'ro',
80 isa => 'Text::Tradition::Collation::Reading',
81 writer => '_set_start',
82 weak_ref => 1,
83 );
84
85has 'end' => (
86 is => 'ro',
87 isa => 'Text::Tradition::Collation::Reading',
88 writer => '_set_end',
89 weak_ref => 1,
90 );
b365fbae 91
92has 'cached_svg' => (
93 is => 'rw',
94 isa => 'Str',
95 predicate => 'has_cached_svg',
96 clearer => 'wipe_svg',
97 );
1dd07bda 98
99has 'cached_table' => (
100 is => 'rw',
101 isa => 'HashRef',
102 predicate => 'has_cached_table',
103 clearer => 'wipe_table',
104 );
1f563ac3 105
4e5a7b2c 106=head1 NAME
107
108Text::Tradition::Collation - a software model for a text collation
109
110=head1 SYNOPSIS
111
112 use Text::Tradition;
113 my $t = Text::Tradition->new(
114 'name' => 'this is a text',
115 'input' => 'TEI',
116 'file' => '/path/to/tei_parallel_seg_file.xml' );
117
118 my $c = $t->collation;
119 my @readings = $c->readings;
120 my @paths = $c->paths;
121 my @relationships = $c->relationships;
122
123 my $svg_variant_graph = $t->collation->as_svg();
124
125=head1 DESCRIPTION
126
127Text::Tradition is a library for representation and analysis of collated
128texts, particularly medieval ones. The Collation is the central feature of
129a Tradition, where the text, its sequence of readings, and its relationships
130between readings are actually kept.
131
132=head1 CONSTRUCTOR
133
134=head2 new
135
136The constructor. Takes a hash or hashref of the following arguments:
137
138=over
139
140=item * tradition - The Text::Tradition object to which the collation
141belongs. Required.
142
143=item * linear - Whether the collation should be linear; that is, whether
144transposed readings should be treated as two linked readings rather than one,
145and therefore whether the collation graph is acyclic. Defaults to true.
146
4e5a7b2c 147=item * baselabel - The default label for the path taken by a base text
148(if any). Defaults to 'base text'.
149
150=item * wit_list_separator - The string to join a list of witnesses for
151purposes of making labels in display graphs. Defaults to ', '.
152
153=item * ac_label - The extra label to tack onto a witness sigil when
154representing another layer of path for the given witness - that is, when
155a text has more than one possible reading due to scribal corrections or
156the like. Defaults to ' (a.c.)'.
157
158=back
159
160=head1 ACCESSORS
161
162=head2 tradition
163
164=head2 linear
165
4e5a7b2c 166=head2 wit_list_separator
167
168=head2 baselabel
169
170=head2 ac_label
171
172Simple accessors for collation attributes.
173
174=head2 start
175
176The meta-reading at the start of every witness path.
177
178=head2 end
179
180The meta-reading at the end of every witness path.
181
182=head2 readings
183
184Returns all Reading objects in the graph.
185
186=head2 reading( $id )
187
188Returns the Reading object corresponding to the given ID.
189
190=head2 add_reading( $reading_args )
191
192Adds a new reading object to the collation.
193See L<Text::Tradition::Collation::Reading> for the available arguments.
194
195=head2 del_reading( $object_or_id )
196
197Removes the given reading from the collation, implicitly removing its
198paths and relationships.
199
200=head2 merge_readings( $main, $second )
201
202Merges the $second reading into the $main one.
203The arguments may be either readings or reading IDs.
204
205=head2 has_reading( $id )
206
207Predicate to see whether a given reading ID is in the graph.
208
209=head2 reading_witnesses( $object_or_id )
210
211Returns a list of sigils whose witnesses contain the reading.
212
213=head2 paths
214
215Returns all reading paths within the document - that is, all edges in the
216collation graph. Each path is an arrayref of [ $source, $target ] reading IDs.
217
218=head2 add_path( $source, $target, $sigil )
219
220Links the given readings in the collation in sequence, under the given witness
221sigil. The readings may be specified by object or ID.
222
223=head2 del_path( $source, $target, $sigil )
224
225Links the given readings in the collation in sequence, under the given witness
226sigil. The readings may be specified by object or ID.
227
228=head2 has_path( $source, $target );
229
230Returns true if the two readings are linked in sequence in any witness.
231The readings may be specified by object or ID.
232
233=head2 relationships
234
235Returns all Relationship objects in the collation.
236
237=head2 add_relationship( $reading, $other_reading, $options )
238
239Adds a new relationship of the type given in $options between the two readings,
240which may be specified by object or ID. Returns a value of ( $status, @vectors)
241where $status is true on success, and @vectors is a list of relationship edges
242that were ultimately added.
243See L<Text::Tradition::Collation::Relationship> for the available options.
244
245=cut
dd3b58b0 246
d047cd52 247sub BUILD {
3a2ebbf4 248 my $self = shift;
22222af9 249 $self->_set_relations( Text::Tradition::Collation::RelationshipStore->new( 'collation' => $self ) );
3a2ebbf4 250 $self->_set_start( $self->add_reading( { 'collation' => $self, 'is_start' => 1 } ) );
251 $self->_set_end( $self->add_reading( { 'collation' => $self, 'is_end' => 1 } ) );
d047cd52 252}
784877d9 253
3a2ebbf4 254### Reading construct/destruct functions
255
1dd07bda 256sub _clear_cache {
257 my $self = shift;
258 $self->wipe_svg if $self->has_cached_svg;
259 $self->wipe_table if $self->has_cached_table;
260}
261
3a2ebbf4 262sub add_reading {
263 my( $self, $reading ) = @_;
264 unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
265 my %args = %$reading;
266 $reading = Text::Tradition::Collation::Reading->new(
267 'collation' => $self,
268 %args );
269 }
270 # First check to see if a reading with this ID exists.
271 if( $self->reading( $reading->id ) ) {
63778331 272 throw( "Collation already has a reading with id " . $reading->id );
3a2ebbf4 273 }
1dd07bda 274 $self->_clear_cache;
3a2ebbf4 275 $self->_add_reading( $reading->id => $reading );
276 # Once the reading has been added, put it in both graphs.
277 $self->sequence->add_vertex( $reading->id );
22222af9 278 $self->relations->add_reading( $reading->id );
3a2ebbf4 279 return $reading;
eca16057 280};
281
3a2ebbf4 282around del_reading => sub {
283 my $orig = shift;
284 my $self = shift;
285 my $arg = shift;
286
287 if( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
288 $arg = $arg->id;
289 }
3a2ebbf4 290 # Remove the reading from the graphs.
1dd07bda 291 $self->_clear_cache;
3a2ebbf4 292 $self->sequence->delete_vertex( $arg );
22222af9 293 $self->relations->delete_reading( $arg );
3a2ebbf4 294
295 # Carry on.
296 $self->$orig( $arg );
297};
7854e12e 298
3a2ebbf4 299# merge_readings( $main, $to_be_deleted );
7854e12e 300
3a2ebbf4 301sub merge_readings {
302 my $self = shift;
303
304 # We only need the IDs for adding paths to the graph, not the reading
305 # objects themselves.
49d4f2ac 306 my( $kept, $deleted, $combine_char ) = $self->_stringify_args( @_ );
1dd07bda 307 $self->_clear_cache;
3a2ebbf4 308
309 # The kept reading should inherit the paths and the relationships
310 # of the deleted reading.
311 foreach my $path ( $self->sequence->edges_at( $deleted ) ) {
312 my @vector = ( $kept );
313 push( @vector, $path->[1] ) if $path->[0] eq $deleted;
314 unshift( @vector, $path->[0] ) if $path->[1] eq $deleted;
49d4f2ac 315 next if $vector[0] eq $vector[1]; # Don't add a self loop
3a2ebbf4 316 my %wits = %{$self->sequence->get_edge_attributes( @$path )};
317 $self->sequence->add_edge( @vector );
318 my $fwits = $self->sequence->get_edge_attributes( @vector );
319 @wits{keys %$fwits} = values %$fwits;
320 $self->sequence->set_edge_attributes( @vector, \%wits );
321 }
22222af9 322 $self->relations->merge_readings( $kept, $deleted, $combine_char );
3a2ebbf4 323
324 # Do the deletion deed.
49d4f2ac 325 if( $combine_char ) {
326 my $kept_obj = $self->reading( $kept );
327 my $new_text = join( $combine_char, $kept_obj->text,
328 $self->reading( $deleted )->text );
329 $kept_obj->alter_text( $new_text );
330 }
3a2ebbf4 331 $self->del_reading( $deleted );
332}
7854e12e 333
3265b0ce 334
3a2ebbf4 335# Helper function for manipulating the graph.
336sub _stringify_args {
337 my( $self, $first, $second, $arg ) = @_;
338 $first = $first->id
339 if ref( $first ) eq 'Text::Tradition::Collation::Reading';
340 $second = $second->id
341 if ref( $second ) eq 'Text::Tradition::Collation::Reading';
342 return( $first, $second, $arg );
343}
df6d9812 344
4e5a7b2c 345# Helper function for manipulating the graph.
346sub _objectify_args {
347 my( $self, $first, $second, $arg ) = @_;
348 $first = $self->reading( $first )
349 unless ref( $first ) eq 'Text::Tradition::Collation::Reading';
350 $second = $self->reading( $second )
351 unless ref( $second ) eq 'Text::Tradition::Collation::Reading';
352 return( $first, $second, $arg );
353}
3a2ebbf4 354### Path logic
355
356sub add_path {
357 my $self = shift;
358
359 # We only need the IDs for adding paths to the graph, not the reading
360 # objects themselves.
361 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
362
1dd07bda 363 $self->_clear_cache;
3a2ebbf4 364 # Connect the readings
365 $self->sequence->add_edge( $source, $target );
366 # Note the witness in question
367 $self->sequence->set_edge_attribute( $source, $target, $wit, 1 );
b15511bf 368};
369
3a2ebbf4 370sub del_path {
371 my $self = shift;
49d4f2ac 372 my @args;
373 if( ref( $_[0] ) eq 'ARRAY' ) {
374 my $e = shift @_;
375 @args = ( @$e, @_ );
376 } else {
377 @args = @_;
378 }
3a2ebbf4 379
380 # We only need the IDs for adding paths to the graph, not the reading
381 # objects themselves.
49d4f2ac 382 my( $source, $target, $wit ) = $self->_stringify_args( @args );
3a2ebbf4 383
1dd07bda 384 $self->_clear_cache;
3a2ebbf4 385 if( $self->sequence->has_edge_attribute( $source, $target, $wit ) ) {
49d4f2ac 386 $self->sequence->delete_edge_attribute( $source, $target, $wit );
3a2ebbf4 387 }
388 unless( keys %{$self->sequence->get_edge_attributes( $source, $target )} ) {
389 $self->sequence->delete_edge( $source, $target );
390 }
784877d9 391}
392
3a2ebbf4 393
15d2d3df 394# Extra graph-alike utility
395sub has_path {
3a2ebbf4 396 my $self = shift;
397 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
398 return undef unless $self->sequence->has_edge( $source, $target );
399 return $self->sequence->has_edge_attribute( $source, $target, $wit );
b15511bf 400}
401
4e5a7b2c 402=head2 clear_witness( @sigil_list )
3a2ebbf4 403
4e5a7b2c 404Clear the given witnesses out of the collation entirely, removing references
405to them in paths, and removing readings that belong only to them. Should only
406be called via $tradition->del_witness.
3a2ebbf4 407
408=cut
409
4e5a7b2c 410sub clear_witness {
411 my( $self, @sigils ) = @_;
412
1dd07bda 413 $self->_clear_cache;
4e5a7b2c 414 # Clear the witness(es) out of the paths
415 foreach my $e ( $self->paths ) {
416 foreach my $sig ( @sigils ) {
417 $self->del_path( $e, $sig );
418 }
419 }
420
421 # Clear out the newly unused readings
422 foreach my $r ( $self->readings ) {
423 unless( $self->reading_witnesses( $r ) ) {
424 $self->del_reading( $r );
425 }
426 }
427}
3a2ebbf4 428
429sub add_relationship {
430 my $self = shift;
22222af9 431 my( $source, $target, $opts ) = $self->_stringify_args( @_ );
63778331 432 my( @vectors ) = $self->relations->add_relationship( $source,
64ae6270 433 $self->reading( $source ), $target, $self->reading( $target ), $opts );
434 # Force a full rank recalculation every time. Yuck.
63778331 435 $self->calculate_ranks() if $self->end->has_rank;
1dd07bda 436 $self->_clear_cache;
63778331 437 return @vectors;
22222af9 438}
ef9d481f 439
22222af9 440=head2 reading_witnesses( $reading )
910a0a6d 441
22222af9 442Return a list of sigils corresponding to the witnesses in which the reading appears.
3265b0ce 443
22222af9 444=cut
1d310495 445
1d310495 446sub reading_witnesses {
447 my( $self, $reading ) = @_;
448 # We need only check either the incoming or the outgoing edges; I have
96dc90ec 449 # arbitrarily chosen "incoming". Thus, special-case the start node.
450 if( $reading eq $self->start ) {
451 return map { $_->sigil } $self->tradition->witnesses;
452 }
1d310495 453 my %all_witnesses;
454 foreach my $e ( $self->sequence->edges_to( $reading ) ) {
455 my $wits = $self->sequence->get_edge_attributes( @$e );
456 @all_witnesses{ keys %$wits } = 1;
457 }
c12bb878 458 my $acstr = $self->ac_label;
459 foreach my $acwit ( grep { $_ =~ s/^(.*)\Q$acstr\E$/$1/ } keys %all_witnesses ) {
460 delete $all_witnesses{$acwit.$acstr} if exists $all_witnesses{$acwit};
461 }
1d310495 462 return keys %all_witnesses;
910a0a6d 463}
464
4e5a7b2c 465=head1 OUTPUT METHODS
8e1394aa 466
0ecb975c 467=head2 as_svg( \%options )
8e1394aa 468
0068967c 469Returns an SVG string that represents the graph, via as_dot and graphviz.
0ecb975c 470See as_dot for a list of options.
8e1394aa 471
472=cut
473
474sub as_svg {
0ecb975c 475 my( $self, $opts ) = @_;
e247aad1 476 my $want_subgraph = exists $opts->{'from'} || exists $opts->{'to'};
477 if( !$self->has_cached_svg || $opts->{'recalc'} || $want_subgraph ) {
b365fbae 478 my @cmd = qw/dot -Tsvg/;
479 my( $svg, $err );
480 my $dotfile = File::Temp->new();
481 ## USE FOR DEBUGGING
482 # $dotfile->unlink_on_destroy(0);
483 binmode $dotfile, ':utf8';
484 print $dotfile $self->as_dot( $opts );
485 push( @cmd, $dotfile->filename );
486 run( \@cmd, ">", binary(), \$svg );
e247aad1 487 $svg = decode_utf8( $svg );
488 $self->cached_svg( $svg ) unless $want_subgraph;
489 return $svg;
490 } else {
491 return $self->cached_svg;
b365fbae 492 }
8e1394aa 493}
494
b22576c6 495
0ecb975c 496=head2 as_dot( \%options )
b22576c6 497
0ecb975c 498Returns a string that is the collation graph expressed in dot
499(i.e. GraphViz) format. Options include:
b22576c6 500
0ecb975c 501=over 4
b22576c6 502
0ecb975c 503=item * from
b22576c6 504
0ecb975c 505=item * to
df6d9812 506
0ecb975c 507=item * color_common
508
509=back
df6d9812 510
511=cut
512
513sub as_dot {
0ecb975c 514 my( $self, $opts ) = @_;
515 my $startrank = $opts->{'from'} if $opts;
516 my $endrank = $opts->{'to'} if $opts;
517 my $color_common = $opts->{'color_common'} if $opts;
b365fbae 518 my $STRAIGHTENHACK = !$startrank && !$endrank && $self->end->rank
519 && $self->end->rank > 100;
520
b22576c6 521 # Check the arguments
522 if( $startrank ) {
523 return if $endrank && $startrank > $endrank;
524 return if $startrank > $self->end->rank;
525 }
526 if( defined $endrank ) {
527 return if $endrank < 0;
f1b3b33a 528 $endrank = undef if $endrank == $self->end->rank;
b22576c6 529 }
530
67da8d6c 531 my $graph_name = $self->tradition->name;
532 $graph_name =~ s/[^\w\s]//g;
533 $graph_name = join( '_', split( /\s+/, $graph_name ) );
f13b5582 534
535 my %graph_attrs = (
536 'rankdir' => 'LR',
537 'bgcolor' => 'none',
538 );
539 my %node_attrs = (
b8990398 540 'fontsize' => 14,
f13b5582 541 'fillcolor' => 'white',
542 'style' => 'filled',
543 'shape' => 'ellipse'
544 );
545 my %edge_attrs = (
546 'arrowhead' => 'open',
547 'color' => '#000000',
548 'fontcolor' => '#000000',
549 );
550
67da8d6c 551 my $dot = sprintf( "digraph %s {\n", $graph_name );
f13b5582 552 $dot .= "\tgraph " . _dot_attr_string( \%graph_attrs ) . ";\n";
553 $dot .= "\tnode " . _dot_attr_string( \%node_attrs ) . ";\n";
df6d9812 554
b22576c6 555 # Output substitute start/end readings if necessary
556 if( $startrank ) {
557 $dot .= "\t\"#SUBSTART#\" [ label=\"...\" ];\n";
558 }
559 if( $endrank ) {
560 $dot .= "\t\"#SUBEND#\" [ label=\"...\" ];\n";
561 }
b365fbae 562 if( $STRAIGHTENHACK ) {
563 ## HACK part 1
564 $dot .= "\tsubgraph { rank=same \"#START#\" \"#SILENT#\" }\n";
565 $dot .= "\t\"#SILENT#\" [ shape=diamond,color=white,penwidth=0,label=\"\" ];"
566 }
b22576c6 567 my %used; # Keep track of the readings that actually appear in the graph
30ddc24c 568 # Sort the readings by rank if we have ranks; this speeds layout.
569 my @all_readings = $self->end->has_rank
570 ? sort { $a->rank <=> $b->rank } $self->readings
571 : $self->readings;
4633f9e4 572 # TODO Refrain from outputting lacuna nodes - just grey out the edges.
30ddc24c 573 foreach my $reading ( @all_readings ) {
b22576c6 574 # Only output readings within our rank range.
575 next if $startrank && $reading->rank < $startrank;
576 next if $endrank && $reading->rank > $endrank;
577 $used{$reading->id} = 1;
910a0a6d 578 # Need not output nodes without separate labels
3a2ebbf4 579 next if $reading->id eq $reading->text;
d4b75f44 580 my $rattrs;
30f0df34 581 my $label = $reading->text;
8f9cab7b 582 $label =~ s/\"/\\\"/g;
d4b75f44 583 $rattrs->{'label'} = $label;
0ecb975c 584 $rattrs->{'fillcolor'} = '#b3f36d' if $reading->is_common && $color_common;
d4b75f44 585 $dot .= sprintf( "\t\"%s\" %s;\n", $reading->id, _dot_attr_string( $rattrs ) );
df6d9812 586 }
3a2ebbf4 587
30ddc24c 588 # Add the real edges. Need to weight one edge per rank jump, in a
589 # continuous line.
b365fbae 590 # my $weighted = $self->_add_edge_weights;
b22576c6 591 my @edges = $self->paths;
3bdec618 592 my( %substart, %subend );
b22576c6 593 foreach my $edge ( @edges ) {
594 # Do we need to output this edge?
508fd430 595 if( $used{$edge->[0]} && $used{$edge->[1]} ) {
027d819c 596 my $label = $self->_path_display_label( $self->path_witnesses( $edge ) );
f13b5582 597 my $variables = { %edge_attrs, 'label' => $label };
30ddc24c 598
b22576c6 599 # Account for the rank gap if necessary
30ddc24c 600 my $rank0 = $self->reading( $edge->[0] )->rank
601 if $self->reading( $edge->[0] )->has_rank;
602 my $rank1 = $self->reading( $edge->[1] )->rank
603 if $self->reading( $edge->[1] )->has_rank;
604 if( defined $rank0 && defined $rank1 && $rank1 - $rank0 > 1 ) {
605 $variables->{'minlen'} = $rank1 - $rank0;
606 }
607
608 # Add the calculated edge weights
b365fbae 609 # if( exists $weighted->{$edge->[0]}
e247aad1 610 # && $weighted->{$edge->[0]} eq $edge->[1] ) {
611 # # $variables->{'color'} = 'red';
612 # $variables->{'weight'} = 3.0;
613 # }
30ddc24c 614
508fd430 615 # EXPERIMENTAL: make edge width reflect no. of witnesses
616 my $extrawidth = scalar( $self->path_witnesses( $edge ) ) * 0.2;
617 $variables->{'penwidth'} = $extrawidth + 0.8; # gives 1 for a single wit
618
f13b5582 619 my $varopts = _dot_attr_string( $variables );
620 $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
621 $edge->[0], $edge->[1], $varopts );
3bdec618 622 } elsif( $used{$edge->[0]} ) {
623 $subend{$edge->[0]} = 1;
624 } elsif( $used{$edge->[1]} ) {
625 $substart{$edge->[1]} = 1;
b22576c6 626 }
df6d9812 627 }
3bdec618 628 # Add substitute start and end edges if necessary
629 foreach my $node ( keys %substart ) {
027d819c 630 my $witstr = $self->_path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) );
f13b5582 631 my $variables = { %edge_attrs, 'label' => $witstr };
632 my $varopts = _dot_attr_string( $variables );
633 $dot .= "\t\"#SUBSTART#\" -> \"$node\" $varopts;";
3bdec618 634 }
635 foreach my $node ( keys %subend ) {
027d819c 636 my $witstr = $self->_path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) );
f13b5582 637 my $variables = { %edge_attrs, 'label' => $witstr };
638 my $varopts = _dot_attr_string( $variables );
639 $dot .= "\t\"$node\" -> \"#SUBEND#\" $varopts;";
3bdec618 640 }
b365fbae 641 # HACK part 2
642 if( $STRAIGHTENHACK ) {
643 $dot .= "\t\"#END#\" -> \"#SILENT#\" [ color=white,penwidth=0 ];\n";
644 }
30ddc24c 645
df6d9812 646 $dot .= "}\n";
647 return $dot;
648}
649
f13b5582 650sub _dot_attr_string {
651 my( $hash ) = @_;
652 my @attrs;
653 foreach my $k ( sort keys %$hash ) {
654 my $v = $hash->{$k};
655 push( @attrs, $k.'="'.$v.'"' );
656 }
657 return( '[ ' . join( ', ', @attrs ) . ' ]' );
658}
659
30ddc24c 660sub _add_edge_weights {
661 my $self = shift;
662 # Walk the graph from START to END, choosing the successor node with
663 # the largest number of witness paths each time.
664 my $weighted = {};
665 my $curr = $self->start->id;
008fc8a6 666 my $ranked = $self->end->has_rank;
30ddc24c 667 while( $curr ne $self->end->id ) {
008fc8a6 668 my $rank = $ranked ? $self->reading( $curr )->rank : 0;
30ddc24c 669 my @succ = sort { $self->path_witnesses( $curr, $a )
670 <=> $self->path_witnesses( $curr, $b ) }
671 $self->sequence->successors( $curr );
672 my $next = pop @succ;
008fc8a6 673 my $nextrank = $ranked ? $self->reading( $next )->rank : 0;
30ddc24c 674 # Try to avoid lacunae in the weighted path.
008fc8a6 675 while( @succ &&
676 ( $self->reading( $next )->is_lacuna ||
677 $nextrank - $rank > 1 ) ){
30ddc24c 678 $next = pop @succ;
679 }
680 $weighted->{$curr} = $next;
681 $curr = $next;
682 }
683 return $weighted;
684}
685
027d819c 686=head2 path_witnesses( $edge )
687
688Returns the list of sigils whose witnesses are associated with the given edge.
689The edge can be passed as either an array or an arrayref of ( $source, $target ).
690
691=cut
692
3a2ebbf4 693sub path_witnesses {
694 my( $self, @edge ) = @_;
695 # If edge is an arrayref, cope.
696 if( @edge == 1 && ref( $edge[0] ) eq 'ARRAY' ) {
697 my $e = shift @edge;
698 @edge = @$e;
699 }
700 my @wits = keys %{$self->sequence->get_edge_attributes( @edge )};
508fd430 701 return @wits;
3a2ebbf4 702}
703
027d819c 704sub _path_display_label {
508fd430 705 my $self = shift;
706 my @wits = sort @_;
8f9cab7b 707 my $maj = scalar( $self->tradition->witnesses ) * 0.6;
708 if( scalar @wits > $maj ) {
f13b5582 709 # TODO break out a.c. wits
8f9cab7b 710 return 'majority';
711 } else {
712 return join( ', ', @wits );
713 }
714}
1dd07bda 715
716=head2 witnesses_at_rank
717
718Returns a list of witnesses that are not lacunose, for a given rank.
719
720=cut
721
722sub witnesses_at_rank {
723 my( $self, $rank ) = @_;
724}
8f9cab7b 725
4e5a7b2c 726=head2 as_graphml
8e1394aa 727
4e5a7b2c 728Returns a GraphML representation of the collation. The GraphML will contain
729two graphs. The first expresses the attributes of the readings and the witness
730paths that link them; the second expresses the relationships that link the
731readings. This is the native transfer format for a tradition.
8e1394aa 732
56eefa04 733=begin testing
734
735use Text::Tradition;
736
737my $READINGS = 311;
738my $PATHS = 361;
739
740my $datafile = 't/data/florilegium_tei_ps.xml';
741my $tradition = Text::Tradition->new( 'input' => 'TEI',
742 'name' => 'test0',
743 'file' => $datafile,
744 'linear' => 1 );
745
746ok( $tradition, "Got a tradition object" );
747is( scalar $tradition->witnesses, 13, "Found all witnesses" );
748ok( $tradition->collation, "Tradition has a collation" );
749
750my $c = $tradition->collation;
751is( scalar $c->readings, $READINGS, "Collation has all readings" );
752is( scalar $c->paths, $PATHS, "Collation has all paths" );
753is( scalar $c->relationships, 0, "Collation has all relationships" );
754
755# Add a few relationships
756$c->add_relationship( 'w123', 'w125', { 'type' => 'collated' } );
757$c->add_relationship( 'w193', 'w196', { 'type' => 'collated' } );
758$c->add_relationship( 'w257', 'w262', { 'type' => 'transposition' } );
759
760# Now write it to GraphML and parse it again.
761
762my $graphml = $c->as_graphml;
763my $st = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml );
764is( scalar $st->collation->readings, $READINGS, "Reparsed collation has all readings" );
765is( scalar $st->collation->paths, $PATHS, "Reparsed collation has all paths" );
766is( scalar $st->collation->relationships, 3, "Reparsed collation has new relationships" );
767
768=end testing
769
8e1394aa 770=cut
771
772sub as_graphml {
3a2ebbf4 773 my( $self ) = @_;
8e1394aa 774
775 # Some namespaces
776 my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns';
777 my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance';
778 my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' .
910a0a6d 779 'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
8e1394aa 780
781 # Create the document and root node
782 my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
783 my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
784 $graphml->setDocumentElement( $root );
785 $root->setNamespace( $xsi_ns, 'xsi', 0 );
786 $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
787
e309421a 788 # Add the data keys for the graph
789 my %graph_data_keys;
790 my $gdi = 0;
1d310495 791 my @graph_attributes = qw/ version wit_list_separator baselabel linear ac_label /;
e309421a 792 foreach my $datum ( @graph_attributes ) {
793 $graph_data_keys{$datum} = 'dg'.$gdi++;
794 my $key = $root->addNewChild( $graphml_ns, 'key' );
795 $key->setAttribute( 'attr.name', $datum );
796 $key->setAttribute( 'attr.type', $key eq 'linear' ? 'boolean' : 'string' );
797 $key->setAttribute( 'for', 'graph' );
798 $key->setAttribute( 'id', $graph_data_keys{$datum} );
799 }
f6066bac 800
8e1394aa 801 # Add the data keys for nodes
ef9d481f 802 my %node_data_keys;
803 my $ndi = 0;
3a2ebbf4 804 my %node_data = (
805 id => 'string',
255875b8 806 text => 'string',
3a2ebbf4 807 rank => 'string',
808 is_start => 'boolean',
809 is_end => 'boolean',
810 is_lacuna => 'boolean',
811 );
812 foreach my $datum ( keys %node_data ) {
910a0a6d 813 $node_data_keys{$datum} = 'dn'.$ndi++;
814 my $key = $root->addNewChild( $graphml_ns, 'key' );
815 $key->setAttribute( 'attr.name', $datum );
3a2ebbf4 816 $key->setAttribute( 'attr.type', $node_data{$datum} );
910a0a6d 817 $key->setAttribute( 'for', 'node' );
818 $key->setAttribute( 'id', $node_data_keys{$datum} );
8e1394aa 819 }
820
df6d9812 821 # Add the data keys for edges, i.e. witnesses
ef9d481f 822 my $edi = 0;
823 my %edge_data_keys;
3a2ebbf4 824 my %edge_data = (
f523c7a8 825 class => 'string', # Class, deprecated soon
3a2ebbf4 826 witness => 'string', # ID/label for a path
827 relationship => 'string', # ID/label for a relationship
828 extra => 'boolean', # Path key
c84275ff 829 scope => 'string', # Relationship key
3a2ebbf4 830 non_correctable => 'boolean', # Relationship key
831 non_independent => 'boolean', # Relationship key
832 );
833 foreach my $datum ( keys %edge_data ) {
834 $edge_data_keys{$datum} = 'de'.$edi++;
910a0a6d 835 my $key = $root->addNewChild( $graphml_ns, 'key' );
3a2ebbf4 836 $key->setAttribute( 'attr.name', $datum );
837 $key->setAttribute( 'attr.type', $edge_data{$datum} );
910a0a6d 838 $key->setAttribute( 'for', 'edge' );
3a2ebbf4 839 $key->setAttribute( 'id', $edge_data_keys{$datum} );
8e1394aa 840 }
3a2ebbf4 841
22222af9 842 # Add the collation graph itself
2c669bca 843 my $sgraph = $root->addNewChild( $graphml_ns, 'graph' );
844 $sgraph->setAttribute( 'edgedefault', 'directed' );
845 $sgraph->setAttribute( 'id', $self->tradition->name );
846 $sgraph->setAttribute( 'parse.edgeids', 'canonical' );
847 $sgraph->setAttribute( 'parse.edges', scalar($self->paths) );
848 $sgraph->setAttribute( 'parse.nodeids', 'canonical' );
849 $sgraph->setAttribute( 'parse.nodes', scalar($self->readings) );
850 $sgraph->setAttribute( 'parse.order', 'nodesfirst' );
22222af9 851
e309421a 852 # Collation attribute data
853 foreach my $datum ( @graph_attributes ) {
2c669bca 854 my $value = $datum eq 'version' ? '3.0' : $self->$datum;
855 _add_graphml_data( $sgraph, $graph_data_keys{$datum}, $value );
e309421a 856 }
8e1394aa 857
858 my $node_ctr = 0;
859 my %node_hash;
22222af9 860 # Add our readings to the graph
3a2ebbf4 861 foreach my $n ( sort { $a->id cmp $b->id } $self->readings ) {
2c669bca 862 # Add to the main graph
863 my $node_el = $sgraph->addNewChild( $graphml_ns, 'node' );
910a0a6d 864 my $node_xmlid = 'n' . $node_ctr++;
3a2ebbf4 865 $node_hash{ $n->id } = $node_xmlid;
910a0a6d 866 $node_el->setAttribute( 'id', $node_xmlid );
255875b8 867 foreach my $d ( keys %node_data ) {
868 my $nval = $n->$d;
869 _add_graphml_data( $node_el, $node_data_keys{$d}, $nval )
870 if defined $nval;
871 }
b15511bf 872 }
873
2c669bca 874 # Add the path edges to the sequence graph
df6d9812 875 my $edge_ctr = 0;
3a2ebbf4 876 foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->sequence->edges() ) {
877 # We add an edge in the graphml for every witness in $e.
508fd430 878 foreach my $wit ( sort $self->path_witnesses( $e ) ) {
3a2ebbf4 879 my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
880 $node_hash{ $e->[0] },
881 $node_hash{ $e->[1] } );
2c669bca 882 my $edge_el = $sgraph->addNewChild( $graphml_ns, 'edge' );
3a2ebbf4 883 $edge_el->setAttribute( 'source', $from );
884 $edge_el->setAttribute( 'target', $to );
885 $edge_el->setAttribute( 'id', $id );
3a2ebbf4 886
887 # It's a witness path, so add the witness
888 my $base = $wit;
889 my $key = $edge_data_keys{'witness'};
890 # Is this an ante-corr witness?
891 my $aclabel = $self->ac_label;
892 if( $wit =~ /^(.*)\Q$aclabel\E$/ ) {
893 # Keep the base witness
894 $base = $1;
895 # ...and record that this is an 'extra' reading path
896 _add_graphml_data( $edge_el, $edge_data_keys{'extra'}, $aclabel );
897 }
898 _add_graphml_data( $edge_el, $edge_data_keys{'witness'}, $base );
f523c7a8 899 _add_graphml_data( $edge_el, $edge_data_keys{'class'}, 'path' );
3a2ebbf4 900 }
901 }
902
22222af9 903 # Add the relationship graph to the XML
027d819c 904 $self->relations->_as_graphml( $graphml_ns, $root, \%node_hash,
2626f709 905 $node_data_keys{'id'}, \%edge_data_keys );
8e1394aa 906
94c00c71 907 # Save and return the thing
908 my $result = decode_utf8( $graphml->toString(1) );
94c00c71 909 return $result;
df6d9812 910}
911
b15511bf 912sub _add_graphml_data {
913 my( $el, $key, $value ) = @_;
b15511bf 914 return unless defined $value;
c9bf3dbf 915 my $data_el = $el->addNewChild( $el->namespaceURI, 'data' );
b15511bf 916 $data_el->setAttribute( 'key', $key );
917 $data_el->appendText( $value );
8e1394aa 918}
919
4e5a7b2c 920=head2 as_csv
910a0a6d 921
922Returns a CSV alignment table representation of the collation graph, one
2c669bca 923row per witness (or witness uncorrected.)
910a0a6d 924
925=cut
926
927sub as_csv {
3a2ebbf4 928 my( $self ) = @_;
1dd07bda 929 my $table = $self->alignment_table;
910a0a6d 930 my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } );
931 my @result;
2c669bca 932 # Make the header row
933 $csv->combine( map { $_->{'witness'} } @{$table->{'alignment'}} );
934 push( @result, decode_utf8( $csv->string ) );
935 # Make the rest of the rows
936 foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
566f4595 937 my @rowobjs = map { $_->{'tokens'}->[$idx] } @{$table->{'alignment'}};
1dd07bda 938 my @row = map { $_ ? $_->{'t'}->text : $_ } @rowobjs;
2c669bca 939 $csv->combine( @row );
910a0a6d 940 push( @result, decode_utf8( $csv->string ) );
941 }
3a2ebbf4 942 return join( "\n", @result );
910a0a6d 943}
944
1dd07bda 945=head2 alignment_table( $use_refs, $include_witnesses )
2c669bca 946
566f4595 947Return a reference to an alignment table, in a slightly enhanced CollateX
948format which looks like this:
949
950 $table = { alignment => [ { witness => "SIGIL",
4e5a7b2c 951 tokens => [ { t => "TEXT" }, ... ] },
566f4595 952 { witness => "SIG2",
4e5a7b2c 953 tokens => [ { t => "TEXT" }, ... ] },
566f4595 954 ... ],
955 length => TEXTLEN };
956
957If $use_refs is set to 1, the reading object is returned in the table
958instead of READINGTEXT; if not, the text of the reading is returned.
4e5a7b2c 959
960If $include_witnesses is set to a hashref, only the witnesses whose sigil
566f4595 961keys have a true hash value will be included.
2c669bca 962
963=cut
9f3ba6f7 964
1dd07bda 965sub alignment_table {
966 my( $self ) = @_;
967 my $include; # see if we can ditch this
968 return $self->cached_table if $self->has_cached_table;
969
0ecb975c 970 # Make sure we can do this
971 throw( "Need a linear graph in order to make an alignment table" )
972 unless $self->linear;
973 $self->calculate_ranks unless $self->end->has_rank;
974
2c669bca 975 my $table = { 'alignment' => [], 'length' => $self->end->rank - 1 };
3a2ebbf4 976 my @all_pos = ( 1 .. $self->end->rank - 1 );
68454b71 977 foreach my $wit ( sort { $a->sigil cmp $b->sigil } $self->tradition->witnesses ) {
2c669bca 978 if( $include ) {
566f4595 979 next unless $include->{$wit->sigil};
2c669bca 980 }
eca16057 981 # print STDERR "Making witness row(s) for " . $wit->sigil . "\n";
1f7aa795 982 my @wit_path = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
1dd07bda 983 my @row = _make_witness_row( \@wit_path, \@all_pos );
2c669bca 984 push( @{$table->{'alignment'}},
985 { 'witness' => $wit->sigil, 'tokens' => \@row } );
1f7aa795 986 if( $wit->is_layered ) {
987 my @wit_ac_path = $self->reading_sequence( $self->start, $self->end,
861c3e27 988 $wit->sigil.$self->ac_label );
1dd07bda 989 my @ac_row = _make_witness_row( \@wit_ac_path, \@all_pos );
2c669bca 990 push( @{$table->{'alignment'}},
991 { 'witness' => $wit->sigil.$self->ac_label, 'tokens' => \@ac_row } );
910a0a6d 992 }
993 }
1dd07bda 994 $self->cached_table( $table );
995 return $table;
910a0a6d 996}
997
998sub _make_witness_row {
1dd07bda 999 my( $path, $positions ) = @_;
910a0a6d 1000 my %char_hash;
1001 map { $char_hash{$_} = undef } @$positions;
2c669bca 1002 my $debug = 0;
910a0a6d 1003 foreach my $rdg ( @$path ) {
eca16057 1004 my $rtext = $rdg->text;
1005 $rtext = '#LACUNA#' if $rdg->is_lacuna;
2c669bca 1006 print STDERR "rank " . $rdg->rank . "\n" if $debug;
3a2ebbf4 1007 # print STDERR "No rank for " . $rdg->id . "\n" unless defined $rdg->rank;
1dd07bda 1008 $char_hash{$rdg->rank} = { 't' => $rdg };
910a0a6d 1009 }
1010 my @row = map { $char_hash{$_} } @$positions;
eca16057 1011 # Fill in lacuna markers for undef spots in the row
1012 my $last_el = shift @row;
1013 my @filled_row = ( $last_el );
1014 foreach my $el ( @row ) {
0e476982 1015 # If we are using node reference, make the lacuna node appear many times
1016 # in the table. If not, use the lacuna tag.
1dd07bda 1017 if( $last_el && $last_el->{'t'}->is_lacuna && !defined $el ) {
1018 $el = $last_el;
eca16057 1019 }
1020 push( @filled_row, $el );
1021 $last_el = $el;
1022 }
1023 return @filled_row;
910a0a6d 1024}
1025
4e5a7b2c 1026=head1 NAVIGATION METHODS
910a0a6d 1027
4e5a7b2c 1028=head2 reading_sequence( $first, $last, $sigil, $backup )
e2902068 1029
1030Returns the ordered list of readings, starting with $first and ending
4e5a7b2c 1031with $last, for the witness given in $sigil. If a $backup sigil is
1032specified (e.g. when walking a layered witness), it will be used wherever
1033no $sigil path exists. If there is a base text reading, that will be
1034used wherever no path exists for $sigil or $backup.
e2902068 1035
1036=cut
1037
910a0a6d 1038# TODO Think about returning some lazy-eval iterator.
b0b4421a 1039# TODO Get rid of backup; we should know from what witness is whether we need it.
910a0a6d 1040
e2902068 1041sub reading_sequence {
861c3e27 1042 my( $self, $start, $end, $witness ) = @_;
e2902068 1043
930ff666 1044 $witness = $self->baselabel unless $witness;
e2902068 1045 my @readings = ( $start );
1046 my %seen;
1047 my $n = $start;
3a2ebbf4 1048 while( $n && $n->id ne $end->id ) {
1049 if( exists( $seen{$n->id} ) ) {
63778331 1050 throw( "Detected loop for $witness at " . $n->id );
910a0a6d 1051 }
3a2ebbf4 1052 $seen{$n->id} = 1;
910a0a6d 1053
861c3e27 1054 my $next = $self->next_reading( $n, $witness );
44771cf2 1055 unless( $next ) {
63778331 1056 throw( "Did not find any path for $witness from reading " . $n->id );
44771cf2 1057 }
910a0a6d 1058 push( @readings, $next );
1059 $n = $next;
e2902068 1060 }
1061 # Check that the last reading is our end reading.
1062 my $last = $readings[$#readings];
63778331 1063 throw( "Last reading found from " . $start->text .
1064 " for witness $witness is not the end!" ) # TODO do we get this far?
3a2ebbf4 1065 unless $last->id eq $end->id;
e2902068 1066
1067 return @readings;
1068}
1069
4e5a7b2c 1070=head2 next_reading( $reading, $sigil );
8e1394aa 1071
4a8828f0 1072Returns the reading that follows the given reading along the given witness
930ff666 1073path.
8e1394aa 1074
1075=cut
1076
4a8828f0 1077sub next_reading {
e2902068 1078 # Return the successor via the corresponding path.
8e1394aa 1079 my $self = shift;
3a2ebbf4 1080 my $answer = $self->_find_linked_reading( 'next', @_ );
2c669bca 1081 return undef unless $answer;
3a2ebbf4 1082 return $self->reading( $answer );
8e1394aa 1083}
1084
4e5a7b2c 1085=head2 prior_reading( $reading, $sigil )
8e1394aa 1086
4a8828f0 1087Returns the reading that precedes the given reading along the given witness
930ff666 1088path.
8e1394aa 1089
1090=cut
1091
4a8828f0 1092sub prior_reading {
e2902068 1093 # Return the predecessor via the corresponding path.
8e1394aa 1094 my $self = shift;
3a2ebbf4 1095 my $answer = $self->_find_linked_reading( 'prior', @_ );
1096 return $self->reading( $answer );
8e1394aa 1097}
1098
4a8828f0 1099sub _find_linked_reading {
861c3e27 1100 my( $self, $direction, $node, $path ) = @_;
1101
1102 # Get a backup if we are dealing with a layered witness
1103 my $alt_path;
1104 my $aclabel = $self->ac_label;
1105 if( $path && $path =~ /^(.*)\Q$aclabel\E$/ ) {
1106 $alt_path = $1;
1107 }
1108
e2902068 1109 my @linked_paths = $direction eq 'next'
3a2ebbf4 1110 ? $self->sequence->edges_from( $node )
1111 : $self->sequence->edges_to( $node );
e2902068 1112 return undef unless scalar( @linked_paths );
8e1394aa 1113
e2902068 1114 # We have to find the linked path that contains all of the
1115 # witnesses supplied in $path.
1116 my( @path_wits, @alt_path_wits );
4e5a7b2c 1117 @path_wits = sort( $self->_witnesses_of_label( $path ) ) if $path;
1118 @alt_path_wits = sort( $self->_witnesses_of_label( $alt_path ) ) if $alt_path;
e2902068 1119 my $base_le;
1120 my $alt_le;
1121 foreach my $le ( @linked_paths ) {
3a2ebbf4 1122 if( $self->sequence->has_edge_attribute( @$le, $self->baselabel ) ) {
910a0a6d 1123 $base_le = $le;
910a0a6d 1124 }
508fd430 1125 my @le_wits = sort $self->path_witnesses( $le );
3a2ebbf4 1126 if( _is_within( \@path_wits, \@le_wits ) ) {
1127 # This is the right path.
1128 return $direction eq 'next' ? $le->[1] : $le->[0];
1129 } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) {
1130 $alt_le = $le;
1131 }
8e1394aa 1132 }
e2902068 1133 # Got this far? Return the alternate path if it exists.
3a2ebbf4 1134 return $direction eq 'next' ? $alt_le->[1] : $alt_le->[0]
910a0a6d 1135 if $alt_le;
e2902068 1136
1137 # Got this far? Return the base path if it exists.
3a2ebbf4 1138 return $direction eq 'next' ? $base_le->[1] : $base_le->[0]
910a0a6d 1139 if $base_le;
e2902068 1140
1141 # Got this far? We have no appropriate path.
2c669bca 1142 warn "Could not find $direction node from " . $node->id
910a0a6d 1143 . " along path $path";
8e1394aa 1144 return undef;
1145}
1146
4a8828f0 1147# Some set logic.
1148sub _is_within {
1149 my( $set1, $set2 ) = @_;
7854e12e 1150 my $ret = @$set1; # will be 0, i.e. false, if set1 is empty
4a8828f0 1151 foreach my $el ( @$set1 ) {
910a0a6d 1152 $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
4a8828f0 1153 }
1154 return $ret;
1155}
1156
4e5a7b2c 1157# Return the string that joins together a list of witnesses for
1158# display on a single path.
1159sub _witnesses_of_label {
1160 my( $self, $label ) = @_;
1161 my $regex = $self->wit_list_separator;
1162 my @answer = split( /\Q$regex\E/, $label );
1163 return @answer;
b0b4421a 1164}
1165
d4b75f44 1166=head2 common_readings
1167
1168Returns the list of common readings in the graph (i.e. those readings that are
1169shared by all non-lacunose witnesses.)
1170
1171=cut
1172
1173sub common_readings {
1174 my $self = shift;
1175 my @common = grep { $_->is_common } $self->readings;
1176 return @common;
1177}
1178
b0b4421a 1179=head2 path_text( $sigil, $mainsigil [, $start, $end ] )
1180
1181Returns the text of a witness (plus its backup, if we are using a layer)
1182as stored in the collation. The text is returned as a string, where the
1183individual readings are joined with spaces and the meta-readings (e.g.
1184lacunae) are omitted. Optional specification of $start and $end allows
1185the generation of a subset of the witness text.
4e5a7b2c 1186
b0b4421a 1187=cut
1188
1189sub path_text {
861c3e27 1190 my( $self, $wit, $start, $end ) = @_;
b0b4421a 1191 $start = $self->start unless $start;
1192 $end = $self->end unless $end;
861c3e27 1193 my @path = grep { !$_->is_meta } $self->reading_sequence( $start, $end, $wit );
b0b4421a 1194 return join( ' ', map { $_->text } @path );
1195}
4e5a7b2c 1196
1197=head1 INITIALIZATION METHODS
1198
1199These are mostly for use by parsers.
1200
1201=head2 make_witness_path( $witness )
1202
1203Link the array of readings contained in $witness->path (and in
1204$witness->uncorrected_path if it exists) into collation paths.
1205Clear out the arrays when finished.
de51424a 1206
4e5a7b2c 1207=head2 make_witness_paths
1208
1209Call make_witness_path for all witnesses in the tradition.
1210
1211=cut
930ff666 1212
7e450e44 1213# For use when a collation is constructed from a base text and an apparatus.
1214# We have the sequences of readings and just need to add path edges.
1f7aa795 1215# When we are done, clear out the witness path attributes, as they are no
1216# longer needed.
1217# TODO Find a way to replace the witness path attributes with encapsulated functions?
e2902068 1218
6a222840 1219sub make_witness_paths {
1220 my( $self ) = @_;
910a0a6d 1221 foreach my $wit ( $self->tradition->witnesses ) {
0068967c 1222 # print STDERR "Making path for " . $wit->sigil . "\n";
910a0a6d 1223 $self->make_witness_path( $wit );
7854e12e 1224 }
7854e12e 1225}
1226
6a222840 1227sub make_witness_path {
7854e12e 1228 my( $self, $wit ) = @_;
1229 my @chain = @{$wit->path};
15d2d3df 1230 my $sig = $wit->sigil;
7854e12e 1231 foreach my $idx ( 0 .. $#chain-1 ) {
910a0a6d 1232 $self->add_path( $chain[$idx], $chain[$idx+1], $sig );
7854e12e 1233 }
1f7aa795 1234 if( $wit->is_layered ) {
d9e873d0 1235 @chain = @{$wit->uncorrected_path};
1236 foreach my $idx( 0 .. $#chain-1 ) {
1237 my $source = $chain[$idx];
1238 my $target = $chain[$idx+1];
1239 $self->add_path( $source, $target, $sig.$self->ac_label )
1240 unless $self->has_path( $source, $target, $sig );
1241 }
15d2d3df 1242 }
1f7aa795 1243 $wit->clear_path;
1244 $wit->clear_uncorrected_path;
e2902068 1245}
1246
4e5a7b2c 1247=head2 calculate_ranks
1248
1249Calculate the reading ranks (that is, their aligned positions relative
1250to each other) for the graph. This can only be called on linear collations.
1251
b365fbae 1252=begin testing
1253
1254use Text::Tradition;
1255
1256my $cxfile = 't/data/Collatex-16.xml';
1257my $t = Text::Tradition->new(
1258 'name' => 'inline',
1259 'input' => 'CollateX',
1260 'file' => $cxfile,
1261 );
1262my $c = $t->collation;
1263
1264# Make an svg
1265my $svg = $c->as_svg;
1266is( substr( $svg, 0, 5 ), '<?xml', "Got XML doc for svg" );
1267ok( $c->has_cached_svg, "SVG was cached" );
1268is( $c->as_svg, $svg, "Cached SVG returned upon second call" );
1269$c->calculate_ranks;
1270is( $c->as_svg, $svg, "Cached SVG retained with no rank change" );
1271$c->add_relationship( 'n9', 'n23', { 'type' => 'spelling' } );
1272isnt( $c->as_svg, $svg, "SVG changed after relationship add" );
1273
1274=end testing
1275
4e5a7b2c 1276=cut
1277
910a0a6d 1278sub calculate_ranks {
1279 my $self = shift;
b365fbae 1280 # Save the existing ranks, in case we need to invalidate the cached SVG.
1281 my %existing_ranks;
910a0a6d 1282 # Walk a version of the graph where every node linked by a relationship
1283 # edge is fundamentally the same node, and do a topological ranking on
1284 # the nodes in this graph.
c9bf3dbf 1285 my $topo_graph = Graph->new();
910a0a6d 1286 my %rel_containers;
1287 my $rel_ctr = 0;
1288 # Add the nodes
1289 foreach my $r ( $self->readings ) {
3a2ebbf4 1290 next if exists $rel_containers{$r->id};
910a0a6d 1291 my @rels = $r->related_readings( 'colocated' );
1292 if( @rels ) {
1293 # Make a relationship container.
1294 push( @rels, $r );
c9bf3dbf 1295 my $rn = 'rel_container_' . $rel_ctr++;
1296 $topo_graph->add_vertex( $rn );
910a0a6d 1297 foreach( @rels ) {
3a2ebbf4 1298 $rel_containers{$_->id} = $rn;
910a0a6d 1299 }
1300 } else {
1301 # Add a new node to mirror the old node.
3a2ebbf4 1302 $rel_containers{$r->id} = $r->id;
1303 $topo_graph->add_vertex( $r->id );
910a0a6d 1304 }
4a8828f0 1305 }
3a1f2523 1306
3a2ebbf4 1307 # Add the edges.
910a0a6d 1308 foreach my $r ( $self->readings ) {
b365fbae 1309 $existing_ranks{$r} = $r->rank;
3a2ebbf4 1310 foreach my $n ( $self->sequence->successors( $r->id ) ) {
1311 my( $tfrom, $tto ) = ( $rel_containers{$r->id},
1312 $rel_containers{$n} );
4e5a7b2c 1313 # $DB::single = 1 unless $tfrom && $tto;
3a2ebbf4 1314 $topo_graph->add_edge( $tfrom, $tto );
910a0a6d 1315 }
1316 }
1317
1318 # Now do the rankings, starting with the start node.
3a2ebbf4 1319 my $topo_start = $rel_containers{$self->start->id};
c9bf3dbf 1320 my $node_ranks = { $topo_start => 0 };
910a0a6d 1321 my @curr_origin = ( $topo_start );
1322 # A little iterative function.
1323 while( @curr_origin ) {
c9bf3dbf 1324 @curr_origin = _assign_rank( $topo_graph, $node_ranks, @curr_origin );
910a0a6d 1325 }
1326 # Transfer our rankings from the topological graph to the real one.
1327 foreach my $r ( $self->readings ) {
3a2ebbf4 1328 if( defined $node_ranks->{$rel_containers{$r->id}} ) {
1329 $r->rank( $node_ranks->{$rel_containers{$r->id}} );
67da8d6c 1330 } else {
63778331 1331 # Die. Find the last rank we calculated.
1332 my @all_defined = sort { $node_ranks->{$rel_containers{$a->id}}
1333 <=> $node_ranks->{$rel_containers{$b->id}} }
1334 $self->readings;
1335 my $last = pop @all_defined;
1336 throw( "Ranks not calculated after $last - do you have a cycle in the graph?" );
67da8d6c 1337 }
de51424a 1338 }
b365fbae 1339 # Do we need to invalidate the cached SVG?
1340 if( $self->has_cached_svg ) {
1341 foreach my $r ( $self->readings ) {
1342 next if $existing_ranks{$r} == $r->rank;
1343 $self->wipe_svg;
1344 last;
1345 }
1346 }
8e1394aa 1347}
3a1f2523 1348
910a0a6d 1349sub _assign_rank {
c9bf3dbf 1350 my( $graph, $node_ranks, @current_nodes ) = @_;
910a0a6d 1351 # Look at each of the children of @current_nodes. If all the child's
1352 # parents have a rank, assign it the highest rank + 1 and add it to
c9bf3dbf 1353 # @next_nodes. Otherwise skip it; we will return when the highest-ranked
1354 # parent gets a rank.
910a0a6d 1355 my @next_nodes;
1356 foreach my $c ( @current_nodes ) {
c9bf3dbf 1357 warn "Current reading $c has no rank!"
1358 unless exists $node_ranks->{$c};
1359 # print STDERR "Looking at child of node $c, rank "
1360 # . $node_ranks->{$c} . "\n";
1361 foreach my $child ( $graph->successors( $c ) ) {
1362 next if exists $node_ranks->{$child};
910a0a6d 1363 my $highest_rank = -1;
1364 my $skip = 0;
c9bf3dbf 1365 foreach my $parent ( $graph->predecessors( $child ) ) {
1366 if( exists $node_ranks->{$parent} ) {
1367 $highest_rank = $node_ranks->{$parent}
1368 if $highest_rank <= $node_ranks->{$parent};
910a0a6d 1369 } else {
1370 $skip = 1;
1371 last;
1372 }
1373 }
1374 next if $skip;
c9bf3dbf 1375 my $c_rank = $highest_rank + 1;
1376 # print STDERR "Assigning rank $c_rank to node $child \n";
1377 $node_ranks->{$child} = $c_rank;
910a0a6d 1378 push( @next_nodes, $child );
1379 }
1380 }
1381 return @next_nodes;
4cdd82f1 1382}
910a0a6d 1383
4e5a7b2c 1384=head2 flatten_ranks
1385
1386A convenience method for parsing collation data. Searches the graph for readings
1387with the same text at the same rank, and merges any that are found.
1388
1389=cut
1390
0e476982 1391sub flatten_ranks {
1392 my $self = shift;
1393 my %unique_rank_rdg;
1394 foreach my $rdg ( $self->readings ) {
1395 next unless $rdg->has_rank;
1396 my $key = $rdg->rank . "||" . $rdg->text;
1397 if( exists $unique_rank_rdg{$key} ) {
1398 # Combine!
56eefa04 1399 # print STDERR "Combining readings at same rank: $key\n";
0e476982 1400 $self->merge_readings( $unique_rank_rdg{$key}, $rdg );
0ecb975c 1401 # TODO see if this now makes a common point.
0e476982 1402 } else {
1403 $unique_rank_rdg{$key} = $rdg;
1404 }
1405 }
1406}
1407
4633f9e4 1408=head2 remove_collations
1409
1410Another convenience method for parsing. Removes all 'collation' relationships
1411that were defined in order to get the reading ranks to be correct.
1412
1413=begin testing
1414
1415use Text::Tradition;
1416
1417my $cxfile = 't/data/Collatex-16.xml';
1418my $t = Text::Tradition->new(
1419 'name' => 'inline',
1420 'input' => 'CollateX',
1421 'file' => $cxfile,
1422 );
1423my $c = $t->collation;
1424
1425isnt( $c->reading('n23')->rank, $c->reading('n9')->rank, "Rank skew exists" );
1426$c->add_relationship( 'n23', 'n9', { 'type' => 'collated', 'scope' => 'local' } );
1427is( scalar $c->relationships, 4, "Found all expected relationships" );
1428$c->remove_collations;
1429is( scalar $c->relationships, 3, "Collated relationships now gone" );
1430is( $c->reading('n23')->rank, $c->reading('n9')->rank, "Aligned ranks were preserved" );
1431
1432=end testing
1433
1434=cut
1435
1436sub remove_collations {
1437 my $self = shift;
1438 foreach my $reledge ( $self->relationships ) {
1439 my $relobj = $self->relations->get_relationship( $reledge );
1440 if( $relobj && $relobj->type eq 'collated' ) {
1441 $self->relations->delete_relationship( $reledge );
1442 }
1443 }
1444}
1445
1446
d4b75f44 1447=head2 calculate_common_readings
1448
1449Goes through the graph identifying the readings that appear in every witness
1450(apart from those with lacunae at that spot.) Marks them as common and returns
1451the list.
1452
1453=begin testing
1454
1455use Text::Tradition;
1456
1457my $cxfile = 't/data/Collatex-16.xml';
1458my $t = Text::Tradition->new(
1459 'name' => 'inline',
1460 'input' => 'CollateX',
1461 'file' => $cxfile,
1462 );
1463my $c = $t->collation;
1464
1465my @common = $c->calculate_common_readings();
1466is( scalar @common, 8, "Found correct number of common readings" );
1467my @marked = sort $c->common_readings();
1468is( scalar @common, 8, "All common readings got marked as such" );
1469my @expected = qw/ n1 n12 n16 n19 n20 n5 n6 n7 /;
1470is_deeply( \@marked, \@expected, "Found correct list of common readings" );
1471
1472=end testing
1473
1474=cut
1475
1476sub calculate_common_readings {
1477 my $self = shift;
1478 my @common;
1dd07bda 1479 my $table = $self->alignment_table;
d4b75f44 1480 foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
7f52eac8 1481 my @row = map { $_->{'tokens'}->[$idx]
1482 ? $_->{'tokens'}->[$idx]->{'t'} : '' }
1483 @{$table->{'alignment'}};
d4b75f44 1484 my %hash;
1485 foreach my $r ( @row ) {
1486 if( $r ) {
1487 $hash{$r->id} = $r unless $r->is_meta;
1488 } else {
1489 $hash{'UNDEF'} = $r;
1490 }
1491 }
1492 if( keys %hash == 1 && !exists $hash{'UNDEF'} ) {
1493 my( $r ) = values %hash;
1494 $r->is_common( 1 );
1495 push( @common, $r );
1496 }
1497 }
1498 return @common;
1499}
1500
861c3e27 1501=head2 text_from_paths
1502
1503Calculate the text array for all witnesses from the path, for later consistency
1504checking. Only to be used if there is no non-graph-based way to know the
1505original texts.
1506
1507=cut
1508
1509sub text_from_paths {
1510 my $self = shift;
1511 foreach my $wit ( $self->tradition->witnesses ) {
1512 my @text = split( /\s+/,
1513 $self->reading_sequence( $self->start, $self->end, $wit->sigil ) );
1514 $wit->text( \@text );
1515 if( $wit->is_layered ) {
1516 my @uctext = split( /\s+/,
1517 $self->reading_sequence( $self->start, $self->end,
1518 $wit->sigil.$self->ac_label ) );
1519 $wit->text( \@uctext );
1520 }
1521 }
1522}
0e476982 1523
4e5a7b2c 1524=head1 UTILITY FUNCTIONS
1525
1526=head2 common_predecessor( $reading_a, $reading_b )
8e1394aa 1527
4e5a7b2c 1528Find the last reading that occurs in sequence before both the given readings.
1529
1530=head2 common_successor( $reading_a, $reading_b )
1531
1532Find the first reading that occurs in sequence after both the given readings.
1533
22222af9 1534=begin testing
1535
1536use Text::Tradition;
1537
1538my $cxfile = 't/data/Collatex-16.xml';
1539my $t = Text::Tradition->new(
1540 'name' => 'inline',
1541 'input' => 'CollateX',
1542 'file' => $cxfile,
1543 );
1544my $c = $t->collation;
1545
4e5a7b2c 1546is( $c->common_predecessor( 'n9', 'n23' )->id,
22222af9 1547 'n20', "Found correct common predecessor" );
4e5a7b2c 1548is( $c->common_successor( 'n9', 'n23' )->id,
22222af9 1549 '#END#', "Found correct common successor" );
1550
4e5a7b2c 1551is( $c->common_predecessor( 'n19', 'n17' )->id,
22222af9 1552 'n16', "Found correct common predecessor for readings on same path" );
4e5a7b2c 1553is( $c->common_successor( 'n21', 'n26' )->id,
22222af9 1554 '#END#', "Found correct common successor for readings on same path" );
1555
1556=end testing
1557
1558=cut
1559
1560## Return the closest reading that is a predecessor of both the given readings.
1561sub common_predecessor {
1562 my $self = shift;
4e5a7b2c 1563 my( $r1, $r2 ) = $self->_objectify_args( @_ );
027d819c 1564 return $self->_common_in_path( $r1, $r2, 'predecessors' );
22222af9 1565}
1566
1567sub common_successor {
1568 my $self = shift;
4e5a7b2c 1569 my( $r1, $r2 ) = $self->_objectify_args( @_ );
027d819c 1570 return $self->_common_in_path( $r1, $r2, 'successors' );
22222af9 1571}
1572
027d819c 1573sub _common_in_path {
22222af9 1574 my( $self, $r1, $r2, $dir ) = @_;
1575 my $iter = $r1->rank > $r2->rank ? $r1->rank : $r2->rank;
1576 $iter = $self->end->rank - $iter if $dir eq 'successors';
1577 my @candidates;
1578 my @last_checked = ( $r1, $r2 );
1579 my %all_seen;
1580 while( !@candidates ) {
1581 my @new_lc;
1582 foreach my $lc ( @last_checked ) {
1583 foreach my $p ( $lc->$dir ) {
1584 if( $all_seen{$p->id} ) {
1585 push( @candidates, $p );
1586 } else {
1587 $all_seen{$p->id} = 1;
1588 push( @new_lc, $p );
1589 }
1590 }
1591 }
1592 @last_checked = @new_lc;
1593 }
1594 my @answer = sort { $a->rank <=> $b->rank } @candidates;
1595 return $dir eq 'predecessors' ? pop( @answer ) : shift ( @answer );
1596}
1597
63778331 1598sub throw {
1599 Text::Tradition::Error->throw(
1600 'ident' => 'Collation error',
1601 'message' => $_[0],
1602 );
1603}
1604
dd3b58b0 1605no Moose;
1606__PACKAGE__->meta->make_immutable;
e867486f 1607
027d819c 1608=head1 LICENSE
e867486f 1609
027d819c 1610This package is free software and is provided "as is" without express
1611or implied warranty. You can redistribute it and/or modify it under
1612the same terms as Perl itself.
e867486f 1613
027d819c 1614=head1 AUTHOR
e867486f 1615
027d819c 1616Tara L Andrews E<lt>aurum@cpan.orgE<gt>