don't run SVG tests unless dot is installed
[scpubgit/stemmatology.git] / lib / Text / Tradition / Collation.pm
CommitLineData
dd3b58b0 1package Text::Tradition::Collation;
d047cd52 2
910a0a6d 3use Encode qw( decode_utf8 );
4use File::Temp;
bfcbcecb 5use File::Which;
c9bf3dbf 6use Graph;
8e1394aa 7use IPC::Run qw( run binary );
910a0a6d 8use Text::CSV_XS;
b15511bf 9use Text::Tradition::Collation::Reading;
22222af9 10use Text::Tradition::Collation::RelationshipStore;
63778331 11use Text::Tradition::Error;
df6d9812 12use XML::LibXML;
a344ffcf 13use XML::LibXML::XPathContext;
dd3b58b0 14use Moose;
15
3a2ebbf4 16has 'sequence' => (
d047cd52 17 is => 'ro',
3a2ebbf4 18 isa => 'Graph',
19 default => sub { Graph->new() },
d047cd52 20 handles => {
3a2ebbf4 21 paths => 'edges',
d047cd52 22 },
d047cd52 23 );
3a2ebbf4 24
25has 'relations' => (
26 is => 'ro',
22222af9 27 isa => 'Text::Tradition::Collation::RelationshipStore',
28 handles => {
29 relationships => 'relationships',
30 related_readings => 'related_readings',
ee801e17 31 del_relationship => 'del_relationship',
22222af9 32 },
33 writer => '_set_relations',
3a2ebbf4 34 );
dd3b58b0 35
3a2ebbf4 36has 'tradition' => (
37 is => 'ro',
d047cd52 38 isa => 'Text::Tradition',
8d9a1cd8 39 weak_ref => 1,
d047cd52 40 );
dd3b58b0 41
3a2ebbf4 42has 'readings' => (
43 isa => 'HashRef[Text::Tradition::Collation::Reading]',
44 traits => ['Hash'],
45 handles => {
46 reading => 'get',
47 _add_reading => 'set',
48 del_reading => 'delete',
49 has_reading => 'exists',
50 readings => 'values',
51 },
52 default => sub { {} },
53 );
910a0a6d 54
4a8828f0 55has 'wit_list_separator' => (
7854e12e 56 is => 'rw',
57 isa => 'Str',
58 default => ', ',
59 );
60
61has 'baselabel' => (
62 is => 'rw',
63 isa => 'Str',
64 default => 'base text',
65 );
4a8828f0 66
15d2d3df 67has 'linear' => (
68 is => 'rw',
69 isa => 'Bool',
70 default => 1,
71 );
c84275ff 72
ef9d481f 73has 'ac_label' => (
74 is => 'rw',
75 isa => 'Str',
76 default => ' (a.c.)',
77 );
3a2ebbf4 78
79has 'start' => (
80 is => 'ro',
81 isa => 'Text::Tradition::Collation::Reading',
82 writer => '_set_start',
83 weak_ref => 1,
84 );
85
86has 'end' => (
87 is => 'ro',
88 isa => 'Text::Tradition::Collation::Reading',
89 writer => '_set_end',
90 weak_ref => 1,
91 );
b365fbae 92
93has 'cached_svg' => (
94 is => 'rw',
95 isa => 'Str',
96 predicate => 'has_cached_svg',
97 clearer => 'wipe_svg',
98 );
1dd07bda 99
100has 'cached_table' => (
101 is => 'rw',
102 isa => 'HashRef',
103 predicate => 'has_cached_table',
104 clearer => 'wipe_table',
105 );
1f563ac3 106
4e5a7b2c 107=head1 NAME
108
109Text::Tradition::Collation - a software model for a text collation
110
111=head1 SYNOPSIS
112
113 use Text::Tradition;
114 my $t = Text::Tradition->new(
115 'name' => 'this is a text',
116 'input' => 'TEI',
117 'file' => '/path/to/tei_parallel_seg_file.xml' );
118
119 my $c = $t->collation;
120 my @readings = $c->readings;
121 my @paths = $c->paths;
122 my @relationships = $c->relationships;
123
124 my $svg_variant_graph = $t->collation->as_svg();
125
126=head1 DESCRIPTION
127
128Text::Tradition is a library for representation and analysis of collated
129texts, particularly medieval ones. The Collation is the central feature of
130a Tradition, where the text, its sequence of readings, and its relationships
131between readings are actually kept.
132
133=head1 CONSTRUCTOR
134
135=head2 new
136
137The constructor. Takes a hash or hashref of the following arguments:
138
139=over
140
141=item * tradition - The Text::Tradition object to which the collation
142belongs. Required.
143
144=item * linear - Whether the collation should be linear; that is, whether
145transposed readings should be treated as two linked readings rather than one,
146and therefore whether the collation graph is acyclic. Defaults to true.
147
4e5a7b2c 148=item * baselabel - The default label for the path taken by a base text
149(if any). Defaults to 'base text'.
150
151=item * wit_list_separator - The string to join a list of witnesses for
152purposes of making labels in display graphs. Defaults to ', '.
153
154=item * ac_label - The extra label to tack onto a witness sigil when
155representing another layer of path for the given witness - that is, when
156a text has more than one possible reading due to scribal corrections or
157the like. Defaults to ' (a.c.)'.
158
159=back
160
161=head1 ACCESSORS
162
163=head2 tradition
164
165=head2 linear
166
4e5a7b2c 167=head2 wit_list_separator
168
169=head2 baselabel
170
171=head2 ac_label
172
173Simple accessors for collation attributes.
174
175=head2 start
176
177The meta-reading at the start of every witness path.
178
179=head2 end
180
181The meta-reading at the end of every witness path.
182
183=head2 readings
184
185Returns all Reading objects in the graph.
186
187=head2 reading( $id )
188
189Returns the Reading object corresponding to the given ID.
190
191=head2 add_reading( $reading_args )
192
193Adds a new reading object to the collation.
194See L<Text::Tradition::Collation::Reading> for the available arguments.
195
196=head2 del_reading( $object_or_id )
197
198Removes the given reading from the collation, implicitly removing its
199paths and relationships.
200
201=head2 merge_readings( $main, $second )
202
203Merges the $second reading into the $main one.
204The arguments may be either readings or reading IDs.
205
206=head2 has_reading( $id )
207
208Predicate to see whether a given reading ID is in the graph.
209
210=head2 reading_witnesses( $object_or_id )
211
212Returns a list of sigils whose witnesses contain the reading.
213
214=head2 paths
215
216Returns all reading paths within the document - that is, all edges in the
217collation graph. Each path is an arrayref of [ $source, $target ] reading IDs.
218
219=head2 add_path( $source, $target, $sigil )
220
221Links the given readings in the collation in sequence, under the given witness
222sigil. The readings may be specified by object or ID.
223
224=head2 del_path( $source, $target, $sigil )
225
226Links the given readings in the collation in sequence, under the given witness
227sigil. The readings may be specified by object or ID.
228
229=head2 has_path( $source, $target );
230
231Returns true if the two readings are linked in sequence in any witness.
232The readings may be specified by object or ID.
233
234=head2 relationships
235
236Returns all Relationship objects in the collation.
237
238=head2 add_relationship( $reading, $other_reading, $options )
239
240Adds a new relationship of the type given in $options between the two readings,
241which may be specified by object or ID. Returns a value of ( $status, @vectors)
242where $status is true on success, and @vectors is a list of relationship edges
243that were ultimately added.
244See L<Text::Tradition::Collation::Relationship> for the available options.
245
246=cut
dd3b58b0 247
d047cd52 248sub BUILD {
3a2ebbf4 249 my $self = shift;
22222af9 250 $self->_set_relations( Text::Tradition::Collation::RelationshipStore->new( 'collation' => $self ) );
3a2ebbf4 251 $self->_set_start( $self->add_reading( { 'collation' => $self, 'is_start' => 1 } ) );
252 $self->_set_end( $self->add_reading( { 'collation' => $self, 'is_end' => 1 } ) );
d047cd52 253}
784877d9 254
3a2ebbf4 255### Reading construct/destruct functions
256
1dd07bda 257sub _clear_cache {
258 my $self = shift;
259 $self->wipe_svg if $self->has_cached_svg;
260 $self->wipe_table if $self->has_cached_table;
261}
262
3a2ebbf4 263sub add_reading {
264 my( $self, $reading ) = @_;
265 unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
266 my %args = %$reading;
267 $reading = Text::Tradition::Collation::Reading->new(
268 'collation' => $self,
269 %args );
270 }
271 # First check to see if a reading with this ID exists.
272 if( $self->reading( $reading->id ) ) {
63778331 273 throw( "Collation already has a reading with id " . $reading->id );
3a2ebbf4 274 }
1dd07bda 275 $self->_clear_cache;
3a2ebbf4 276 $self->_add_reading( $reading->id => $reading );
277 # Once the reading has been added, put it in both graphs.
278 $self->sequence->add_vertex( $reading->id );
22222af9 279 $self->relations->add_reading( $reading->id );
3a2ebbf4 280 return $reading;
eca16057 281};
282
3a2ebbf4 283around del_reading => sub {
284 my $orig = shift;
285 my $self = shift;
286 my $arg = shift;
287
288 if( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
289 $arg = $arg->id;
290 }
3a2ebbf4 291 # Remove the reading from the graphs.
1dd07bda 292 $self->_clear_cache;
3a2ebbf4 293 $self->sequence->delete_vertex( $arg );
22222af9 294 $self->relations->delete_reading( $arg );
3a2ebbf4 295
296 # Carry on.
297 $self->$orig( $arg );
298};
7854e12e 299
3a2ebbf4 300# merge_readings( $main, $to_be_deleted );
7854e12e 301
3a2ebbf4 302sub merge_readings {
303 my $self = shift;
304
305 # We only need the IDs for adding paths to the graph, not the reading
306 # objects themselves.
49d4f2ac 307 my( $kept, $deleted, $combine_char ) = $self->_stringify_args( @_ );
1dd07bda 308 $self->_clear_cache;
3a2ebbf4 309
310 # The kept reading should inherit the paths and the relationships
311 # of the deleted reading.
312 foreach my $path ( $self->sequence->edges_at( $deleted ) ) {
313 my @vector = ( $kept );
314 push( @vector, $path->[1] ) if $path->[0] eq $deleted;
315 unshift( @vector, $path->[0] ) if $path->[1] eq $deleted;
49d4f2ac 316 next if $vector[0] eq $vector[1]; # Don't add a self loop
3a2ebbf4 317 my %wits = %{$self->sequence->get_edge_attributes( @$path )};
318 $self->sequence->add_edge( @vector );
319 my $fwits = $self->sequence->get_edge_attributes( @vector );
320 @wits{keys %$fwits} = values %$fwits;
321 $self->sequence->set_edge_attributes( @vector, \%wits );
322 }
22222af9 323 $self->relations->merge_readings( $kept, $deleted, $combine_char );
3a2ebbf4 324
325 # Do the deletion deed.
49d4f2ac 326 if( $combine_char ) {
327 my $kept_obj = $self->reading( $kept );
328 my $new_text = join( $combine_char, $kept_obj->text,
329 $self->reading( $deleted )->text );
330 $kept_obj->alter_text( $new_text );
331 }
3a2ebbf4 332 $self->del_reading( $deleted );
333}
7854e12e 334
3265b0ce 335
3a2ebbf4 336# Helper function for manipulating the graph.
337sub _stringify_args {
338 my( $self, $first, $second, $arg ) = @_;
339 $first = $first->id
340 if ref( $first ) eq 'Text::Tradition::Collation::Reading';
341 $second = $second->id
342 if ref( $second ) eq 'Text::Tradition::Collation::Reading';
343 return( $first, $second, $arg );
344}
df6d9812 345
4e5a7b2c 346# Helper function for manipulating the graph.
347sub _objectify_args {
348 my( $self, $first, $second, $arg ) = @_;
349 $first = $self->reading( $first )
350 unless ref( $first ) eq 'Text::Tradition::Collation::Reading';
351 $second = $self->reading( $second )
352 unless ref( $second ) eq 'Text::Tradition::Collation::Reading';
353 return( $first, $second, $arg );
354}
3a2ebbf4 355### Path logic
356
357sub add_path {
358 my $self = shift;
359
360 # We only need the IDs for adding paths to the graph, not the reading
361 # objects themselves.
362 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
363
1dd07bda 364 $self->_clear_cache;
3a2ebbf4 365 # Connect the readings
366 $self->sequence->add_edge( $source, $target );
367 # Note the witness in question
368 $self->sequence->set_edge_attribute( $source, $target, $wit, 1 );
b15511bf 369};
370
3a2ebbf4 371sub del_path {
372 my $self = shift;
49d4f2ac 373 my @args;
374 if( ref( $_[0] ) eq 'ARRAY' ) {
375 my $e = shift @_;
376 @args = ( @$e, @_ );
377 } else {
378 @args = @_;
379 }
3a2ebbf4 380
381 # We only need the IDs for adding paths to the graph, not the reading
382 # objects themselves.
49d4f2ac 383 my( $source, $target, $wit ) = $self->_stringify_args( @args );
3a2ebbf4 384
1dd07bda 385 $self->_clear_cache;
3a2ebbf4 386 if( $self->sequence->has_edge_attribute( $source, $target, $wit ) ) {
49d4f2ac 387 $self->sequence->delete_edge_attribute( $source, $target, $wit );
3a2ebbf4 388 }
389 unless( keys %{$self->sequence->get_edge_attributes( $source, $target )} ) {
390 $self->sequence->delete_edge( $source, $target );
391 }
784877d9 392}
393
3a2ebbf4 394
15d2d3df 395# Extra graph-alike utility
396sub has_path {
3a2ebbf4 397 my $self = shift;
398 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
399 return undef unless $self->sequence->has_edge( $source, $target );
400 return $self->sequence->has_edge_attribute( $source, $target, $wit );
b15511bf 401}
402
4e5a7b2c 403=head2 clear_witness( @sigil_list )
3a2ebbf4 404
4e5a7b2c 405Clear the given witnesses out of the collation entirely, removing references
406to them in paths, and removing readings that belong only to them. Should only
407be called via $tradition->del_witness.
3a2ebbf4 408
409=cut
410
4e5a7b2c 411sub clear_witness {
412 my( $self, @sigils ) = @_;
413
1dd07bda 414 $self->_clear_cache;
4e5a7b2c 415 # Clear the witness(es) out of the paths
416 foreach my $e ( $self->paths ) {
417 foreach my $sig ( @sigils ) {
418 $self->del_path( $e, $sig );
419 }
420 }
421
422 # Clear out the newly unused readings
423 foreach my $r ( $self->readings ) {
424 unless( $self->reading_witnesses( $r ) ) {
425 $self->del_reading( $r );
426 }
427 }
428}
3a2ebbf4 429
430sub add_relationship {
431 my $self = shift;
22222af9 432 my( $source, $target, $opts ) = $self->_stringify_args( @_ );
63778331 433 my( @vectors ) = $self->relations->add_relationship( $source,
64ae6270 434 $self->reading( $source ), $target, $self->reading( $target ), $opts );
435 # Force a full rank recalculation every time. Yuck.
63778331 436 $self->calculate_ranks() if $self->end->has_rank;
1dd07bda 437 $self->_clear_cache;
63778331 438 return @vectors;
22222af9 439}
ef9d481f 440
22222af9 441=head2 reading_witnesses( $reading )
910a0a6d 442
22222af9 443Return a list of sigils corresponding to the witnesses in which the reading appears.
3265b0ce 444
22222af9 445=cut
1d310495 446
1d310495 447sub reading_witnesses {
448 my( $self, $reading ) = @_;
449 # We need only check either the incoming or the outgoing edges; I have
96dc90ec 450 # arbitrarily chosen "incoming". Thus, special-case the start node.
451 if( $reading eq $self->start ) {
452 return map { $_->sigil } $self->tradition->witnesses;
453 }
1d310495 454 my %all_witnesses;
455 foreach my $e ( $self->sequence->edges_to( $reading ) ) {
456 my $wits = $self->sequence->get_edge_attributes( @$e );
457 @all_witnesses{ keys %$wits } = 1;
458 }
c12bb878 459 my $acstr = $self->ac_label;
460 foreach my $acwit ( grep { $_ =~ s/^(.*)\Q$acstr\E$/$1/ } keys %all_witnesses ) {
461 delete $all_witnesses{$acwit.$acstr} if exists $all_witnesses{$acwit};
462 }
1d310495 463 return keys %all_witnesses;
910a0a6d 464}
465
4e5a7b2c 466=head1 OUTPUT METHODS
8e1394aa 467
0ecb975c 468=head2 as_svg( \%options )
8e1394aa 469
0068967c 470Returns an SVG string that represents the graph, via as_dot and graphviz.
bfcbcecb 471See as_dot for a list of options. Must have GraphViz (dot) installed to run.
8e1394aa 472
473=cut
474
475sub as_svg {
0ecb975c 476 my( $self, $opts ) = @_;
bfcbcecb 477 throw( "Need GraphViz installed to output SVG" )
478 unless File::Which::which( 'dot' );
e247aad1 479 my $want_subgraph = exists $opts->{'from'} || exists $opts->{'to'};
480 if( !$self->has_cached_svg || $opts->{'recalc'} || $want_subgraph ) {
b365fbae 481 my @cmd = qw/dot -Tsvg/;
482 my( $svg, $err );
483 my $dotfile = File::Temp->new();
484 ## USE FOR DEBUGGING
485 # $dotfile->unlink_on_destroy(0);
486 binmode $dotfile, ':utf8';
487 print $dotfile $self->as_dot( $opts );
488 push( @cmd, $dotfile->filename );
489 run( \@cmd, ">", binary(), \$svg );
e247aad1 490 $svg = decode_utf8( $svg );
491 $self->cached_svg( $svg ) unless $want_subgraph;
492 return $svg;
493 } else {
494 return $self->cached_svg;
b365fbae 495 }
8e1394aa 496}
497
b22576c6 498
0ecb975c 499=head2 as_dot( \%options )
b22576c6 500
0ecb975c 501Returns a string that is the collation graph expressed in dot
502(i.e. GraphViz) format. Options include:
b22576c6 503
0ecb975c 504=over 4
b22576c6 505
0ecb975c 506=item * from
b22576c6 507
0ecb975c 508=item * to
df6d9812 509
0ecb975c 510=item * color_common
511
512=back
df6d9812 513
514=cut
515
516sub as_dot {
0ecb975c 517 my( $self, $opts ) = @_;
518 my $startrank = $opts->{'from'} if $opts;
519 my $endrank = $opts->{'to'} if $opts;
520 my $color_common = $opts->{'color_common'} if $opts;
b365fbae 521 my $STRAIGHTENHACK = !$startrank && !$endrank && $self->end->rank
522 && $self->end->rank > 100;
523
b22576c6 524 # Check the arguments
525 if( $startrank ) {
526 return if $endrank && $startrank > $endrank;
527 return if $startrank > $self->end->rank;
528 }
529 if( defined $endrank ) {
530 return if $endrank < 0;
f1b3b33a 531 $endrank = undef if $endrank == $self->end->rank;
b22576c6 532 }
533
67da8d6c 534 my $graph_name = $self->tradition->name;
535 $graph_name =~ s/[^\w\s]//g;
536 $graph_name = join( '_', split( /\s+/, $graph_name ) );
f13b5582 537
538 my %graph_attrs = (
539 'rankdir' => 'LR',
540 'bgcolor' => 'none',
541 );
542 my %node_attrs = (
b8990398 543 'fontsize' => 14,
f13b5582 544 'fillcolor' => 'white',
545 'style' => 'filled',
546 'shape' => 'ellipse'
547 );
548 my %edge_attrs = (
549 'arrowhead' => 'open',
550 'color' => '#000000',
551 'fontcolor' => '#000000',
552 );
553
67da8d6c 554 my $dot = sprintf( "digraph %s {\n", $graph_name );
f13b5582 555 $dot .= "\tgraph " . _dot_attr_string( \%graph_attrs ) . ";\n";
556 $dot .= "\tnode " . _dot_attr_string( \%node_attrs ) . ";\n";
df6d9812 557
b22576c6 558 # Output substitute start/end readings if necessary
559 if( $startrank ) {
560 $dot .= "\t\"#SUBSTART#\" [ label=\"...\" ];\n";
561 }
562 if( $endrank ) {
563 $dot .= "\t\"#SUBEND#\" [ label=\"...\" ];\n";
564 }
b365fbae 565 if( $STRAIGHTENHACK ) {
566 ## HACK part 1
567 $dot .= "\tsubgraph { rank=same \"#START#\" \"#SILENT#\" }\n";
568 $dot .= "\t\"#SILENT#\" [ shape=diamond,color=white,penwidth=0,label=\"\" ];"
569 }
b22576c6 570 my %used; # Keep track of the readings that actually appear in the graph
30ddc24c 571 # Sort the readings by rank if we have ranks; this speeds layout.
572 my @all_readings = $self->end->has_rank
573 ? sort { $a->rank <=> $b->rank } $self->readings
574 : $self->readings;
4633f9e4 575 # TODO Refrain from outputting lacuna nodes - just grey out the edges.
30ddc24c 576 foreach my $reading ( @all_readings ) {
b22576c6 577 # Only output readings within our rank range.
578 next if $startrank && $reading->rank < $startrank;
579 next if $endrank && $reading->rank > $endrank;
580 $used{$reading->id} = 1;
910a0a6d 581 # Need not output nodes without separate labels
3a2ebbf4 582 next if $reading->id eq $reading->text;
d4b75f44 583 my $rattrs;
30f0df34 584 my $label = $reading->text;
8f9cab7b 585 $label =~ s/\"/\\\"/g;
d4b75f44 586 $rattrs->{'label'} = $label;
0ecb975c 587 $rattrs->{'fillcolor'} = '#b3f36d' if $reading->is_common && $color_common;
d4b75f44 588 $dot .= sprintf( "\t\"%s\" %s;\n", $reading->id, _dot_attr_string( $rattrs ) );
df6d9812 589 }
3a2ebbf4 590
30ddc24c 591 # Add the real edges. Need to weight one edge per rank jump, in a
592 # continuous line.
b365fbae 593 # my $weighted = $self->_add_edge_weights;
b22576c6 594 my @edges = $self->paths;
3bdec618 595 my( %substart, %subend );
b22576c6 596 foreach my $edge ( @edges ) {
597 # Do we need to output this edge?
508fd430 598 if( $used{$edge->[0]} && $used{$edge->[1]} ) {
027d819c 599 my $label = $self->_path_display_label( $self->path_witnesses( $edge ) );
f13b5582 600 my $variables = { %edge_attrs, 'label' => $label };
30ddc24c 601
b22576c6 602 # Account for the rank gap if necessary
30ddc24c 603 my $rank0 = $self->reading( $edge->[0] )->rank
604 if $self->reading( $edge->[0] )->has_rank;
605 my $rank1 = $self->reading( $edge->[1] )->rank
606 if $self->reading( $edge->[1] )->has_rank;
607 if( defined $rank0 && defined $rank1 && $rank1 - $rank0 > 1 ) {
608 $variables->{'minlen'} = $rank1 - $rank0;
609 }
610
611 # Add the calculated edge weights
b365fbae 612 # if( exists $weighted->{$edge->[0]}
e247aad1 613 # && $weighted->{$edge->[0]} eq $edge->[1] ) {
614 # # $variables->{'color'} = 'red';
615 # $variables->{'weight'} = 3.0;
616 # }
30ddc24c 617
508fd430 618 # EXPERIMENTAL: make edge width reflect no. of witnesses
619 my $extrawidth = scalar( $self->path_witnesses( $edge ) ) * 0.2;
620 $variables->{'penwidth'} = $extrawidth + 0.8; # gives 1 for a single wit
621
f13b5582 622 my $varopts = _dot_attr_string( $variables );
623 $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
624 $edge->[0], $edge->[1], $varopts );
3bdec618 625 } elsif( $used{$edge->[0]} ) {
626 $subend{$edge->[0]} = 1;
627 } elsif( $used{$edge->[1]} ) {
628 $substart{$edge->[1]} = 1;
b22576c6 629 }
df6d9812 630 }
3bdec618 631 # Add substitute start and end edges if necessary
632 foreach my $node ( keys %substart ) {
027d819c 633 my $witstr = $self->_path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) );
f13b5582 634 my $variables = { %edge_attrs, 'label' => $witstr };
635 my $varopts = _dot_attr_string( $variables );
636 $dot .= "\t\"#SUBSTART#\" -> \"$node\" $varopts;";
3bdec618 637 }
638 foreach my $node ( keys %subend ) {
027d819c 639 my $witstr = $self->_path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) );
f13b5582 640 my $variables = { %edge_attrs, 'label' => $witstr };
641 my $varopts = _dot_attr_string( $variables );
642 $dot .= "\t\"$node\" -> \"#SUBEND#\" $varopts;";
3bdec618 643 }
b365fbae 644 # HACK part 2
645 if( $STRAIGHTENHACK ) {
646 $dot .= "\t\"#END#\" -> \"#SILENT#\" [ color=white,penwidth=0 ];\n";
647 }
30ddc24c 648
df6d9812 649 $dot .= "}\n";
650 return $dot;
651}
652
f13b5582 653sub _dot_attr_string {
654 my( $hash ) = @_;
655 my @attrs;
656 foreach my $k ( sort keys %$hash ) {
657 my $v = $hash->{$k};
658 push( @attrs, $k.'="'.$v.'"' );
659 }
660 return( '[ ' . join( ', ', @attrs ) . ' ]' );
661}
662
30ddc24c 663sub _add_edge_weights {
664 my $self = shift;
665 # Walk the graph from START to END, choosing the successor node with
666 # the largest number of witness paths each time.
667 my $weighted = {};
668 my $curr = $self->start->id;
008fc8a6 669 my $ranked = $self->end->has_rank;
30ddc24c 670 while( $curr ne $self->end->id ) {
008fc8a6 671 my $rank = $ranked ? $self->reading( $curr )->rank : 0;
30ddc24c 672 my @succ = sort { $self->path_witnesses( $curr, $a )
673 <=> $self->path_witnesses( $curr, $b ) }
674 $self->sequence->successors( $curr );
675 my $next = pop @succ;
008fc8a6 676 my $nextrank = $ranked ? $self->reading( $next )->rank : 0;
30ddc24c 677 # Try to avoid lacunae in the weighted path.
008fc8a6 678 while( @succ &&
679 ( $self->reading( $next )->is_lacuna ||
680 $nextrank - $rank > 1 ) ){
30ddc24c 681 $next = pop @succ;
682 }
683 $weighted->{$curr} = $next;
684 $curr = $next;
685 }
686 return $weighted;
687}
688
027d819c 689=head2 path_witnesses( $edge )
690
691Returns the list of sigils whose witnesses are associated with the given edge.
692The edge can be passed as either an array or an arrayref of ( $source, $target ).
693
694=cut
695
3a2ebbf4 696sub path_witnesses {
697 my( $self, @edge ) = @_;
698 # If edge is an arrayref, cope.
699 if( @edge == 1 && ref( $edge[0] ) eq 'ARRAY' ) {
700 my $e = shift @edge;
701 @edge = @$e;
702 }
703 my @wits = keys %{$self->sequence->get_edge_attributes( @edge )};
508fd430 704 return @wits;
3a2ebbf4 705}
706
027d819c 707sub _path_display_label {
508fd430 708 my $self = shift;
709 my @wits = sort @_;
8f9cab7b 710 my $maj = scalar( $self->tradition->witnesses ) * 0.6;
711 if( scalar @wits > $maj ) {
f13b5582 712 # TODO break out a.c. wits
8f9cab7b 713 return 'majority';
714 } else {
715 return join( ', ', @wits );
716 }
717}
1dd07bda 718
719=head2 witnesses_at_rank
720
721Returns a list of witnesses that are not lacunose, for a given rank.
722
723=cut
724
725sub witnesses_at_rank {
726 my( $self, $rank ) = @_;
727}
8f9cab7b 728
4e5a7b2c 729=head2 as_graphml
8e1394aa 730
4e5a7b2c 731Returns a GraphML representation of the collation. The GraphML will contain
732two graphs. The first expresses the attributes of the readings and the witness
733paths that link them; the second expresses the relationships that link the
734readings. This is the native transfer format for a tradition.
8e1394aa 735
56eefa04 736=begin testing
737
738use Text::Tradition;
739
740my $READINGS = 311;
741my $PATHS = 361;
742
743my $datafile = 't/data/florilegium_tei_ps.xml';
744my $tradition = Text::Tradition->new( 'input' => 'TEI',
745 'name' => 'test0',
746 'file' => $datafile,
747 'linear' => 1 );
748
749ok( $tradition, "Got a tradition object" );
750is( scalar $tradition->witnesses, 13, "Found all witnesses" );
751ok( $tradition->collation, "Tradition has a collation" );
752
753my $c = $tradition->collation;
754is( scalar $c->readings, $READINGS, "Collation has all readings" );
755is( scalar $c->paths, $PATHS, "Collation has all paths" );
756is( scalar $c->relationships, 0, "Collation has all relationships" );
757
758# Add a few relationships
759$c->add_relationship( 'w123', 'w125', { 'type' => 'collated' } );
760$c->add_relationship( 'w193', 'w196', { 'type' => 'collated' } );
761$c->add_relationship( 'w257', 'w262', { 'type' => 'transposition' } );
762
763# Now write it to GraphML and parse it again.
764
765my $graphml = $c->as_graphml;
766my $st = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml );
767is( scalar $st->collation->readings, $READINGS, "Reparsed collation has all readings" );
768is( scalar $st->collation->paths, $PATHS, "Reparsed collation has all paths" );
769is( scalar $st->collation->relationships, 3, "Reparsed collation has new relationships" );
770
771=end testing
772
8e1394aa 773=cut
774
775sub as_graphml {
3a2ebbf4 776 my( $self ) = @_;
8e1394aa 777
778 # Some namespaces
779 my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns';
780 my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance';
781 my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' .
910a0a6d 782 'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
8e1394aa 783
784 # Create the document and root node
785 my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
786 my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
787 $graphml->setDocumentElement( $root );
788 $root->setNamespace( $xsi_ns, 'xsi', 0 );
789 $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
790
e309421a 791 # Add the data keys for the graph
792 my %graph_data_keys;
793 my $gdi = 0;
1d310495 794 my @graph_attributes = qw/ version wit_list_separator baselabel linear ac_label /;
e309421a 795 foreach my $datum ( @graph_attributes ) {
796 $graph_data_keys{$datum} = 'dg'.$gdi++;
797 my $key = $root->addNewChild( $graphml_ns, 'key' );
798 $key->setAttribute( 'attr.name', $datum );
799 $key->setAttribute( 'attr.type', $key eq 'linear' ? 'boolean' : 'string' );
800 $key->setAttribute( 'for', 'graph' );
801 $key->setAttribute( 'id', $graph_data_keys{$datum} );
802 }
f6066bac 803
8e1394aa 804 # Add the data keys for nodes
ef9d481f 805 my %node_data_keys;
806 my $ndi = 0;
3a2ebbf4 807 my %node_data = (
808 id => 'string',
255875b8 809 text => 'string',
3a2ebbf4 810 rank => 'string',
811 is_start => 'boolean',
812 is_end => 'boolean',
813 is_lacuna => 'boolean',
814 );
815 foreach my $datum ( keys %node_data ) {
910a0a6d 816 $node_data_keys{$datum} = 'dn'.$ndi++;
817 my $key = $root->addNewChild( $graphml_ns, 'key' );
818 $key->setAttribute( 'attr.name', $datum );
3a2ebbf4 819 $key->setAttribute( 'attr.type', $node_data{$datum} );
910a0a6d 820 $key->setAttribute( 'for', 'node' );
821 $key->setAttribute( 'id', $node_data_keys{$datum} );
8e1394aa 822 }
823
df6d9812 824 # Add the data keys for edges, i.e. witnesses
ef9d481f 825 my $edi = 0;
826 my %edge_data_keys;
3a2ebbf4 827 my %edge_data = (
f523c7a8 828 class => 'string', # Class, deprecated soon
3a2ebbf4 829 witness => 'string', # ID/label for a path
830 relationship => 'string', # ID/label for a relationship
831 extra => 'boolean', # Path key
c84275ff 832 scope => 'string', # Relationship key
3a2ebbf4 833 non_correctable => 'boolean', # Relationship key
834 non_independent => 'boolean', # Relationship key
835 );
836 foreach my $datum ( keys %edge_data ) {
837 $edge_data_keys{$datum} = 'de'.$edi++;
910a0a6d 838 my $key = $root->addNewChild( $graphml_ns, 'key' );
3a2ebbf4 839 $key->setAttribute( 'attr.name', $datum );
840 $key->setAttribute( 'attr.type', $edge_data{$datum} );
910a0a6d 841 $key->setAttribute( 'for', 'edge' );
3a2ebbf4 842 $key->setAttribute( 'id', $edge_data_keys{$datum} );
8e1394aa 843 }
3a2ebbf4 844
22222af9 845 # Add the collation graph itself
2c669bca 846 my $sgraph = $root->addNewChild( $graphml_ns, 'graph' );
847 $sgraph->setAttribute( 'edgedefault', 'directed' );
848 $sgraph->setAttribute( 'id', $self->tradition->name );
849 $sgraph->setAttribute( 'parse.edgeids', 'canonical' );
850 $sgraph->setAttribute( 'parse.edges', scalar($self->paths) );
851 $sgraph->setAttribute( 'parse.nodeids', 'canonical' );
852 $sgraph->setAttribute( 'parse.nodes', scalar($self->readings) );
853 $sgraph->setAttribute( 'parse.order', 'nodesfirst' );
22222af9 854
e309421a 855 # Collation attribute data
856 foreach my $datum ( @graph_attributes ) {
2c669bca 857 my $value = $datum eq 'version' ? '3.0' : $self->$datum;
858 _add_graphml_data( $sgraph, $graph_data_keys{$datum}, $value );
e309421a 859 }
8e1394aa 860
861 my $node_ctr = 0;
862 my %node_hash;
22222af9 863 # Add our readings to the graph
3a2ebbf4 864 foreach my $n ( sort { $a->id cmp $b->id } $self->readings ) {
2c669bca 865 # Add to the main graph
866 my $node_el = $sgraph->addNewChild( $graphml_ns, 'node' );
910a0a6d 867 my $node_xmlid = 'n' . $node_ctr++;
3a2ebbf4 868 $node_hash{ $n->id } = $node_xmlid;
910a0a6d 869 $node_el->setAttribute( 'id', $node_xmlid );
255875b8 870 foreach my $d ( keys %node_data ) {
871 my $nval = $n->$d;
872 _add_graphml_data( $node_el, $node_data_keys{$d}, $nval )
873 if defined $nval;
874 }
b15511bf 875 }
876
2c669bca 877 # Add the path edges to the sequence graph
df6d9812 878 my $edge_ctr = 0;
3a2ebbf4 879 foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->sequence->edges() ) {
880 # We add an edge in the graphml for every witness in $e.
508fd430 881 foreach my $wit ( sort $self->path_witnesses( $e ) ) {
3a2ebbf4 882 my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
883 $node_hash{ $e->[0] },
884 $node_hash{ $e->[1] } );
2c669bca 885 my $edge_el = $sgraph->addNewChild( $graphml_ns, 'edge' );
3a2ebbf4 886 $edge_el->setAttribute( 'source', $from );
887 $edge_el->setAttribute( 'target', $to );
888 $edge_el->setAttribute( 'id', $id );
3a2ebbf4 889
890 # It's a witness path, so add the witness
891 my $base = $wit;
892 my $key = $edge_data_keys{'witness'};
893 # Is this an ante-corr witness?
894 my $aclabel = $self->ac_label;
895 if( $wit =~ /^(.*)\Q$aclabel\E$/ ) {
896 # Keep the base witness
897 $base = $1;
898 # ...and record that this is an 'extra' reading path
899 _add_graphml_data( $edge_el, $edge_data_keys{'extra'}, $aclabel );
900 }
901 _add_graphml_data( $edge_el, $edge_data_keys{'witness'}, $base );
f523c7a8 902 _add_graphml_data( $edge_el, $edge_data_keys{'class'}, 'path' );
3a2ebbf4 903 }
904 }
905
22222af9 906 # Add the relationship graph to the XML
027d819c 907 $self->relations->_as_graphml( $graphml_ns, $root, \%node_hash,
2626f709 908 $node_data_keys{'id'}, \%edge_data_keys );
8e1394aa 909
94c00c71 910 # Save and return the thing
911 my $result = decode_utf8( $graphml->toString(1) );
94c00c71 912 return $result;
df6d9812 913}
914
b15511bf 915sub _add_graphml_data {
916 my( $el, $key, $value ) = @_;
b15511bf 917 return unless defined $value;
c9bf3dbf 918 my $data_el = $el->addNewChild( $el->namespaceURI, 'data' );
b15511bf 919 $data_el->setAttribute( 'key', $key );
920 $data_el->appendText( $value );
8e1394aa 921}
922
4e5a7b2c 923=head2 as_csv
910a0a6d 924
925Returns a CSV alignment table representation of the collation graph, one
2c669bca 926row per witness (or witness uncorrected.)
910a0a6d 927
928=cut
929
930sub as_csv {
3a2ebbf4 931 my( $self ) = @_;
1dd07bda 932 my $table = $self->alignment_table;
910a0a6d 933 my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } );
934 my @result;
2c669bca 935 # Make the header row
936 $csv->combine( map { $_->{'witness'} } @{$table->{'alignment'}} );
937 push( @result, decode_utf8( $csv->string ) );
938 # Make the rest of the rows
939 foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
566f4595 940 my @rowobjs = map { $_->{'tokens'}->[$idx] } @{$table->{'alignment'}};
1dd07bda 941 my @row = map { $_ ? $_->{'t'}->text : $_ } @rowobjs;
2c669bca 942 $csv->combine( @row );
910a0a6d 943 push( @result, decode_utf8( $csv->string ) );
944 }
3a2ebbf4 945 return join( "\n", @result );
910a0a6d 946}
947
1dd07bda 948=head2 alignment_table( $use_refs, $include_witnesses )
2c669bca 949
566f4595 950Return a reference to an alignment table, in a slightly enhanced CollateX
951format which looks like this:
952
953 $table = { alignment => [ { witness => "SIGIL",
4e5a7b2c 954 tokens => [ { t => "TEXT" }, ... ] },
566f4595 955 { witness => "SIG2",
4e5a7b2c 956 tokens => [ { t => "TEXT" }, ... ] },
566f4595 957 ... ],
958 length => TEXTLEN };
959
960If $use_refs is set to 1, the reading object is returned in the table
961instead of READINGTEXT; if not, the text of the reading is returned.
4e5a7b2c 962
963If $include_witnesses is set to a hashref, only the witnesses whose sigil
566f4595 964keys have a true hash value will be included.
2c669bca 965
966=cut
9f3ba6f7 967
1dd07bda 968sub alignment_table {
969 my( $self ) = @_;
970 my $include; # see if we can ditch this
971 return $self->cached_table if $self->has_cached_table;
972
0ecb975c 973 # Make sure we can do this
974 throw( "Need a linear graph in order to make an alignment table" )
975 unless $self->linear;
976 $self->calculate_ranks unless $self->end->has_rank;
977
2c669bca 978 my $table = { 'alignment' => [], 'length' => $self->end->rank - 1 };
3a2ebbf4 979 my @all_pos = ( 1 .. $self->end->rank - 1 );
68454b71 980 foreach my $wit ( sort { $a->sigil cmp $b->sigil } $self->tradition->witnesses ) {
2c669bca 981 if( $include ) {
566f4595 982 next unless $include->{$wit->sigil};
2c669bca 983 }
eca16057 984 # print STDERR "Making witness row(s) for " . $wit->sigil . "\n";
1f7aa795 985 my @wit_path = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
1dd07bda 986 my @row = _make_witness_row( \@wit_path, \@all_pos );
2c669bca 987 push( @{$table->{'alignment'}},
988 { 'witness' => $wit->sigil, 'tokens' => \@row } );
1f7aa795 989 if( $wit->is_layered ) {
990 my @wit_ac_path = $self->reading_sequence( $self->start, $self->end,
861c3e27 991 $wit->sigil.$self->ac_label );
1dd07bda 992 my @ac_row = _make_witness_row( \@wit_ac_path, \@all_pos );
2c669bca 993 push( @{$table->{'alignment'}},
994 { 'witness' => $wit->sigil.$self->ac_label, 'tokens' => \@ac_row } );
910a0a6d 995 }
996 }
1dd07bda 997 $self->cached_table( $table );
998 return $table;
910a0a6d 999}
1000
1001sub _make_witness_row {
1dd07bda 1002 my( $path, $positions ) = @_;
910a0a6d 1003 my %char_hash;
1004 map { $char_hash{$_} = undef } @$positions;
2c669bca 1005 my $debug = 0;
910a0a6d 1006 foreach my $rdg ( @$path ) {
eca16057 1007 my $rtext = $rdg->text;
1008 $rtext = '#LACUNA#' if $rdg->is_lacuna;
2c669bca 1009 print STDERR "rank " . $rdg->rank . "\n" if $debug;
3a2ebbf4 1010 # print STDERR "No rank for " . $rdg->id . "\n" unless defined $rdg->rank;
1dd07bda 1011 $char_hash{$rdg->rank} = { 't' => $rdg };
910a0a6d 1012 }
1013 my @row = map { $char_hash{$_} } @$positions;
eca16057 1014 # Fill in lacuna markers for undef spots in the row
1015 my $last_el = shift @row;
1016 my @filled_row = ( $last_el );
1017 foreach my $el ( @row ) {
0e476982 1018 # If we are using node reference, make the lacuna node appear many times
1019 # in the table. If not, use the lacuna tag.
1dd07bda 1020 if( $last_el && $last_el->{'t'}->is_lacuna && !defined $el ) {
1021 $el = $last_el;
eca16057 1022 }
1023 push( @filled_row, $el );
1024 $last_el = $el;
1025 }
1026 return @filled_row;
910a0a6d 1027}
1028
4e5a7b2c 1029=head1 NAVIGATION METHODS
910a0a6d 1030
4e5a7b2c 1031=head2 reading_sequence( $first, $last, $sigil, $backup )
e2902068 1032
1033Returns the ordered list of readings, starting with $first and ending
4e5a7b2c 1034with $last, for the witness given in $sigil. If a $backup sigil is
1035specified (e.g. when walking a layered witness), it will be used wherever
1036no $sigil path exists. If there is a base text reading, that will be
1037used wherever no path exists for $sigil or $backup.
e2902068 1038
1039=cut
1040
910a0a6d 1041# TODO Think about returning some lazy-eval iterator.
b0b4421a 1042# TODO Get rid of backup; we should know from what witness is whether we need it.
910a0a6d 1043
e2902068 1044sub reading_sequence {
861c3e27 1045 my( $self, $start, $end, $witness ) = @_;
e2902068 1046
930ff666 1047 $witness = $self->baselabel unless $witness;
e2902068 1048 my @readings = ( $start );
1049 my %seen;
1050 my $n = $start;
3a2ebbf4 1051 while( $n && $n->id ne $end->id ) {
1052 if( exists( $seen{$n->id} ) ) {
63778331 1053 throw( "Detected loop for $witness at " . $n->id );
910a0a6d 1054 }
3a2ebbf4 1055 $seen{$n->id} = 1;
910a0a6d 1056
861c3e27 1057 my $next = $self->next_reading( $n, $witness );
44771cf2 1058 unless( $next ) {
63778331 1059 throw( "Did not find any path for $witness from reading " . $n->id );
44771cf2 1060 }
910a0a6d 1061 push( @readings, $next );
1062 $n = $next;
e2902068 1063 }
1064 # Check that the last reading is our end reading.
1065 my $last = $readings[$#readings];
63778331 1066 throw( "Last reading found from " . $start->text .
1067 " for witness $witness is not the end!" ) # TODO do we get this far?
3a2ebbf4 1068 unless $last->id eq $end->id;
e2902068 1069
1070 return @readings;
1071}
1072
4e5a7b2c 1073=head2 next_reading( $reading, $sigil );
8e1394aa 1074
4a8828f0 1075Returns the reading that follows the given reading along the given witness
930ff666 1076path.
8e1394aa 1077
1078=cut
1079
4a8828f0 1080sub next_reading {
e2902068 1081 # Return the successor via the corresponding path.
8e1394aa 1082 my $self = shift;
3a2ebbf4 1083 my $answer = $self->_find_linked_reading( 'next', @_ );
2c669bca 1084 return undef unless $answer;
3a2ebbf4 1085 return $self->reading( $answer );
8e1394aa 1086}
1087
4e5a7b2c 1088=head2 prior_reading( $reading, $sigil )
8e1394aa 1089
4a8828f0 1090Returns the reading that precedes the given reading along the given witness
930ff666 1091path.
8e1394aa 1092
1093=cut
1094
4a8828f0 1095sub prior_reading {
e2902068 1096 # Return the predecessor via the corresponding path.
8e1394aa 1097 my $self = shift;
3a2ebbf4 1098 my $answer = $self->_find_linked_reading( 'prior', @_ );
1099 return $self->reading( $answer );
8e1394aa 1100}
1101
4a8828f0 1102sub _find_linked_reading {
861c3e27 1103 my( $self, $direction, $node, $path ) = @_;
1104
1105 # Get a backup if we are dealing with a layered witness
1106 my $alt_path;
1107 my $aclabel = $self->ac_label;
1108 if( $path && $path =~ /^(.*)\Q$aclabel\E$/ ) {
1109 $alt_path = $1;
1110 }
1111
e2902068 1112 my @linked_paths = $direction eq 'next'
3a2ebbf4 1113 ? $self->sequence->edges_from( $node )
1114 : $self->sequence->edges_to( $node );
e2902068 1115 return undef unless scalar( @linked_paths );
8e1394aa 1116
e2902068 1117 # We have to find the linked path that contains all of the
1118 # witnesses supplied in $path.
1119 my( @path_wits, @alt_path_wits );
4e5a7b2c 1120 @path_wits = sort( $self->_witnesses_of_label( $path ) ) if $path;
1121 @alt_path_wits = sort( $self->_witnesses_of_label( $alt_path ) ) if $alt_path;
e2902068 1122 my $base_le;
1123 my $alt_le;
1124 foreach my $le ( @linked_paths ) {
3a2ebbf4 1125 if( $self->sequence->has_edge_attribute( @$le, $self->baselabel ) ) {
910a0a6d 1126 $base_le = $le;
910a0a6d 1127 }
508fd430 1128 my @le_wits = sort $self->path_witnesses( $le );
3a2ebbf4 1129 if( _is_within( \@path_wits, \@le_wits ) ) {
1130 # This is the right path.
1131 return $direction eq 'next' ? $le->[1] : $le->[0];
1132 } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) {
1133 $alt_le = $le;
1134 }
8e1394aa 1135 }
e2902068 1136 # Got this far? Return the alternate path if it exists.
3a2ebbf4 1137 return $direction eq 'next' ? $alt_le->[1] : $alt_le->[0]
910a0a6d 1138 if $alt_le;
e2902068 1139
1140 # Got this far? Return the base path if it exists.
3a2ebbf4 1141 return $direction eq 'next' ? $base_le->[1] : $base_le->[0]
910a0a6d 1142 if $base_le;
e2902068 1143
1144 # Got this far? We have no appropriate path.
2c669bca 1145 warn "Could not find $direction node from " . $node->id
910a0a6d 1146 . " along path $path";
8e1394aa 1147 return undef;
1148}
1149
4a8828f0 1150# Some set logic.
1151sub _is_within {
1152 my( $set1, $set2 ) = @_;
7854e12e 1153 my $ret = @$set1; # will be 0, i.e. false, if set1 is empty
4a8828f0 1154 foreach my $el ( @$set1 ) {
910a0a6d 1155 $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
4a8828f0 1156 }
1157 return $ret;
1158}
1159
4e5a7b2c 1160# Return the string that joins together a list of witnesses for
1161# display on a single path.
1162sub _witnesses_of_label {
1163 my( $self, $label ) = @_;
1164 my $regex = $self->wit_list_separator;
1165 my @answer = split( /\Q$regex\E/, $label );
1166 return @answer;
b0b4421a 1167}
1168
d4b75f44 1169=head2 common_readings
1170
1171Returns the list of common readings in the graph (i.e. those readings that are
1172shared by all non-lacunose witnesses.)
1173
1174=cut
1175
1176sub common_readings {
1177 my $self = shift;
1178 my @common = grep { $_->is_common } $self->readings;
1179 return @common;
1180}
1181
b0b4421a 1182=head2 path_text( $sigil, $mainsigil [, $start, $end ] )
1183
1184Returns the text of a witness (plus its backup, if we are using a layer)
1185as stored in the collation. The text is returned as a string, where the
1186individual readings are joined with spaces and the meta-readings (e.g.
1187lacunae) are omitted. Optional specification of $start and $end allows
1188the generation of a subset of the witness text.
4e5a7b2c 1189
b0b4421a 1190=cut
1191
1192sub path_text {
861c3e27 1193 my( $self, $wit, $start, $end ) = @_;
b0b4421a 1194 $start = $self->start unless $start;
1195 $end = $self->end unless $end;
861c3e27 1196 my @path = grep { !$_->is_meta } $self->reading_sequence( $start, $end, $wit );
b0b4421a 1197 return join( ' ', map { $_->text } @path );
1198}
4e5a7b2c 1199
1200=head1 INITIALIZATION METHODS
1201
1202These are mostly for use by parsers.
1203
1204=head2 make_witness_path( $witness )
1205
1206Link the array of readings contained in $witness->path (and in
1207$witness->uncorrected_path if it exists) into collation paths.
1208Clear out the arrays when finished.
de51424a 1209
4e5a7b2c 1210=head2 make_witness_paths
1211
1212Call make_witness_path for all witnesses in the tradition.
1213
1214=cut
930ff666 1215
7e450e44 1216# For use when a collation is constructed from a base text and an apparatus.
1217# We have the sequences of readings and just need to add path edges.
1f7aa795 1218# When we are done, clear out the witness path attributes, as they are no
1219# longer needed.
1220# TODO Find a way to replace the witness path attributes with encapsulated functions?
e2902068 1221
6a222840 1222sub make_witness_paths {
1223 my( $self ) = @_;
910a0a6d 1224 foreach my $wit ( $self->tradition->witnesses ) {
0068967c 1225 # print STDERR "Making path for " . $wit->sigil . "\n";
910a0a6d 1226 $self->make_witness_path( $wit );
7854e12e 1227 }
7854e12e 1228}
1229
6a222840 1230sub make_witness_path {
7854e12e 1231 my( $self, $wit ) = @_;
1232 my @chain = @{$wit->path};
15d2d3df 1233 my $sig = $wit->sigil;
7854e12e 1234 foreach my $idx ( 0 .. $#chain-1 ) {
910a0a6d 1235 $self->add_path( $chain[$idx], $chain[$idx+1], $sig );
7854e12e 1236 }
1f7aa795 1237 if( $wit->is_layered ) {
d9e873d0 1238 @chain = @{$wit->uncorrected_path};
1239 foreach my $idx( 0 .. $#chain-1 ) {
1240 my $source = $chain[$idx];
1241 my $target = $chain[$idx+1];
1242 $self->add_path( $source, $target, $sig.$self->ac_label )
1243 unless $self->has_path( $source, $target, $sig );
1244 }
15d2d3df 1245 }
1f7aa795 1246 $wit->clear_path;
1247 $wit->clear_uncorrected_path;
e2902068 1248}
1249
4e5a7b2c 1250=head2 calculate_ranks
1251
1252Calculate the reading ranks (that is, their aligned positions relative
1253to each other) for the graph. This can only be called on linear collations.
1254
b365fbae 1255=begin testing
1256
1257use Text::Tradition;
1258
1259my $cxfile = 't/data/Collatex-16.xml';
1260my $t = Text::Tradition->new(
1261 'name' => 'inline',
1262 'input' => 'CollateX',
1263 'file' => $cxfile,
1264 );
1265my $c = $t->collation;
1266
1267# Make an svg
bfcbcecb 1268my $table = $c->alignment_table;
1269ok( $c->has_cached_table, "Alignment table was cached" );
1270is( $c->alignment_table, $table, "Cached table returned upon second call" );
b365fbae 1271$c->calculate_ranks;
bfcbcecb 1272is( $c->alignment_table, $table, "Cached table retained with no rank change" );
b365fbae 1273$c->add_relationship( 'n9', 'n23', { 'type' => 'spelling' } );
bfcbcecb 1274isnt( $c->alignment_table, $table, "Alignment table changed after relationship add" );
b365fbae 1275
1276=end testing
1277
4e5a7b2c 1278=cut
1279
910a0a6d 1280sub calculate_ranks {
1281 my $self = shift;
b365fbae 1282 # Save the existing ranks, in case we need to invalidate the cached SVG.
1283 my %existing_ranks;
910a0a6d 1284 # Walk a version of the graph where every node linked by a relationship
1285 # edge is fundamentally the same node, and do a topological ranking on
1286 # the nodes in this graph.
c9bf3dbf 1287 my $topo_graph = Graph->new();
910a0a6d 1288 my %rel_containers;
1289 my $rel_ctr = 0;
1290 # Add the nodes
1291 foreach my $r ( $self->readings ) {
3a2ebbf4 1292 next if exists $rel_containers{$r->id};
910a0a6d 1293 my @rels = $r->related_readings( 'colocated' );
1294 if( @rels ) {
1295 # Make a relationship container.
1296 push( @rels, $r );
c9bf3dbf 1297 my $rn = 'rel_container_' . $rel_ctr++;
1298 $topo_graph->add_vertex( $rn );
910a0a6d 1299 foreach( @rels ) {
3a2ebbf4 1300 $rel_containers{$_->id} = $rn;
910a0a6d 1301 }
1302 } else {
1303 # Add a new node to mirror the old node.
3a2ebbf4 1304 $rel_containers{$r->id} = $r->id;
1305 $topo_graph->add_vertex( $r->id );
910a0a6d 1306 }
4a8828f0 1307 }
3a1f2523 1308
3a2ebbf4 1309 # Add the edges.
910a0a6d 1310 foreach my $r ( $self->readings ) {
b365fbae 1311 $existing_ranks{$r} = $r->rank;
3a2ebbf4 1312 foreach my $n ( $self->sequence->successors( $r->id ) ) {
1313 my( $tfrom, $tto ) = ( $rel_containers{$r->id},
1314 $rel_containers{$n} );
4e5a7b2c 1315 # $DB::single = 1 unless $tfrom && $tto;
3a2ebbf4 1316 $topo_graph->add_edge( $tfrom, $tto );
910a0a6d 1317 }
1318 }
1319
1320 # Now do the rankings, starting with the start node.
3a2ebbf4 1321 my $topo_start = $rel_containers{$self->start->id};
c9bf3dbf 1322 my $node_ranks = { $topo_start => 0 };
910a0a6d 1323 my @curr_origin = ( $topo_start );
1324 # A little iterative function.
1325 while( @curr_origin ) {
c9bf3dbf 1326 @curr_origin = _assign_rank( $topo_graph, $node_ranks, @curr_origin );
910a0a6d 1327 }
1328 # Transfer our rankings from the topological graph to the real one.
1329 foreach my $r ( $self->readings ) {
3a2ebbf4 1330 if( defined $node_ranks->{$rel_containers{$r->id}} ) {
1331 $r->rank( $node_ranks->{$rel_containers{$r->id}} );
67da8d6c 1332 } else {
63778331 1333 # Die. Find the last rank we calculated.
1334 my @all_defined = sort { $node_ranks->{$rel_containers{$a->id}}
1335 <=> $node_ranks->{$rel_containers{$b->id}} }
1336 $self->readings;
1337 my $last = pop @all_defined;
1338 throw( "Ranks not calculated after $last - do you have a cycle in the graph?" );
67da8d6c 1339 }
de51424a 1340 }
bfcbcecb 1341 # Do we need to invalidate the cached data?
1342 if( $self->has_cached_svg || $self->has_cached_table ) {
b365fbae 1343 foreach my $r ( $self->readings ) {
1344 next if $existing_ranks{$r} == $r->rank;
bfcbcecb 1345 $self->_clear_cache;
b365fbae 1346 last;
1347 }
1348 }
8e1394aa 1349}
3a1f2523 1350
910a0a6d 1351sub _assign_rank {
c9bf3dbf 1352 my( $graph, $node_ranks, @current_nodes ) = @_;
910a0a6d 1353 # Look at each of the children of @current_nodes. If all the child's
1354 # parents have a rank, assign it the highest rank + 1 and add it to
c9bf3dbf 1355 # @next_nodes. Otherwise skip it; we will return when the highest-ranked
1356 # parent gets a rank.
910a0a6d 1357 my @next_nodes;
1358 foreach my $c ( @current_nodes ) {
c9bf3dbf 1359 warn "Current reading $c has no rank!"
1360 unless exists $node_ranks->{$c};
1361 # print STDERR "Looking at child of node $c, rank "
1362 # . $node_ranks->{$c} . "\n";
1363 foreach my $child ( $graph->successors( $c ) ) {
1364 next if exists $node_ranks->{$child};
910a0a6d 1365 my $highest_rank = -1;
1366 my $skip = 0;
c9bf3dbf 1367 foreach my $parent ( $graph->predecessors( $child ) ) {
1368 if( exists $node_ranks->{$parent} ) {
1369 $highest_rank = $node_ranks->{$parent}
1370 if $highest_rank <= $node_ranks->{$parent};
910a0a6d 1371 } else {
1372 $skip = 1;
1373 last;
1374 }
1375 }
1376 next if $skip;
c9bf3dbf 1377 my $c_rank = $highest_rank + 1;
1378 # print STDERR "Assigning rank $c_rank to node $child \n";
1379 $node_ranks->{$child} = $c_rank;
910a0a6d 1380 push( @next_nodes, $child );
1381 }
1382 }
1383 return @next_nodes;
4cdd82f1 1384}
910a0a6d 1385
4e5a7b2c 1386=head2 flatten_ranks
1387
1388A convenience method for parsing collation data. Searches the graph for readings
1389with the same text at the same rank, and merges any that are found.
1390
1391=cut
1392
0e476982 1393sub flatten_ranks {
1394 my $self = shift;
1395 my %unique_rank_rdg;
1396 foreach my $rdg ( $self->readings ) {
1397 next unless $rdg->has_rank;
1398 my $key = $rdg->rank . "||" . $rdg->text;
1399 if( exists $unique_rank_rdg{$key} ) {
1400 # Combine!
56eefa04 1401 # print STDERR "Combining readings at same rank: $key\n";
0e476982 1402 $self->merge_readings( $unique_rank_rdg{$key}, $rdg );
0ecb975c 1403 # TODO see if this now makes a common point.
0e476982 1404 } else {
1405 $unique_rank_rdg{$key} = $rdg;
1406 }
1407 }
1408}
1409
4633f9e4 1410=head2 remove_collations
1411
1412Another convenience method for parsing. Removes all 'collation' relationships
1413that were defined in order to get the reading ranks to be correct.
1414
1415=begin testing
1416
1417use Text::Tradition;
1418
1419my $cxfile = 't/data/Collatex-16.xml';
1420my $t = Text::Tradition->new(
1421 'name' => 'inline',
1422 'input' => 'CollateX',
1423 'file' => $cxfile,
1424 );
1425my $c = $t->collation;
1426
1427isnt( $c->reading('n23')->rank, $c->reading('n9')->rank, "Rank skew exists" );
1428$c->add_relationship( 'n23', 'n9', { 'type' => 'collated', 'scope' => 'local' } );
1429is( scalar $c->relationships, 4, "Found all expected relationships" );
1430$c->remove_collations;
1431is( scalar $c->relationships, 3, "Collated relationships now gone" );
1432is( $c->reading('n23')->rank, $c->reading('n9')->rank, "Aligned ranks were preserved" );
1433
1434=end testing
1435
1436=cut
1437
1438sub remove_collations {
1439 my $self = shift;
1440 foreach my $reledge ( $self->relationships ) {
1441 my $relobj = $self->relations->get_relationship( $reledge );
1442 if( $relobj && $relobj->type eq 'collated' ) {
1443 $self->relations->delete_relationship( $reledge );
1444 }
1445 }
1446}
1447
1448
d4b75f44 1449=head2 calculate_common_readings
1450
1451Goes through the graph identifying the readings that appear in every witness
1452(apart from those with lacunae at that spot.) Marks them as common and returns
1453the list.
1454
1455=begin testing
1456
1457use Text::Tradition;
1458
1459my $cxfile = 't/data/Collatex-16.xml';
1460my $t = Text::Tradition->new(
1461 'name' => 'inline',
1462 'input' => 'CollateX',
1463 'file' => $cxfile,
1464 );
1465my $c = $t->collation;
1466
1467my @common = $c->calculate_common_readings();
1468is( scalar @common, 8, "Found correct number of common readings" );
1469my @marked = sort $c->common_readings();
1470is( scalar @common, 8, "All common readings got marked as such" );
1471my @expected = qw/ n1 n12 n16 n19 n20 n5 n6 n7 /;
1472is_deeply( \@marked, \@expected, "Found correct list of common readings" );
1473
1474=end testing
1475
1476=cut
1477
1478sub calculate_common_readings {
1479 my $self = shift;
1480 my @common;
1dd07bda 1481 my $table = $self->alignment_table;
d4b75f44 1482 foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
7f52eac8 1483 my @row = map { $_->{'tokens'}->[$idx]
1484 ? $_->{'tokens'}->[$idx]->{'t'} : '' }
1485 @{$table->{'alignment'}};
d4b75f44 1486 my %hash;
1487 foreach my $r ( @row ) {
1488 if( $r ) {
1489 $hash{$r->id} = $r unless $r->is_meta;
1490 } else {
1491 $hash{'UNDEF'} = $r;
1492 }
1493 }
1494 if( keys %hash == 1 && !exists $hash{'UNDEF'} ) {
1495 my( $r ) = values %hash;
1496 $r->is_common( 1 );
1497 push( @common, $r );
1498 }
1499 }
1500 return @common;
1501}
1502
861c3e27 1503=head2 text_from_paths
1504
1505Calculate the text array for all witnesses from the path, for later consistency
1506checking. Only to be used if there is no non-graph-based way to know the
1507original texts.
1508
1509=cut
1510
1511sub text_from_paths {
1512 my $self = shift;
1513 foreach my $wit ( $self->tradition->witnesses ) {
1514 my @text = split( /\s+/,
1515 $self->reading_sequence( $self->start, $self->end, $wit->sigil ) );
1516 $wit->text( \@text );
1517 if( $wit->is_layered ) {
1518 my @uctext = split( /\s+/,
1519 $self->reading_sequence( $self->start, $self->end,
1520 $wit->sigil.$self->ac_label ) );
1521 $wit->text( \@uctext );
1522 }
1523 }
1524}
0e476982 1525
4e5a7b2c 1526=head1 UTILITY FUNCTIONS
1527
1528=head2 common_predecessor( $reading_a, $reading_b )
8e1394aa 1529
4e5a7b2c 1530Find the last reading that occurs in sequence before both the given readings.
1531
1532=head2 common_successor( $reading_a, $reading_b )
1533
1534Find the first reading that occurs in sequence after both the given readings.
1535
22222af9 1536=begin testing
1537
1538use Text::Tradition;
1539
1540my $cxfile = 't/data/Collatex-16.xml';
1541my $t = Text::Tradition->new(
1542 'name' => 'inline',
1543 'input' => 'CollateX',
1544 'file' => $cxfile,
1545 );
1546my $c = $t->collation;
1547
4e5a7b2c 1548is( $c->common_predecessor( 'n9', 'n23' )->id,
22222af9 1549 'n20', "Found correct common predecessor" );
4e5a7b2c 1550is( $c->common_successor( 'n9', 'n23' )->id,
22222af9 1551 '#END#', "Found correct common successor" );
1552
4e5a7b2c 1553is( $c->common_predecessor( 'n19', 'n17' )->id,
22222af9 1554 'n16', "Found correct common predecessor for readings on same path" );
4e5a7b2c 1555is( $c->common_successor( 'n21', 'n26' )->id,
22222af9 1556 '#END#', "Found correct common successor for readings on same path" );
1557
1558=end testing
1559
1560=cut
1561
1562## Return the closest reading that is a predecessor of both the given readings.
1563sub common_predecessor {
1564 my $self = shift;
4e5a7b2c 1565 my( $r1, $r2 ) = $self->_objectify_args( @_ );
027d819c 1566 return $self->_common_in_path( $r1, $r2, 'predecessors' );
22222af9 1567}
1568
1569sub common_successor {
1570 my $self = shift;
4e5a7b2c 1571 my( $r1, $r2 ) = $self->_objectify_args( @_ );
027d819c 1572 return $self->_common_in_path( $r1, $r2, 'successors' );
22222af9 1573}
1574
027d819c 1575sub _common_in_path {
22222af9 1576 my( $self, $r1, $r2, $dir ) = @_;
1577 my $iter = $r1->rank > $r2->rank ? $r1->rank : $r2->rank;
1578 $iter = $self->end->rank - $iter if $dir eq 'successors';
1579 my @candidates;
1580 my @last_checked = ( $r1, $r2 );
1581 my %all_seen;
1582 while( !@candidates ) {
1583 my @new_lc;
1584 foreach my $lc ( @last_checked ) {
1585 foreach my $p ( $lc->$dir ) {
1586 if( $all_seen{$p->id} ) {
1587 push( @candidates, $p );
1588 } else {
1589 $all_seen{$p->id} = 1;
1590 push( @new_lc, $p );
1591 }
1592 }
1593 }
1594 @last_checked = @new_lc;
1595 }
1596 my @answer = sort { $a->rank <=> $b->rank } @candidates;
1597 return $dir eq 'predecessors' ? pop( @answer ) : shift ( @answer );
1598}
1599
63778331 1600sub throw {
1601 Text::Tradition::Error->throw(
1602 'ident' => 'Collation error',
1603 'message' => $_[0],
1604 );
1605}
1606
dd3b58b0 1607no Moose;
1608__PACKAGE__->meta->make_immutable;
e867486f 1609
027d819c 1610=head1 LICENSE
e867486f 1611
027d819c 1612This package is free software and is provided "as is" without express
1613or implied warranty. You can redistribute it and/or modify it under
1614the same terms as Perl itself.
e867486f 1615
027d819c 1616=head1 AUTHOR
e867486f 1617
027d819c 1618Tara L Andrews E<lt>aurum@cpan.orgE<gt>