small conveniences
[scpubgit/stemmatology.git] / lib / Text / Tradition / Collation.pm
CommitLineData
dd3b58b0 1package Text::Tradition::Collation;
d047cd52 2
910a0a6d 3use Encode qw( decode_utf8 );
4use File::Temp;
bfcbcecb 5use File::Which;
c9bf3dbf 6use Graph;
8e1394aa 7use IPC::Run qw( run binary );
910a0a6d 8use Text::CSV_XS;
b15511bf 9use Text::Tradition::Collation::Reading;
22222af9 10use Text::Tradition::Collation::RelationshipStore;
63778331 11use Text::Tradition::Error;
df6d9812 12use XML::LibXML;
a344ffcf 13use XML::LibXML::XPathContext;
dd3b58b0 14use Moose;
15
3a2ebbf4 16has 'sequence' => (
d047cd52 17 is => 'ro',
3a2ebbf4 18 isa => 'Graph',
19 default => sub { Graph->new() },
d047cd52 20 handles => {
3a2ebbf4 21 paths => 'edges',
d047cd52 22 },
d047cd52 23 );
3a2ebbf4 24
25has 'relations' => (
26 is => 'ro',
22222af9 27 isa => 'Text::Tradition::Collation::RelationshipStore',
28 handles => {
29 relationships => 'relationships',
30 related_readings => 'related_readings',
202ccb18 31 get_relationship => 'get_relationship',
ee801e17 32 del_relationship => 'del_relationship',
22222af9 33 },
34 writer => '_set_relations',
3a2ebbf4 35 );
dd3b58b0 36
3a2ebbf4 37has 'tradition' => (
38 is => 'ro',
d047cd52 39 isa => 'Text::Tradition',
8d9a1cd8 40 weak_ref => 1,
d047cd52 41 );
dd3b58b0 42
3a2ebbf4 43has 'readings' => (
44 isa => 'HashRef[Text::Tradition::Collation::Reading]',
45 traits => ['Hash'],
46 handles => {
47 reading => 'get',
48 _add_reading => 'set',
49 del_reading => 'delete',
50 has_reading => 'exists',
51 readings => 'values',
52 },
53 default => sub { {} },
54 );
910a0a6d 55
4a8828f0 56has 'wit_list_separator' => (
7854e12e 57 is => 'rw',
58 isa => 'Str',
59 default => ', ',
60 );
61
62has 'baselabel' => (
63 is => 'rw',
64 isa => 'Str',
65 default => 'base text',
66 );
4a8828f0 67
15d2d3df 68has 'linear' => (
69 is => 'rw',
70 isa => 'Bool',
71 default => 1,
72 );
c84275ff 73
ef9d481f 74has 'ac_label' => (
75 is => 'rw',
76 isa => 'Str',
77 default => ' (a.c.)',
78 );
3a2ebbf4 79
80has 'start' => (
81 is => 'ro',
82 isa => 'Text::Tradition::Collation::Reading',
83 writer => '_set_start',
84 weak_ref => 1,
85 );
86
87has 'end' => (
88 is => 'ro',
89 isa => 'Text::Tradition::Collation::Reading',
90 writer => '_set_end',
91 weak_ref => 1,
92 );
b365fbae 93
94has 'cached_svg' => (
95 is => 'rw',
96 isa => 'Str',
97 predicate => 'has_cached_svg',
98 clearer => 'wipe_svg',
99 );
1dd07bda 100
101has 'cached_table' => (
102 is => 'rw',
103 isa => 'HashRef',
104 predicate => 'has_cached_table',
105 clearer => 'wipe_table',
106 );
c1915ab9 107
108has '_graphcalc_done' => (
109 is => 'rw',
110 isa => 'Bool',
111 default => undef,
112 );
1f563ac3 113
4e5a7b2c 114=head1 NAME
115
116Text::Tradition::Collation - a software model for a text collation
117
118=head1 SYNOPSIS
119
120 use Text::Tradition;
121 my $t = Text::Tradition->new(
122 'name' => 'this is a text',
123 'input' => 'TEI',
124 'file' => '/path/to/tei_parallel_seg_file.xml' );
125
126 my $c = $t->collation;
127 my @readings = $c->readings;
128 my @paths = $c->paths;
129 my @relationships = $c->relationships;
130
131 my $svg_variant_graph = $t->collation->as_svg();
132
133=head1 DESCRIPTION
134
135Text::Tradition is a library for representation and analysis of collated
136texts, particularly medieval ones. The Collation is the central feature of
137a Tradition, where the text, its sequence of readings, and its relationships
138between readings are actually kept.
139
140=head1 CONSTRUCTOR
141
142=head2 new
143
144The constructor. Takes a hash or hashref of the following arguments:
145
146=over
147
148=item * tradition - The Text::Tradition object to which the collation
149belongs. Required.
150
151=item * linear - Whether the collation should be linear; that is, whether
152transposed readings should be treated as two linked readings rather than one,
153and therefore whether the collation graph is acyclic. Defaults to true.
154
4e5a7b2c 155=item * baselabel - The default label for the path taken by a base text
156(if any). Defaults to 'base text'.
157
158=item * wit_list_separator - The string to join a list of witnesses for
159purposes of making labels in display graphs. Defaults to ', '.
160
161=item * ac_label - The extra label to tack onto a witness sigil when
162representing another layer of path for the given witness - that is, when
163a text has more than one possible reading due to scribal corrections or
164the like. Defaults to ' (a.c.)'.
165
166=back
167
168=head1 ACCESSORS
169
170=head2 tradition
171
172=head2 linear
173
4e5a7b2c 174=head2 wit_list_separator
175
176=head2 baselabel
177
178=head2 ac_label
179
180Simple accessors for collation attributes.
181
182=head2 start
183
184The meta-reading at the start of every witness path.
185
186=head2 end
187
188The meta-reading at the end of every witness path.
189
190=head2 readings
191
192Returns all Reading objects in the graph.
193
194=head2 reading( $id )
195
196Returns the Reading object corresponding to the given ID.
197
198=head2 add_reading( $reading_args )
199
200Adds a new reading object to the collation.
201See L<Text::Tradition::Collation::Reading> for the available arguments.
202
203=head2 del_reading( $object_or_id )
204
205Removes the given reading from the collation, implicitly removing its
206paths and relationships.
207
208=head2 merge_readings( $main, $second )
209
210Merges the $second reading into the $main one.
211The arguments may be either readings or reading IDs.
212
213=head2 has_reading( $id )
214
215Predicate to see whether a given reading ID is in the graph.
216
217=head2 reading_witnesses( $object_or_id )
218
219Returns a list of sigils whose witnesses contain the reading.
220
221=head2 paths
222
223Returns all reading paths within the document - that is, all edges in the
224collation graph. Each path is an arrayref of [ $source, $target ] reading IDs.
225
226=head2 add_path( $source, $target, $sigil )
227
228Links the given readings in the collation in sequence, under the given witness
229sigil. The readings may be specified by object or ID.
230
231=head2 del_path( $source, $target, $sigil )
232
233Links the given readings in the collation in sequence, under the given witness
234sigil. The readings may be specified by object or ID.
235
236=head2 has_path( $source, $target );
237
238Returns true if the two readings are linked in sequence in any witness.
239The readings may be specified by object or ID.
240
241=head2 relationships
242
243Returns all Relationship objects in the collation.
244
245=head2 add_relationship( $reading, $other_reading, $options )
246
247Adds a new relationship of the type given in $options between the two readings,
248which may be specified by object or ID. Returns a value of ( $status, @vectors)
249where $status is true on success, and @vectors is a list of relationship edges
250that were ultimately added.
251See L<Text::Tradition::Collation::Relationship> for the available options.
252
253=cut
dd3b58b0 254
d047cd52 255sub BUILD {
3a2ebbf4 256 my $self = shift;
22222af9 257 $self->_set_relations( Text::Tradition::Collation::RelationshipStore->new( 'collation' => $self ) );
3a2ebbf4 258 $self->_set_start( $self->add_reading( { 'collation' => $self, 'is_start' => 1 } ) );
259 $self->_set_end( $self->add_reading( { 'collation' => $self, 'is_end' => 1 } ) );
d047cd52 260}
784877d9 261
3a2ebbf4 262### Reading construct/destruct functions
263
264sub add_reading {
265 my( $self, $reading ) = @_;
266 unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
267 my %args = %$reading;
268 $reading = Text::Tradition::Collation::Reading->new(
269 'collation' => $self,
270 %args );
271 }
272 # First check to see if a reading with this ID exists.
273 if( $self->reading( $reading->id ) ) {
63778331 274 throw( "Collation already has a reading with id " . $reading->id );
3a2ebbf4 275 }
c1915ab9 276 $self->_graphcalc_done(0);
3a2ebbf4 277 $self->_add_reading( $reading->id => $reading );
278 # Once the reading has been added, put it in both graphs.
279 $self->sequence->add_vertex( $reading->id );
22222af9 280 $self->relations->add_reading( $reading->id );
3a2ebbf4 281 return $reading;
eca16057 282};
283
3a2ebbf4 284around del_reading => sub {
285 my $orig = shift;
286 my $self = shift;
287 my $arg = shift;
288
289 if( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
290 $arg = $arg->id;
291 }
3a2ebbf4 292 # Remove the reading from the graphs.
c1915ab9 293 $self->_graphcalc_done(0);
3a2ebbf4 294 $self->sequence->delete_vertex( $arg );
22222af9 295 $self->relations->delete_reading( $arg );
3a2ebbf4 296
297 # Carry on.
298 $self->$orig( $arg );
299};
7854e12e 300
3a2ebbf4 301# merge_readings( $main, $to_be_deleted );
7854e12e 302
3a2ebbf4 303sub merge_readings {
304 my $self = shift;
305
306 # We only need the IDs for adding paths to the graph, not the reading
307 # objects themselves.
49d4f2ac 308 my( $kept, $deleted, $combine_char ) = $self->_stringify_args( @_ );
c1915ab9 309 $self->_graphcalc_done(0);
3a2ebbf4 310
311 # The kept reading should inherit the paths and the relationships
312 # of the deleted reading.
313 foreach my $path ( $self->sequence->edges_at( $deleted ) ) {
314 my @vector = ( $kept );
315 push( @vector, $path->[1] ) if $path->[0] eq $deleted;
316 unshift( @vector, $path->[0] ) if $path->[1] eq $deleted;
49d4f2ac 317 next if $vector[0] eq $vector[1]; # Don't add a self loop
3a2ebbf4 318 my %wits = %{$self->sequence->get_edge_attributes( @$path )};
319 $self->sequence->add_edge( @vector );
320 my $fwits = $self->sequence->get_edge_attributes( @vector );
321 @wits{keys %$fwits} = values %$fwits;
322 $self->sequence->set_edge_attributes( @vector, \%wits );
323 }
22222af9 324 $self->relations->merge_readings( $kept, $deleted, $combine_char );
3a2ebbf4 325
326 # Do the deletion deed.
49d4f2ac 327 if( $combine_char ) {
328 my $kept_obj = $self->reading( $kept );
329 my $new_text = join( $combine_char, $kept_obj->text,
330 $self->reading( $deleted )->text );
331 $kept_obj->alter_text( $new_text );
332 }
3a2ebbf4 333 $self->del_reading( $deleted );
334}
7854e12e 335
3265b0ce 336
3a2ebbf4 337# Helper function for manipulating the graph.
338sub _stringify_args {
339 my( $self, $first, $second, $arg ) = @_;
340 $first = $first->id
341 if ref( $first ) eq 'Text::Tradition::Collation::Reading';
342 $second = $second->id
343 if ref( $second ) eq 'Text::Tradition::Collation::Reading';
344 return( $first, $second, $arg );
345}
df6d9812 346
4e5a7b2c 347# Helper function for manipulating the graph.
348sub _objectify_args {
349 my( $self, $first, $second, $arg ) = @_;
350 $first = $self->reading( $first )
351 unless ref( $first ) eq 'Text::Tradition::Collation::Reading';
352 $second = $self->reading( $second )
353 unless ref( $second ) eq 'Text::Tradition::Collation::Reading';
354 return( $first, $second, $arg );
355}
3a2ebbf4 356### Path logic
357
358sub add_path {
359 my $self = shift;
360
361 # We only need the IDs for adding paths to the graph, not the reading
362 # objects themselves.
363 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
364
c1915ab9 365 $self->_graphcalc_done(0);
3a2ebbf4 366 # Connect the readings
367 $self->sequence->add_edge( $source, $target );
368 # Note the witness in question
369 $self->sequence->set_edge_attribute( $source, $target, $wit, 1 );
b15511bf 370};
371
3a2ebbf4 372sub del_path {
373 my $self = shift;
49d4f2ac 374 my @args;
375 if( ref( $_[0] ) eq 'ARRAY' ) {
376 my $e = shift @_;
377 @args = ( @$e, @_ );
378 } else {
379 @args = @_;
380 }
3a2ebbf4 381
382 # We only need the IDs for adding paths to the graph, not the reading
383 # objects themselves.
49d4f2ac 384 my( $source, $target, $wit ) = $self->_stringify_args( @args );
3a2ebbf4 385
c1915ab9 386 $self->_graphcalc_done(0);
3a2ebbf4 387 if( $self->sequence->has_edge_attribute( $source, $target, $wit ) ) {
49d4f2ac 388 $self->sequence->delete_edge_attribute( $source, $target, $wit );
3a2ebbf4 389 }
390 unless( keys %{$self->sequence->get_edge_attributes( $source, $target )} ) {
391 $self->sequence->delete_edge( $source, $target );
392 }
784877d9 393}
394
3a2ebbf4 395
15d2d3df 396# Extra graph-alike utility
397sub has_path {
3a2ebbf4 398 my $self = shift;
399 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
400 return undef unless $self->sequence->has_edge( $source, $target );
401 return $self->sequence->has_edge_attribute( $source, $target, $wit );
b15511bf 402}
403
4e5a7b2c 404=head2 clear_witness( @sigil_list )
3a2ebbf4 405
4e5a7b2c 406Clear the given witnesses out of the collation entirely, removing references
407to them in paths, and removing readings that belong only to them. Should only
408be called via $tradition->del_witness.
3a2ebbf4 409
410=cut
411
4e5a7b2c 412sub clear_witness {
413 my( $self, @sigils ) = @_;
414
c1915ab9 415 $self->_graphcalc_done(0);
4e5a7b2c 416 # Clear the witness(es) out of the paths
417 foreach my $e ( $self->paths ) {
418 foreach my $sig ( @sigils ) {
419 $self->del_path( $e, $sig );
420 }
421 }
422
423 # Clear out the newly unused readings
424 foreach my $r ( $self->readings ) {
425 unless( $self->reading_witnesses( $r ) ) {
426 $self->del_reading( $r );
427 }
428 }
429}
3a2ebbf4 430
431sub add_relationship {
432 my $self = shift;
22222af9 433 my( $source, $target, $opts ) = $self->_stringify_args( @_ );
63778331 434 my( @vectors ) = $self->relations->add_relationship( $source,
64ae6270 435 $self->reading( $source ), $target, $self->reading( $target ), $opts );
c1915ab9 436 $self->_graphcalc_done(0);
63778331 437 return @vectors;
22222af9 438}
ef9d481f 439
22222af9 440=head2 reading_witnesses( $reading )
910a0a6d 441
22222af9 442Return a list of sigils corresponding to the witnesses in which the reading appears.
3265b0ce 443
22222af9 444=cut
1d310495 445
1d310495 446sub reading_witnesses {
447 my( $self, $reading ) = @_;
448 # We need only check either the incoming or the outgoing edges; I have
96dc90ec 449 # arbitrarily chosen "incoming". Thus, special-case the start node.
450 if( $reading eq $self->start ) {
451 return map { $_->sigil } $self->tradition->witnesses;
452 }
1d310495 453 my %all_witnesses;
454 foreach my $e ( $self->sequence->edges_to( $reading ) ) {
455 my $wits = $self->sequence->get_edge_attributes( @$e );
456 @all_witnesses{ keys %$wits } = 1;
457 }
c12bb878 458 my $acstr = $self->ac_label;
459 foreach my $acwit ( grep { $_ =~ s/^(.*)\Q$acstr\E$/$1/ } keys %all_witnesses ) {
460 delete $all_witnesses{$acwit.$acstr} if exists $all_witnesses{$acwit};
461 }
1d310495 462 return keys %all_witnesses;
910a0a6d 463}
464
4e5a7b2c 465=head1 OUTPUT METHODS
8e1394aa 466
0ecb975c 467=head2 as_svg( \%options )
8e1394aa 468
0068967c 469Returns an SVG string that represents the graph, via as_dot and graphviz.
bfcbcecb 470See as_dot for a list of options. Must have GraphViz (dot) installed to run.
8e1394aa 471
472=cut
473
474sub as_svg {
0ecb975c 475 my( $self, $opts ) = @_;
bfcbcecb 476 throw( "Need GraphViz installed to output SVG" )
477 unless File::Which::which( 'dot' );
e247aad1 478 my $want_subgraph = exists $opts->{'from'} || exists $opts->{'to'};
c1915ab9 479 $self->calculate_ranks() unless $self->_graphcalc_done;
e247aad1 480 if( !$self->has_cached_svg || $opts->{'recalc'} || $want_subgraph ) {
b365fbae 481 my @cmd = qw/dot -Tsvg/;
482 my( $svg, $err );
483 my $dotfile = File::Temp->new();
484 ## USE FOR DEBUGGING
485 # $dotfile->unlink_on_destroy(0);
486 binmode $dotfile, ':utf8';
487 print $dotfile $self->as_dot( $opts );
488 push( @cmd, $dotfile->filename );
489 run( \@cmd, ">", binary(), \$svg );
e247aad1 490 $svg = decode_utf8( $svg );
491 $self->cached_svg( $svg ) unless $want_subgraph;
492 return $svg;
493 } else {
494 return $self->cached_svg;
b365fbae 495 }
8e1394aa 496}
497
b22576c6 498
0ecb975c 499=head2 as_dot( \%options )
b22576c6 500
0ecb975c 501Returns a string that is the collation graph expressed in dot
502(i.e. GraphViz) format. Options include:
b22576c6 503
0ecb975c 504=over 4
b22576c6 505
0ecb975c 506=item * from
b22576c6 507
0ecb975c 508=item * to
df6d9812 509
0ecb975c 510=item * color_common
511
512=back
df6d9812 513
514=cut
515
516sub as_dot {
0ecb975c 517 my( $self, $opts ) = @_;
518 my $startrank = $opts->{'from'} if $opts;
519 my $endrank = $opts->{'to'} if $opts;
520 my $color_common = $opts->{'color_common'} if $opts;
b365fbae 521 my $STRAIGHTENHACK = !$startrank && !$endrank && $self->end->rank
522 && $self->end->rank > 100;
523
b22576c6 524 # Check the arguments
525 if( $startrank ) {
526 return if $endrank && $startrank > $endrank;
527 return if $startrank > $self->end->rank;
528 }
529 if( defined $endrank ) {
530 return if $endrank < 0;
f1b3b33a 531 $endrank = undef if $endrank == $self->end->rank;
b22576c6 532 }
533
67da8d6c 534 my $graph_name = $self->tradition->name;
535 $graph_name =~ s/[^\w\s]//g;
536 $graph_name = join( '_', split( /\s+/, $graph_name ) );
f13b5582 537
538 my %graph_attrs = (
539 'rankdir' => 'LR',
540 'bgcolor' => 'none',
541 );
542 my %node_attrs = (
b8990398 543 'fontsize' => 14,
f13b5582 544 'fillcolor' => 'white',
545 'style' => 'filled',
546 'shape' => 'ellipse'
547 );
548 my %edge_attrs = (
549 'arrowhead' => 'open',
550 'color' => '#000000',
551 'fontcolor' => '#000000',
552 );
553
67da8d6c 554 my $dot = sprintf( "digraph %s {\n", $graph_name );
f13b5582 555 $dot .= "\tgraph " . _dot_attr_string( \%graph_attrs ) . ";\n";
556 $dot .= "\tnode " . _dot_attr_string( \%node_attrs ) . ";\n";
df6d9812 557
b22576c6 558 # Output substitute start/end readings if necessary
559 if( $startrank ) {
560 $dot .= "\t\"#SUBSTART#\" [ label=\"...\" ];\n";
561 }
562 if( $endrank ) {
563 $dot .= "\t\"#SUBEND#\" [ label=\"...\" ];\n";
564 }
b365fbae 565 if( $STRAIGHTENHACK ) {
566 ## HACK part 1
567 $dot .= "\tsubgraph { rank=same \"#START#\" \"#SILENT#\" }\n";
568 $dot .= "\t\"#SILENT#\" [ shape=diamond,color=white,penwidth=0,label=\"\" ];"
569 }
b22576c6 570 my %used; # Keep track of the readings that actually appear in the graph
30ddc24c 571 # Sort the readings by rank if we have ranks; this speeds layout.
572 my @all_readings = $self->end->has_rank
573 ? sort { $a->rank <=> $b->rank } $self->readings
574 : $self->readings;
4633f9e4 575 # TODO Refrain from outputting lacuna nodes - just grey out the edges.
30ddc24c 576 foreach my $reading ( @all_readings ) {
b22576c6 577 # Only output readings within our rank range.
578 next if $startrank && $reading->rank < $startrank;
579 next if $endrank && $reading->rank > $endrank;
580 $used{$reading->id} = 1;
910a0a6d 581 # Need not output nodes without separate labels
3a2ebbf4 582 next if $reading->id eq $reading->text;
d4b75f44 583 my $rattrs;
30f0df34 584 my $label = $reading->text;
8f9cab7b 585 $label =~ s/\"/\\\"/g;
d4b75f44 586 $rattrs->{'label'} = $label;
0ecb975c 587 $rattrs->{'fillcolor'} = '#b3f36d' if $reading->is_common && $color_common;
d4b75f44 588 $dot .= sprintf( "\t\"%s\" %s;\n", $reading->id, _dot_attr_string( $rattrs ) );
df6d9812 589 }
3a2ebbf4 590
30ddc24c 591 # Add the real edges. Need to weight one edge per rank jump, in a
592 # continuous line.
b365fbae 593 # my $weighted = $self->_add_edge_weights;
b22576c6 594 my @edges = $self->paths;
3bdec618 595 my( %substart, %subend );
b22576c6 596 foreach my $edge ( @edges ) {
597 # Do we need to output this edge?
508fd430 598 if( $used{$edge->[0]} && $used{$edge->[1]} ) {
027d819c 599 my $label = $self->_path_display_label( $self->path_witnesses( $edge ) );
f13b5582 600 my $variables = { %edge_attrs, 'label' => $label };
30ddc24c 601
b22576c6 602 # Account for the rank gap if necessary
30ddc24c 603 my $rank0 = $self->reading( $edge->[0] )->rank
604 if $self->reading( $edge->[0] )->has_rank;
605 my $rank1 = $self->reading( $edge->[1] )->rank
606 if $self->reading( $edge->[1] )->has_rank;
607 if( defined $rank0 && defined $rank1 && $rank1 - $rank0 > 1 ) {
608 $variables->{'minlen'} = $rank1 - $rank0;
609 }
610
611 # Add the calculated edge weights
b365fbae 612 # if( exists $weighted->{$edge->[0]}
e247aad1 613 # && $weighted->{$edge->[0]} eq $edge->[1] ) {
614 # # $variables->{'color'} = 'red';
615 # $variables->{'weight'} = 3.0;
616 # }
30ddc24c 617
508fd430 618 # EXPERIMENTAL: make edge width reflect no. of witnesses
619 my $extrawidth = scalar( $self->path_witnesses( $edge ) ) * 0.2;
620 $variables->{'penwidth'} = $extrawidth + 0.8; # gives 1 for a single wit
621
f13b5582 622 my $varopts = _dot_attr_string( $variables );
623 $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
624 $edge->[0], $edge->[1], $varopts );
3bdec618 625 } elsif( $used{$edge->[0]} ) {
626 $subend{$edge->[0]} = 1;
627 } elsif( $used{$edge->[1]} ) {
628 $substart{$edge->[1]} = 1;
b22576c6 629 }
df6d9812 630 }
3bdec618 631 # Add substitute start and end edges if necessary
632 foreach my $node ( keys %substart ) {
027d819c 633 my $witstr = $self->_path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) );
f13b5582 634 my $variables = { %edge_attrs, 'label' => $witstr };
635 my $varopts = _dot_attr_string( $variables );
636 $dot .= "\t\"#SUBSTART#\" -> \"$node\" $varopts;";
3bdec618 637 }
638 foreach my $node ( keys %subend ) {
027d819c 639 my $witstr = $self->_path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) );
f13b5582 640 my $variables = { %edge_attrs, 'label' => $witstr };
641 my $varopts = _dot_attr_string( $variables );
642 $dot .= "\t\"$node\" -> \"#SUBEND#\" $varopts;";
3bdec618 643 }
b365fbae 644 # HACK part 2
645 if( $STRAIGHTENHACK ) {
646 $dot .= "\t\"#END#\" -> \"#SILENT#\" [ color=white,penwidth=0 ];\n";
647 }
30ddc24c 648
df6d9812 649 $dot .= "}\n";
650 return $dot;
651}
652
f13b5582 653sub _dot_attr_string {
654 my( $hash ) = @_;
655 my @attrs;
656 foreach my $k ( sort keys %$hash ) {
657 my $v = $hash->{$k};
658 push( @attrs, $k.'="'.$v.'"' );
659 }
660 return( '[ ' . join( ', ', @attrs ) . ' ]' );
661}
662
30ddc24c 663sub _add_edge_weights {
664 my $self = shift;
665 # Walk the graph from START to END, choosing the successor node with
666 # the largest number of witness paths each time.
667 my $weighted = {};
668 my $curr = $self->start->id;
008fc8a6 669 my $ranked = $self->end->has_rank;
30ddc24c 670 while( $curr ne $self->end->id ) {
008fc8a6 671 my $rank = $ranked ? $self->reading( $curr )->rank : 0;
30ddc24c 672 my @succ = sort { $self->path_witnesses( $curr, $a )
673 <=> $self->path_witnesses( $curr, $b ) }
674 $self->sequence->successors( $curr );
675 my $next = pop @succ;
008fc8a6 676 my $nextrank = $ranked ? $self->reading( $next )->rank : 0;
30ddc24c 677 # Try to avoid lacunae in the weighted path.
008fc8a6 678 while( @succ &&
679 ( $self->reading( $next )->is_lacuna ||
680 $nextrank - $rank > 1 ) ){
30ddc24c 681 $next = pop @succ;
682 }
683 $weighted->{$curr} = $next;
684 $curr = $next;
685 }
686 return $weighted;
687}
688
027d819c 689=head2 path_witnesses( $edge )
690
691Returns the list of sigils whose witnesses are associated with the given edge.
692The edge can be passed as either an array or an arrayref of ( $source, $target ).
693
694=cut
695
3a2ebbf4 696sub path_witnesses {
697 my( $self, @edge ) = @_;
698 # If edge is an arrayref, cope.
699 if( @edge == 1 && ref( $edge[0] ) eq 'ARRAY' ) {
700 my $e = shift @edge;
701 @edge = @$e;
702 }
703 my @wits = keys %{$self->sequence->get_edge_attributes( @edge )};
508fd430 704 return @wits;
3a2ebbf4 705}
706
027d819c 707sub _path_display_label {
508fd430 708 my $self = shift;
709 my @wits = sort @_;
8f9cab7b 710 my $maj = scalar( $self->tradition->witnesses ) * 0.6;
711 if( scalar @wits > $maj ) {
f13b5582 712 # TODO break out a.c. wits
8f9cab7b 713 return 'majority';
714 } else {
715 return join( ', ', @wits );
716 }
717}
1dd07bda 718
719=head2 witnesses_at_rank
720
721Returns a list of witnesses that are not lacunose, for a given rank.
722
723=cut
724
725sub witnesses_at_rank {
726 my( $self, $rank ) = @_;
727}
8f9cab7b 728
4e5a7b2c 729=head2 as_graphml
8e1394aa 730
4e5a7b2c 731Returns a GraphML representation of the collation. The GraphML will contain
732two graphs. The first expresses the attributes of the readings and the witness
733paths that link them; the second expresses the relationships that link the
734readings. This is the native transfer format for a tradition.
8e1394aa 735
56eefa04 736=begin testing
737
738use Text::Tradition;
739
740my $READINGS = 311;
741my $PATHS = 361;
742
743my $datafile = 't/data/florilegium_tei_ps.xml';
744my $tradition = Text::Tradition->new( 'input' => 'TEI',
745 'name' => 'test0',
746 'file' => $datafile,
747 'linear' => 1 );
748
749ok( $tradition, "Got a tradition object" );
750is( scalar $tradition->witnesses, 13, "Found all witnesses" );
751ok( $tradition->collation, "Tradition has a collation" );
752
753my $c = $tradition->collation;
754is( scalar $c->readings, $READINGS, "Collation has all readings" );
755is( scalar $c->paths, $PATHS, "Collation has all paths" );
756is( scalar $c->relationships, 0, "Collation has all relationships" );
757
758# Add a few relationships
759$c->add_relationship( 'w123', 'w125', { 'type' => 'collated' } );
760$c->add_relationship( 'w193', 'w196', { 'type' => 'collated' } );
761$c->add_relationship( 'w257', 'w262', { 'type' => 'transposition' } );
762
763# Now write it to GraphML and parse it again.
764
765my $graphml = $c->as_graphml;
766my $st = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml );
767is( scalar $st->collation->readings, $READINGS, "Reparsed collation has all readings" );
768is( scalar $st->collation->paths, $PATHS, "Reparsed collation has all paths" );
769is( scalar $st->collation->relationships, 3, "Reparsed collation has new relationships" );
770
771=end testing
772
8e1394aa 773=cut
774
775sub as_graphml {
3a2ebbf4 776 my( $self ) = @_;
3d14b48e 777 $self->calculate_ranks unless $self->_graphcalc_done;
778
8e1394aa 779 # Some namespaces
780 my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns';
781 my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance';
782 my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' .
910a0a6d 783 'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
8e1394aa 784
785 # Create the document and root node
786 my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
787 my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
788 $graphml->setDocumentElement( $root );
789 $root->setNamespace( $xsi_ns, 'xsi', 0 );
790 $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
791
e309421a 792 # Add the data keys for the graph
793 my %graph_data_keys;
794 my $gdi = 0;
1d310495 795 my @graph_attributes = qw/ version wit_list_separator baselabel linear ac_label /;
e309421a 796 foreach my $datum ( @graph_attributes ) {
797 $graph_data_keys{$datum} = 'dg'.$gdi++;
798 my $key = $root->addNewChild( $graphml_ns, 'key' );
799 $key->setAttribute( 'attr.name', $datum );
800 $key->setAttribute( 'attr.type', $key eq 'linear' ? 'boolean' : 'string' );
801 $key->setAttribute( 'for', 'graph' );
802 $key->setAttribute( 'id', $graph_data_keys{$datum} );
803 }
f6066bac 804
8e1394aa 805 # Add the data keys for nodes
ef9d481f 806 my %node_data_keys;
807 my $ndi = 0;
3a2ebbf4 808 my %node_data = (
809 id => 'string',
255875b8 810 text => 'string',
3a2ebbf4 811 rank => 'string',
812 is_start => 'boolean',
813 is_end => 'boolean',
814 is_lacuna => 'boolean',
815 );
816 foreach my $datum ( keys %node_data ) {
910a0a6d 817 $node_data_keys{$datum} = 'dn'.$ndi++;
818 my $key = $root->addNewChild( $graphml_ns, 'key' );
819 $key->setAttribute( 'attr.name', $datum );
3a2ebbf4 820 $key->setAttribute( 'attr.type', $node_data{$datum} );
910a0a6d 821 $key->setAttribute( 'for', 'node' );
822 $key->setAttribute( 'id', $node_data_keys{$datum} );
8e1394aa 823 }
824
df6d9812 825 # Add the data keys for edges, i.e. witnesses
ef9d481f 826 my $edi = 0;
827 my %edge_data_keys;
3a2ebbf4 828 my %edge_data = (
f523c7a8 829 class => 'string', # Class, deprecated soon
3a2ebbf4 830 witness => 'string', # ID/label for a path
831 relationship => 'string', # ID/label for a relationship
832 extra => 'boolean', # Path key
c84275ff 833 scope => 'string', # Relationship key
3d14b48e 834 annotation => 'string', # Relationship key
3a2ebbf4 835 non_correctable => 'boolean', # Relationship key
836 non_independent => 'boolean', # Relationship key
837 );
838 foreach my $datum ( keys %edge_data ) {
839 $edge_data_keys{$datum} = 'de'.$edi++;
910a0a6d 840 my $key = $root->addNewChild( $graphml_ns, 'key' );
3a2ebbf4 841 $key->setAttribute( 'attr.name', $datum );
842 $key->setAttribute( 'attr.type', $edge_data{$datum} );
910a0a6d 843 $key->setAttribute( 'for', 'edge' );
3a2ebbf4 844 $key->setAttribute( 'id', $edge_data_keys{$datum} );
8e1394aa 845 }
3a2ebbf4 846
22222af9 847 # Add the collation graph itself
2c669bca 848 my $sgraph = $root->addNewChild( $graphml_ns, 'graph' );
849 $sgraph->setAttribute( 'edgedefault', 'directed' );
850 $sgraph->setAttribute( 'id', $self->tradition->name );
851 $sgraph->setAttribute( 'parse.edgeids', 'canonical' );
852 $sgraph->setAttribute( 'parse.edges', scalar($self->paths) );
853 $sgraph->setAttribute( 'parse.nodeids', 'canonical' );
854 $sgraph->setAttribute( 'parse.nodes', scalar($self->readings) );
855 $sgraph->setAttribute( 'parse.order', 'nodesfirst' );
22222af9 856
e309421a 857 # Collation attribute data
858 foreach my $datum ( @graph_attributes ) {
2c669bca 859 my $value = $datum eq 'version' ? '3.0' : $self->$datum;
860 _add_graphml_data( $sgraph, $graph_data_keys{$datum}, $value );
e309421a 861 }
8e1394aa 862
863 my $node_ctr = 0;
864 my %node_hash;
22222af9 865 # Add our readings to the graph
3a2ebbf4 866 foreach my $n ( sort { $a->id cmp $b->id } $self->readings ) {
2c669bca 867 # Add to the main graph
868 my $node_el = $sgraph->addNewChild( $graphml_ns, 'node' );
910a0a6d 869 my $node_xmlid = 'n' . $node_ctr++;
3a2ebbf4 870 $node_hash{ $n->id } = $node_xmlid;
910a0a6d 871 $node_el->setAttribute( 'id', $node_xmlid );
255875b8 872 foreach my $d ( keys %node_data ) {
873 my $nval = $n->$d;
874 _add_graphml_data( $node_el, $node_data_keys{$d}, $nval )
875 if defined $nval;
876 }
b15511bf 877 }
878
2c669bca 879 # Add the path edges to the sequence graph
df6d9812 880 my $edge_ctr = 0;
3a2ebbf4 881 foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->sequence->edges() ) {
882 # We add an edge in the graphml for every witness in $e.
508fd430 883 foreach my $wit ( sort $self->path_witnesses( $e ) ) {
3a2ebbf4 884 my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
885 $node_hash{ $e->[0] },
886 $node_hash{ $e->[1] } );
2c669bca 887 my $edge_el = $sgraph->addNewChild( $graphml_ns, 'edge' );
3a2ebbf4 888 $edge_el->setAttribute( 'source', $from );
889 $edge_el->setAttribute( 'target', $to );
890 $edge_el->setAttribute( 'id', $id );
3a2ebbf4 891
892 # It's a witness path, so add the witness
893 my $base = $wit;
894 my $key = $edge_data_keys{'witness'};
895 # Is this an ante-corr witness?
896 my $aclabel = $self->ac_label;
897 if( $wit =~ /^(.*)\Q$aclabel\E$/ ) {
898 # Keep the base witness
899 $base = $1;
900 # ...and record that this is an 'extra' reading path
901 _add_graphml_data( $edge_el, $edge_data_keys{'extra'}, $aclabel );
902 }
903 _add_graphml_data( $edge_el, $edge_data_keys{'witness'}, $base );
f523c7a8 904 _add_graphml_data( $edge_el, $edge_data_keys{'class'}, 'path' );
3a2ebbf4 905 }
906 }
907
22222af9 908 # Add the relationship graph to the XML
027d819c 909 $self->relations->_as_graphml( $graphml_ns, $root, \%node_hash,
2626f709 910 $node_data_keys{'id'}, \%edge_data_keys );
8e1394aa 911
94c00c71 912 # Save and return the thing
913 my $result = decode_utf8( $graphml->toString(1) );
94c00c71 914 return $result;
df6d9812 915}
916
b15511bf 917sub _add_graphml_data {
918 my( $el, $key, $value ) = @_;
b15511bf 919 return unless defined $value;
c9bf3dbf 920 my $data_el = $el->addNewChild( $el->namespaceURI, 'data' );
b15511bf 921 $data_el->setAttribute( 'key', $key );
922 $data_el->appendText( $value );
8e1394aa 923}
924
4e5a7b2c 925=head2 as_csv
910a0a6d 926
927Returns a CSV alignment table representation of the collation graph, one
2c669bca 928row per witness (or witness uncorrected.)
910a0a6d 929
930=cut
931
932sub as_csv {
3a2ebbf4 933 my( $self ) = @_;
1dd07bda 934 my $table = $self->alignment_table;
910a0a6d 935 my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } );
936 my @result;
2c669bca 937 # Make the header row
938 $csv->combine( map { $_->{'witness'} } @{$table->{'alignment'}} );
939 push( @result, decode_utf8( $csv->string ) );
940 # Make the rest of the rows
941 foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
566f4595 942 my @rowobjs = map { $_->{'tokens'}->[$idx] } @{$table->{'alignment'}};
1dd07bda 943 my @row = map { $_ ? $_->{'t'}->text : $_ } @rowobjs;
2c669bca 944 $csv->combine( @row );
910a0a6d 945 push( @result, decode_utf8( $csv->string ) );
946 }
3a2ebbf4 947 return join( "\n", @result );
910a0a6d 948}
949
1dd07bda 950=head2 alignment_table( $use_refs, $include_witnesses )
2c669bca 951
566f4595 952Return a reference to an alignment table, in a slightly enhanced CollateX
953format which looks like this:
954
955 $table = { alignment => [ { witness => "SIGIL",
4e5a7b2c 956 tokens => [ { t => "TEXT" }, ... ] },
566f4595 957 { witness => "SIG2",
4e5a7b2c 958 tokens => [ { t => "TEXT" }, ... ] },
566f4595 959 ... ],
960 length => TEXTLEN };
961
962If $use_refs is set to 1, the reading object is returned in the table
963instead of READINGTEXT; if not, the text of the reading is returned.
4e5a7b2c 964
965If $include_witnesses is set to a hashref, only the witnesses whose sigil
566f4595 966keys have a true hash value will be included.
2c669bca 967
968=cut
9f3ba6f7 969
1dd07bda 970sub alignment_table {
971 my( $self ) = @_;
c1915ab9 972 $self->calculate_ranks() unless $self->_graphcalc_done;
1dd07bda 973 return $self->cached_table if $self->has_cached_table;
974
0ecb975c 975 # Make sure we can do this
976 throw( "Need a linear graph in order to make an alignment table" )
977 unless $self->linear;
978 $self->calculate_ranks unless $self->end->has_rank;
979
2c669bca 980 my $table = { 'alignment' => [], 'length' => $self->end->rank - 1 };
3a2ebbf4 981 my @all_pos = ( 1 .. $self->end->rank - 1 );
68454b71 982 foreach my $wit ( sort { $a->sigil cmp $b->sigil } $self->tradition->witnesses ) {
eca16057 983 # print STDERR "Making witness row(s) for " . $wit->sigil . "\n";
1f7aa795 984 my @wit_path = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
1dd07bda 985 my @row = _make_witness_row( \@wit_path, \@all_pos );
2c669bca 986 push( @{$table->{'alignment'}},
987 { 'witness' => $wit->sigil, 'tokens' => \@row } );
1f7aa795 988 if( $wit->is_layered ) {
989 my @wit_ac_path = $self->reading_sequence( $self->start, $self->end,
861c3e27 990 $wit->sigil.$self->ac_label );
1dd07bda 991 my @ac_row = _make_witness_row( \@wit_ac_path, \@all_pos );
2c669bca 992 push( @{$table->{'alignment'}},
993 { 'witness' => $wit->sigil.$self->ac_label, 'tokens' => \@ac_row } );
910a0a6d 994 }
995 }
1dd07bda 996 $self->cached_table( $table );
997 return $table;
910a0a6d 998}
999
1000sub _make_witness_row {
1dd07bda 1001 my( $path, $positions ) = @_;
910a0a6d 1002 my %char_hash;
1003 map { $char_hash{$_} = undef } @$positions;
2c669bca 1004 my $debug = 0;
910a0a6d 1005 foreach my $rdg ( @$path ) {
eca16057 1006 my $rtext = $rdg->text;
1007 $rtext = '#LACUNA#' if $rdg->is_lacuna;
2c669bca 1008 print STDERR "rank " . $rdg->rank . "\n" if $debug;
3a2ebbf4 1009 # print STDERR "No rank for " . $rdg->id . "\n" unless defined $rdg->rank;
1dd07bda 1010 $char_hash{$rdg->rank} = { 't' => $rdg };
910a0a6d 1011 }
1012 my @row = map { $char_hash{$_} } @$positions;
eca16057 1013 # Fill in lacuna markers for undef spots in the row
1014 my $last_el = shift @row;
1015 my @filled_row = ( $last_el );
1016 foreach my $el ( @row ) {
0e476982 1017 # If we are using node reference, make the lacuna node appear many times
1018 # in the table. If not, use the lacuna tag.
1dd07bda 1019 if( $last_el && $last_el->{'t'}->is_lacuna && !defined $el ) {
1020 $el = $last_el;
eca16057 1021 }
1022 push( @filled_row, $el );
1023 $last_el = $el;
1024 }
1025 return @filled_row;
910a0a6d 1026}
1027
4e5a7b2c 1028=head1 NAVIGATION METHODS
910a0a6d 1029
4e5a7b2c 1030=head2 reading_sequence( $first, $last, $sigil, $backup )
e2902068 1031
1032Returns the ordered list of readings, starting with $first and ending
4e5a7b2c 1033with $last, for the witness given in $sigil. If a $backup sigil is
1034specified (e.g. when walking a layered witness), it will be used wherever
1035no $sigil path exists. If there is a base text reading, that will be
1036used wherever no path exists for $sigil or $backup.
e2902068 1037
1038=cut
1039
910a0a6d 1040# TODO Think about returning some lazy-eval iterator.
b0b4421a 1041# TODO Get rid of backup; we should know from what witness is whether we need it.
910a0a6d 1042
e2902068 1043sub reading_sequence {
861c3e27 1044 my( $self, $start, $end, $witness ) = @_;
e2902068 1045
930ff666 1046 $witness = $self->baselabel unless $witness;
e2902068 1047 my @readings = ( $start );
1048 my %seen;
1049 my $n = $start;
3a2ebbf4 1050 while( $n && $n->id ne $end->id ) {
1051 if( exists( $seen{$n->id} ) ) {
63778331 1052 throw( "Detected loop for $witness at " . $n->id );
910a0a6d 1053 }
3a2ebbf4 1054 $seen{$n->id} = 1;
910a0a6d 1055
861c3e27 1056 my $next = $self->next_reading( $n, $witness );
44771cf2 1057 unless( $next ) {
63778331 1058 throw( "Did not find any path for $witness from reading " . $n->id );
44771cf2 1059 }
910a0a6d 1060 push( @readings, $next );
1061 $n = $next;
e2902068 1062 }
1063 # Check that the last reading is our end reading.
1064 my $last = $readings[$#readings];
63778331 1065 throw( "Last reading found from " . $start->text .
1066 " for witness $witness is not the end!" ) # TODO do we get this far?
3a2ebbf4 1067 unless $last->id eq $end->id;
e2902068 1068
1069 return @readings;
1070}
1071
4e5a7b2c 1072=head2 next_reading( $reading, $sigil );
8e1394aa 1073
4a8828f0 1074Returns the reading that follows the given reading along the given witness
930ff666 1075path.
8e1394aa 1076
1077=cut
1078
4a8828f0 1079sub next_reading {
e2902068 1080 # Return the successor via the corresponding path.
8e1394aa 1081 my $self = shift;
3a2ebbf4 1082 my $answer = $self->_find_linked_reading( 'next', @_ );
2c669bca 1083 return undef unless $answer;
3a2ebbf4 1084 return $self->reading( $answer );
8e1394aa 1085}
1086
4e5a7b2c 1087=head2 prior_reading( $reading, $sigil )
8e1394aa 1088
4a8828f0 1089Returns the reading that precedes the given reading along the given witness
930ff666 1090path.
8e1394aa 1091
1092=cut
1093
4a8828f0 1094sub prior_reading {
e2902068 1095 # Return the predecessor via the corresponding path.
8e1394aa 1096 my $self = shift;
3a2ebbf4 1097 my $answer = $self->_find_linked_reading( 'prior', @_ );
1098 return $self->reading( $answer );
8e1394aa 1099}
1100
4a8828f0 1101sub _find_linked_reading {
861c3e27 1102 my( $self, $direction, $node, $path ) = @_;
1103
1104 # Get a backup if we are dealing with a layered witness
1105 my $alt_path;
1106 my $aclabel = $self->ac_label;
1107 if( $path && $path =~ /^(.*)\Q$aclabel\E$/ ) {
1108 $alt_path = $1;
1109 }
1110
e2902068 1111 my @linked_paths = $direction eq 'next'
3a2ebbf4 1112 ? $self->sequence->edges_from( $node )
1113 : $self->sequence->edges_to( $node );
e2902068 1114 return undef unless scalar( @linked_paths );
8e1394aa 1115
e2902068 1116 # We have to find the linked path that contains all of the
1117 # witnesses supplied in $path.
1118 my( @path_wits, @alt_path_wits );
4e5a7b2c 1119 @path_wits = sort( $self->_witnesses_of_label( $path ) ) if $path;
1120 @alt_path_wits = sort( $self->_witnesses_of_label( $alt_path ) ) if $alt_path;
e2902068 1121 my $base_le;
1122 my $alt_le;
1123 foreach my $le ( @linked_paths ) {
3a2ebbf4 1124 if( $self->sequence->has_edge_attribute( @$le, $self->baselabel ) ) {
910a0a6d 1125 $base_le = $le;
910a0a6d 1126 }
508fd430 1127 my @le_wits = sort $self->path_witnesses( $le );
3a2ebbf4 1128 if( _is_within( \@path_wits, \@le_wits ) ) {
1129 # This is the right path.
1130 return $direction eq 'next' ? $le->[1] : $le->[0];
1131 } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) {
1132 $alt_le = $le;
1133 }
8e1394aa 1134 }
e2902068 1135 # Got this far? Return the alternate path if it exists.
3a2ebbf4 1136 return $direction eq 'next' ? $alt_le->[1] : $alt_le->[0]
910a0a6d 1137 if $alt_le;
e2902068 1138
1139 # Got this far? Return the base path if it exists.
3a2ebbf4 1140 return $direction eq 'next' ? $base_le->[1] : $base_le->[0]
910a0a6d 1141 if $base_le;
e2902068 1142
1143 # Got this far? We have no appropriate path.
2c669bca 1144 warn "Could not find $direction node from " . $node->id
910a0a6d 1145 . " along path $path";
8e1394aa 1146 return undef;
1147}
1148
4a8828f0 1149# Some set logic.
1150sub _is_within {
1151 my( $set1, $set2 ) = @_;
7854e12e 1152 my $ret = @$set1; # will be 0, i.e. false, if set1 is empty
4a8828f0 1153 foreach my $el ( @$set1 ) {
910a0a6d 1154 $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
4a8828f0 1155 }
1156 return $ret;
1157}
1158
4e5a7b2c 1159# Return the string that joins together a list of witnesses for
1160# display on a single path.
1161sub _witnesses_of_label {
1162 my( $self, $label ) = @_;
1163 my $regex = $self->wit_list_separator;
1164 my @answer = split( /\Q$regex\E/, $label );
1165 return @answer;
b0b4421a 1166}
1167
d4b75f44 1168=head2 common_readings
1169
1170Returns the list of common readings in the graph (i.e. those readings that are
1171shared by all non-lacunose witnesses.)
1172
1173=cut
1174
1175sub common_readings {
1176 my $self = shift;
1177 my @common = grep { $_->is_common } $self->readings;
1178 return @common;
1179}
1180
b0b4421a 1181=head2 path_text( $sigil, $mainsigil [, $start, $end ] )
1182
1183Returns the text of a witness (plus its backup, if we are using a layer)
1184as stored in the collation. The text is returned as a string, where the
1185individual readings are joined with spaces and the meta-readings (e.g.
1186lacunae) are omitted. Optional specification of $start and $end allows
1187the generation of a subset of the witness text.
4e5a7b2c 1188
b0b4421a 1189=cut
1190
1191sub path_text {
861c3e27 1192 my( $self, $wit, $start, $end ) = @_;
b0b4421a 1193 $start = $self->start unless $start;
1194 $end = $self->end unless $end;
861c3e27 1195 my @path = grep { !$_->is_meta } $self->reading_sequence( $start, $end, $wit );
b0b4421a 1196 return join( ' ', map { $_->text } @path );
1197}
4e5a7b2c 1198
1199=head1 INITIALIZATION METHODS
1200
1201These are mostly for use by parsers.
1202
1203=head2 make_witness_path( $witness )
1204
1205Link the array of readings contained in $witness->path (and in
1206$witness->uncorrected_path if it exists) into collation paths.
1207Clear out the arrays when finished.
de51424a 1208
4e5a7b2c 1209=head2 make_witness_paths
1210
1211Call make_witness_path for all witnesses in the tradition.
1212
1213=cut
930ff666 1214
7e450e44 1215# For use when a collation is constructed from a base text and an apparatus.
1216# We have the sequences of readings and just need to add path edges.
1f7aa795 1217# When we are done, clear out the witness path attributes, as they are no
1218# longer needed.
1219# TODO Find a way to replace the witness path attributes with encapsulated functions?
e2902068 1220
6a222840 1221sub make_witness_paths {
1222 my( $self ) = @_;
910a0a6d 1223 foreach my $wit ( $self->tradition->witnesses ) {
0068967c 1224 # print STDERR "Making path for " . $wit->sigil . "\n";
910a0a6d 1225 $self->make_witness_path( $wit );
7854e12e 1226 }
7854e12e 1227}
1228
6a222840 1229sub make_witness_path {
7854e12e 1230 my( $self, $wit ) = @_;
1231 my @chain = @{$wit->path};
15d2d3df 1232 my $sig = $wit->sigil;
7854e12e 1233 foreach my $idx ( 0 .. $#chain-1 ) {
910a0a6d 1234 $self->add_path( $chain[$idx], $chain[$idx+1], $sig );
7854e12e 1235 }
1f7aa795 1236 if( $wit->is_layered ) {
d9e873d0 1237 @chain = @{$wit->uncorrected_path};
1238 foreach my $idx( 0 .. $#chain-1 ) {
1239 my $source = $chain[$idx];
1240 my $target = $chain[$idx+1];
1241 $self->add_path( $source, $target, $sig.$self->ac_label )
1242 unless $self->has_path( $source, $target, $sig );
1243 }
15d2d3df 1244 }
1f7aa795 1245 $wit->clear_path;
1246 $wit->clear_uncorrected_path;
e2902068 1247}
1248
4e5a7b2c 1249=head2 calculate_ranks
1250
1251Calculate the reading ranks (that is, their aligned positions relative
1252to each other) for the graph. This can only be called on linear collations.
1253
b365fbae 1254=begin testing
1255
1256use Text::Tradition;
1257
1258my $cxfile = 't/data/Collatex-16.xml';
1259my $t = Text::Tradition->new(
1260 'name' => 'inline',
1261 'input' => 'CollateX',
1262 'file' => $cxfile,
1263 );
1264my $c = $t->collation;
1265
1266# Make an svg
bfcbcecb 1267my $table = $c->alignment_table;
1268ok( $c->has_cached_table, "Alignment table was cached" );
1269is( $c->alignment_table, $table, "Cached table returned upon second call" );
b365fbae 1270$c->calculate_ranks;
bfcbcecb 1271is( $c->alignment_table, $table, "Cached table retained with no rank change" );
b365fbae 1272$c->add_relationship( 'n9', 'n23', { 'type' => 'spelling' } );
bfcbcecb 1273isnt( $c->alignment_table, $table, "Alignment table changed after relationship add" );
b365fbae 1274
1275=end testing
1276
4e5a7b2c 1277=cut
1278
910a0a6d 1279sub calculate_ranks {
1280 my $self = shift;
b365fbae 1281 # Save the existing ranks, in case we need to invalidate the cached SVG.
1282 my %existing_ranks;
910a0a6d 1283 # Walk a version of the graph where every node linked by a relationship
1284 # edge is fundamentally the same node, and do a topological ranking on
1285 # the nodes in this graph.
c9bf3dbf 1286 my $topo_graph = Graph->new();
910a0a6d 1287 my %rel_containers;
1288 my $rel_ctr = 0;
1289 # Add the nodes
1290 foreach my $r ( $self->readings ) {
3a2ebbf4 1291 next if exists $rel_containers{$r->id};
910a0a6d 1292 my @rels = $r->related_readings( 'colocated' );
1293 if( @rels ) {
1294 # Make a relationship container.
1295 push( @rels, $r );
c9bf3dbf 1296 my $rn = 'rel_container_' . $rel_ctr++;
1297 $topo_graph->add_vertex( $rn );
910a0a6d 1298 foreach( @rels ) {
3a2ebbf4 1299 $rel_containers{$_->id} = $rn;
910a0a6d 1300 }
1301 } else {
1302 # Add a new node to mirror the old node.
3a2ebbf4 1303 $rel_containers{$r->id} = $r->id;
1304 $topo_graph->add_vertex( $r->id );
910a0a6d 1305 }
4a8828f0 1306 }
3a1f2523 1307
3a2ebbf4 1308 # Add the edges.
910a0a6d 1309 foreach my $r ( $self->readings ) {
b365fbae 1310 $existing_ranks{$r} = $r->rank;
3a2ebbf4 1311 foreach my $n ( $self->sequence->successors( $r->id ) ) {
1312 my( $tfrom, $tto ) = ( $rel_containers{$r->id},
1313 $rel_containers{$n} );
4e5a7b2c 1314 # $DB::single = 1 unless $tfrom && $tto;
3a2ebbf4 1315 $topo_graph->add_edge( $tfrom, $tto );
910a0a6d 1316 }
1317 }
1318
1319 # Now do the rankings, starting with the start node.
3a2ebbf4 1320 my $topo_start = $rel_containers{$self->start->id};
c9bf3dbf 1321 my $node_ranks = { $topo_start => 0 };
910a0a6d 1322 my @curr_origin = ( $topo_start );
1323 # A little iterative function.
1324 while( @curr_origin ) {
c9bf3dbf 1325 @curr_origin = _assign_rank( $topo_graph, $node_ranks, @curr_origin );
910a0a6d 1326 }
1327 # Transfer our rankings from the topological graph to the real one.
1328 foreach my $r ( $self->readings ) {
3a2ebbf4 1329 if( defined $node_ranks->{$rel_containers{$r->id}} ) {
1330 $r->rank( $node_ranks->{$rel_containers{$r->id}} );
67da8d6c 1331 } else {
63778331 1332 # Die. Find the last rank we calculated.
1333 my @all_defined = sort { $node_ranks->{$rel_containers{$a->id}}
1334 <=> $node_ranks->{$rel_containers{$b->id}} }
1335 $self->readings;
1336 my $last = pop @all_defined;
1337 throw( "Ranks not calculated after $last - do you have a cycle in the graph?" );
67da8d6c 1338 }
de51424a 1339 }
bfcbcecb 1340 # Do we need to invalidate the cached data?
1341 if( $self->has_cached_svg || $self->has_cached_table ) {
b365fbae 1342 foreach my $r ( $self->readings ) {
1343 next if $existing_ranks{$r} == $r->rank;
c1915ab9 1344 # Something has changed, so clear the cache
bfcbcecb 1345 $self->_clear_cache;
c1915ab9 1346 # ...and recalculate the common readings.
1347 $self->calculate_common_readings();
b365fbae 1348 last;
1349 }
1350 }
c1915ab9 1351 # The graph calculation information is now up to date.
1352 $self->_graphcalc_done(1);
8e1394aa 1353}
3a1f2523 1354
910a0a6d 1355sub _assign_rank {
c9bf3dbf 1356 my( $graph, $node_ranks, @current_nodes ) = @_;
910a0a6d 1357 # Look at each of the children of @current_nodes. If all the child's
1358 # parents have a rank, assign it the highest rank + 1 and add it to
c9bf3dbf 1359 # @next_nodes. Otherwise skip it; we will return when the highest-ranked
1360 # parent gets a rank.
910a0a6d 1361 my @next_nodes;
1362 foreach my $c ( @current_nodes ) {
c9bf3dbf 1363 warn "Current reading $c has no rank!"
1364 unless exists $node_ranks->{$c};
1365 # print STDERR "Looking at child of node $c, rank "
1366 # . $node_ranks->{$c} . "\n";
1367 foreach my $child ( $graph->successors( $c ) ) {
1368 next if exists $node_ranks->{$child};
910a0a6d 1369 my $highest_rank = -1;
1370 my $skip = 0;
c9bf3dbf 1371 foreach my $parent ( $graph->predecessors( $child ) ) {
1372 if( exists $node_ranks->{$parent} ) {
1373 $highest_rank = $node_ranks->{$parent}
1374 if $highest_rank <= $node_ranks->{$parent};
910a0a6d 1375 } else {
1376 $skip = 1;
1377 last;
1378 }
1379 }
1380 next if $skip;
c9bf3dbf 1381 my $c_rank = $highest_rank + 1;
1382 # print STDERR "Assigning rank $c_rank to node $child \n";
1383 $node_ranks->{$child} = $c_rank;
910a0a6d 1384 push( @next_nodes, $child );
1385 }
1386 }
1387 return @next_nodes;
4cdd82f1 1388}
910a0a6d 1389
c1915ab9 1390sub _clear_cache {
1391 my $self = shift;
1392 $self->wipe_svg if $self->has_cached_svg;
1393 $self->wipe_table if $self->has_cached_table;
1394}
1395
1396
4e5a7b2c 1397=head2 flatten_ranks
1398
1399A convenience method for parsing collation data. Searches the graph for readings
1400with the same text at the same rank, and merges any that are found.
1401
1402=cut
1403
0e476982 1404sub flatten_ranks {
1405 my $self = shift;
1406 my %unique_rank_rdg;
1407 foreach my $rdg ( $self->readings ) {
1408 next unless $rdg->has_rank;
1409 my $key = $rdg->rank . "||" . $rdg->text;
1410 if( exists $unique_rank_rdg{$key} ) {
1411 # Combine!
56eefa04 1412 # print STDERR "Combining readings at same rank: $key\n";
0e476982 1413 $self->merge_readings( $unique_rank_rdg{$key}, $rdg );
0ecb975c 1414 # TODO see if this now makes a common point.
0e476982 1415 } else {
1416 $unique_rank_rdg{$key} = $rdg;
1417 }
1418 }
1419}
4633f9e4 1420
1421
d4b75f44 1422=head2 calculate_common_readings
1423
1424Goes through the graph identifying the readings that appear in every witness
1425(apart from those with lacunae at that spot.) Marks them as common and returns
1426the list.
1427
1428=begin testing
1429
1430use Text::Tradition;
1431
1432my $cxfile = 't/data/Collatex-16.xml';
1433my $t = Text::Tradition->new(
1434 'name' => 'inline',
1435 'input' => 'CollateX',
1436 'file' => $cxfile,
1437 );
1438my $c = $t->collation;
1439
1440my @common = $c->calculate_common_readings();
1441is( scalar @common, 8, "Found correct number of common readings" );
1442my @marked = sort $c->common_readings();
1443is( scalar @common, 8, "All common readings got marked as such" );
1444my @expected = qw/ n1 n12 n16 n19 n20 n5 n6 n7 /;
1445is_deeply( \@marked, \@expected, "Found correct list of common readings" );
1446
1447=end testing
1448
1449=cut
1450
1451sub calculate_common_readings {
1452 my $self = shift;
1453 my @common;
c1915ab9 1454 map { $_->is_common( 0 ) } $self->readings;
1455 # Implicitly calls calculate_ranks
1dd07bda 1456 my $table = $self->alignment_table;
d4b75f44 1457 foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
7f52eac8 1458 my @row = map { $_->{'tokens'}->[$idx]
1459 ? $_->{'tokens'}->[$idx]->{'t'} : '' }
1460 @{$table->{'alignment'}};
d4b75f44 1461 my %hash;
1462 foreach my $r ( @row ) {
1463 if( $r ) {
1464 $hash{$r->id} = $r unless $r->is_meta;
1465 } else {
1466 $hash{'UNDEF'} = $r;
1467 }
1468 }
1469 if( keys %hash == 1 && !exists $hash{'UNDEF'} ) {
1470 my( $r ) = values %hash;
1471 $r->is_common( 1 );
1472 push( @common, $r );
1473 }
1474 }
1475 return @common;
1476}
1477
861c3e27 1478=head2 text_from_paths
1479
1480Calculate the text array for all witnesses from the path, for later consistency
1481checking. Only to be used if there is no non-graph-based way to know the
1482original texts.
1483
1484=cut
1485
1486sub text_from_paths {
1487 my $self = shift;
1488 foreach my $wit ( $self->tradition->witnesses ) {
1489 my @text = split( /\s+/,
1490 $self->reading_sequence( $self->start, $self->end, $wit->sigil ) );
1491 $wit->text( \@text );
1492 if( $wit->is_layered ) {
1493 my @uctext = split( /\s+/,
1494 $self->reading_sequence( $self->start, $self->end,
1495 $wit->sigil.$self->ac_label ) );
1496 $wit->text( \@uctext );
1497 }
1498 }
1499}
0e476982 1500
4e5a7b2c 1501=head1 UTILITY FUNCTIONS
1502
1503=head2 common_predecessor( $reading_a, $reading_b )
8e1394aa 1504
4e5a7b2c 1505Find the last reading that occurs in sequence before both the given readings.
1506
1507=head2 common_successor( $reading_a, $reading_b )
1508
1509Find the first reading that occurs in sequence after both the given readings.
1510
22222af9 1511=begin testing
1512
1513use Text::Tradition;
1514
1515my $cxfile = 't/data/Collatex-16.xml';
1516my $t = Text::Tradition->new(
1517 'name' => 'inline',
1518 'input' => 'CollateX',
1519 'file' => $cxfile,
1520 );
1521my $c = $t->collation;
1522
4e5a7b2c 1523is( $c->common_predecessor( 'n9', 'n23' )->id,
22222af9 1524 'n20', "Found correct common predecessor" );
4e5a7b2c 1525is( $c->common_successor( 'n9', 'n23' )->id,
22222af9 1526 '#END#', "Found correct common successor" );
1527
4e5a7b2c 1528is( $c->common_predecessor( 'n19', 'n17' )->id,
22222af9 1529 'n16', "Found correct common predecessor for readings on same path" );
4e5a7b2c 1530is( $c->common_successor( 'n21', 'n26' )->id,
22222af9 1531 '#END#', "Found correct common successor for readings on same path" );
1532
1533=end testing
1534
1535=cut
1536
1537## Return the closest reading that is a predecessor of both the given readings.
1538sub common_predecessor {
1539 my $self = shift;
4e5a7b2c 1540 my( $r1, $r2 ) = $self->_objectify_args( @_ );
027d819c 1541 return $self->_common_in_path( $r1, $r2, 'predecessors' );
22222af9 1542}
1543
1544sub common_successor {
1545 my $self = shift;
4e5a7b2c 1546 my( $r1, $r2 ) = $self->_objectify_args( @_ );
027d819c 1547 return $self->_common_in_path( $r1, $r2, 'successors' );
22222af9 1548}
1549
027d819c 1550sub _common_in_path {
22222af9 1551 my( $self, $r1, $r2, $dir ) = @_;
1552 my $iter = $r1->rank > $r2->rank ? $r1->rank : $r2->rank;
1553 $iter = $self->end->rank - $iter if $dir eq 'successors';
1554 my @candidates;
1555 my @last_checked = ( $r1, $r2 );
1556 my %all_seen;
1557 while( !@candidates ) {
1558 my @new_lc;
1559 foreach my $lc ( @last_checked ) {
1560 foreach my $p ( $lc->$dir ) {
1561 if( $all_seen{$p->id} ) {
1562 push( @candidates, $p );
1563 } else {
1564 $all_seen{$p->id} = 1;
1565 push( @new_lc, $p );
1566 }
1567 }
1568 }
1569 @last_checked = @new_lc;
1570 }
1571 my @answer = sort { $a->rank <=> $b->rank } @candidates;
1572 return $dir eq 'predecessors' ? pop( @answer ) : shift ( @answer );
1573}
1574
63778331 1575sub throw {
1576 Text::Tradition::Error->throw(
1577 'ident' => 'Collation error',
1578 'message' => $_[0],
1579 );
1580}
1581
dd3b58b0 1582no Moose;
1583__PACKAGE__->meta->make_immutable;
e867486f 1584
027d819c 1585=head1 LICENSE
e867486f 1586
027d819c 1587This package is free software and is provided "as is" without express
1588or implied warranty. You can redistribute it and/or modify it under
1589the same terms as Perl itself.
e867486f 1590
027d819c 1591=head1 AUTHOR
e867486f 1592
027d819c 1593Tara L Andrews E<lt>aurum@cpan.orgE<gt>