refactor GraphML write/parse to use Moose introspection
[scpubgit/stemmatology.git] / lib / Text / Tradition / Collation.pm
CommitLineData
dd3b58b0 1package Text::Tradition::Collation;
d047cd52 2
910a0a6d 3use Encode qw( decode_utf8 );
4use File::Temp;
bfcbcecb 5use File::Which;
c9bf3dbf 6use Graph;
8e1394aa 7use IPC::Run qw( run binary );
910a0a6d 8use Text::CSV_XS;
b15511bf 9use Text::Tradition::Collation::Reading;
22222af9 10use Text::Tradition::Collation::RelationshipStore;
63778331 11use Text::Tradition::Error;
df6d9812 12use XML::LibXML;
a344ffcf 13use XML::LibXML::XPathContext;
dd3b58b0 14use Moose;
15
3a2ebbf4 16has 'sequence' => (
d047cd52 17 is => 'ro',
3a2ebbf4 18 isa => 'Graph',
19 default => sub { Graph->new() },
d047cd52 20 handles => {
3a2ebbf4 21 paths => 'edges',
d047cd52 22 },
d047cd52 23 );
3a2ebbf4 24
25has 'relations' => (
26 is => 'ro',
22222af9 27 isa => 'Text::Tradition::Collation::RelationshipStore',
28 handles => {
29 relationships => 'relationships',
30 related_readings => 'related_readings',
202ccb18 31 get_relationship => 'get_relationship',
ee801e17 32 del_relationship => 'del_relationship',
22222af9 33 },
34 writer => '_set_relations',
3a2ebbf4 35 );
dd3b58b0 36
3a2ebbf4 37has 'tradition' => (
38 is => 'ro',
d047cd52 39 isa => 'Text::Tradition',
8d9a1cd8 40 weak_ref => 1,
d047cd52 41 );
dd3b58b0 42
3a2ebbf4 43has 'readings' => (
44 isa => 'HashRef[Text::Tradition::Collation::Reading]',
45 traits => ['Hash'],
46 handles => {
47 reading => 'get',
48 _add_reading => 'set',
49 del_reading => 'delete',
50 has_reading => 'exists',
51 readings => 'values',
52 },
53 default => sub { {} },
54 );
910a0a6d 55
4a8828f0 56has 'wit_list_separator' => (
7854e12e 57 is => 'rw',
58 isa => 'Str',
59 default => ', ',
60 );
61
62has 'baselabel' => (
63 is => 'rw',
64 isa => 'Str',
65 default => 'base text',
66 );
4a8828f0 67
15d2d3df 68has 'linear' => (
69 is => 'rw',
70 isa => 'Bool',
71 default => 1,
72 );
c84275ff 73
ef9d481f 74has 'ac_label' => (
75 is => 'rw',
76 isa => 'Str',
77 default => ' (a.c.)',
78 );
3a2ebbf4 79
4e483aa5 80has 'wordsep' => (
81 is => 'rw',
82 isa => 'Str',
83 default => ' ',
84 );
85
3a2ebbf4 86has 'start' => (
87 is => 'ro',
88 isa => 'Text::Tradition::Collation::Reading',
89 writer => '_set_start',
90 weak_ref => 1,
91 );
92
93has 'end' => (
94 is => 'ro',
95 isa => 'Text::Tradition::Collation::Reading',
96 writer => '_set_end',
97 weak_ref => 1,
98 );
b365fbae 99
100has 'cached_svg' => (
101 is => 'rw',
102 isa => 'Str',
103 predicate => 'has_cached_svg',
104 clearer => 'wipe_svg',
105 );
1dd07bda 106
107has 'cached_table' => (
108 is => 'rw',
109 isa => 'HashRef',
110 predicate => 'has_cached_table',
111 clearer => 'wipe_table',
112 );
c1915ab9 113
114has '_graphcalc_done' => (
115 is => 'rw',
116 isa => 'Bool',
117 default => undef,
118 );
1f563ac3 119
4e5a7b2c 120=head1 NAME
121
122Text::Tradition::Collation - a software model for a text collation
123
124=head1 SYNOPSIS
125
126 use Text::Tradition;
127 my $t = Text::Tradition->new(
128 'name' => 'this is a text',
129 'input' => 'TEI',
130 'file' => '/path/to/tei_parallel_seg_file.xml' );
131
132 my $c = $t->collation;
133 my @readings = $c->readings;
134 my @paths = $c->paths;
135 my @relationships = $c->relationships;
136
137 my $svg_variant_graph = $t->collation->as_svg();
138
139=head1 DESCRIPTION
140
141Text::Tradition is a library for representation and analysis of collated
142texts, particularly medieval ones. The Collation is the central feature of
143a Tradition, where the text, its sequence of readings, and its relationships
144between readings are actually kept.
145
146=head1 CONSTRUCTOR
147
148=head2 new
149
150The constructor. Takes a hash or hashref of the following arguments:
151
152=over
153
154=item * tradition - The Text::Tradition object to which the collation
155belongs. Required.
156
157=item * linear - Whether the collation should be linear; that is, whether
158transposed readings should be treated as two linked readings rather than one,
159and therefore whether the collation graph is acyclic. Defaults to true.
160
4e5a7b2c 161=item * baselabel - The default label for the path taken by a base text
162(if any). Defaults to 'base text'.
163
164=item * wit_list_separator - The string to join a list of witnesses for
165purposes of making labels in display graphs. Defaults to ', '.
166
167=item * ac_label - The extra label to tack onto a witness sigil when
168representing another layer of path for the given witness - that is, when
169a text has more than one possible reading due to scribal corrections or
170the like. Defaults to ' (a.c.)'.
171
4e483aa5 172=item * wordsep - The string used to separate words in the original text.
173Defaults to ' '.
174
4e5a7b2c 175=back
176
177=head1 ACCESSORS
178
179=head2 tradition
180
181=head2 linear
182
4e5a7b2c 183=head2 wit_list_separator
184
185=head2 baselabel
186
187=head2 ac_label
188
4e483aa5 189=head2 wordsep
190
4e5a7b2c 191Simple accessors for collation attributes.
192
193=head2 start
194
195The meta-reading at the start of every witness path.
196
197=head2 end
198
199The meta-reading at the end of every witness path.
200
201=head2 readings
202
203Returns all Reading objects in the graph.
204
205=head2 reading( $id )
206
207Returns the Reading object corresponding to the given ID.
208
209=head2 add_reading( $reading_args )
210
211Adds a new reading object to the collation.
212See L<Text::Tradition::Collation::Reading> for the available arguments.
213
214=head2 del_reading( $object_or_id )
215
216Removes the given reading from the collation, implicitly removing its
217paths and relationships.
218
4e483aa5 219=head2 merge_readings( $main, $second, $concatenate, $with_str )
220
221Merges the $second reading into the $main one. If $concatenate is true, then
222the merged node will carry the text of both readings, concatenated with either
223$with_str (if specified) or a sensible default (the empty string if the
224appropriate 'join_*' flag is set on either reading, or else $self->wordsep.)
4e5a7b2c 225
4e483aa5 226The first two arguments may be either readings or reading IDs.
4e5a7b2c 227
228=head2 has_reading( $id )
229
230Predicate to see whether a given reading ID is in the graph.
231
232=head2 reading_witnesses( $object_or_id )
233
234Returns a list of sigils whose witnesses contain the reading.
235
236=head2 paths
237
238Returns all reading paths within the document - that is, all edges in the
239collation graph. Each path is an arrayref of [ $source, $target ] reading IDs.
240
241=head2 add_path( $source, $target, $sigil )
242
243Links the given readings in the collation in sequence, under the given witness
244sigil. The readings may be specified by object or ID.
245
246=head2 del_path( $source, $target, $sigil )
247
248Links the given readings in the collation in sequence, under the given witness
249sigil. The readings may be specified by object or ID.
250
251=head2 has_path( $source, $target );
252
253Returns true if the two readings are linked in sequence in any witness.
254The readings may be specified by object or ID.
255
256=head2 relationships
257
258Returns all Relationship objects in the collation.
259
260=head2 add_relationship( $reading, $other_reading, $options )
261
262Adds a new relationship of the type given in $options between the two readings,
263which may be specified by object or ID. Returns a value of ( $status, @vectors)
264where $status is true on success, and @vectors is a list of relationship edges
265that were ultimately added.
266See L<Text::Tradition::Collation::Relationship> for the available options.
267
268=cut
dd3b58b0 269
d047cd52 270sub BUILD {
3a2ebbf4 271 my $self = shift;
22222af9 272 $self->_set_relations( Text::Tradition::Collation::RelationshipStore->new( 'collation' => $self ) );
3a2ebbf4 273 $self->_set_start( $self->add_reading( { 'collation' => $self, 'is_start' => 1 } ) );
274 $self->_set_end( $self->add_reading( { 'collation' => $self, 'is_end' => 1 } ) );
d047cd52 275}
784877d9 276
3a2ebbf4 277### Reading construct/destruct functions
278
279sub add_reading {
280 my( $self, $reading ) = @_;
281 unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
282 my %args = %$reading;
283 $reading = Text::Tradition::Collation::Reading->new(
284 'collation' => $self,
285 %args );
286 }
287 # First check to see if a reading with this ID exists.
288 if( $self->reading( $reading->id ) ) {
63778331 289 throw( "Collation already has a reading with id " . $reading->id );
3a2ebbf4 290 }
c1915ab9 291 $self->_graphcalc_done(0);
3a2ebbf4 292 $self->_add_reading( $reading->id => $reading );
293 # Once the reading has been added, put it in both graphs.
294 $self->sequence->add_vertex( $reading->id );
22222af9 295 $self->relations->add_reading( $reading->id );
3a2ebbf4 296 return $reading;
eca16057 297};
298
3a2ebbf4 299around del_reading => sub {
300 my $orig = shift;
301 my $self = shift;
302 my $arg = shift;
303
304 if( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
305 $arg = $arg->id;
306 }
3a2ebbf4 307 # Remove the reading from the graphs.
c1915ab9 308 $self->_graphcalc_done(0);
4e483aa5 309 $self->_clear_cache; # Explicitly clear caches to GC the reading
3a2ebbf4 310 $self->sequence->delete_vertex( $arg );
22222af9 311 $self->relations->delete_reading( $arg );
3a2ebbf4 312
313 # Carry on.
314 $self->$orig( $arg );
315};
7854e12e 316
4e483aa5 317=begin testing
318
319use Text::Tradition;
320
321my $cxfile = 't/data/Collatex-16.xml';
322my $t = Text::Tradition->new(
323 'name' => 'inline',
324 'input' => 'CollateX',
325 'file' => $cxfile,
326 );
327my $c = $t->collation;
328
329my $rno = scalar $c->readings;
330# Split n21 for testing purposes
331my $new_r = $c->add_reading( { 'id' => 'n21p0', 'text' => 'un', 'join_next' => 1 } );
332my $old_r = $c->reading( 'n21' );
333$old_r->alter_text( 'to' );
334$c->del_path( 'n20', 'n21', 'A' );
335$c->add_path( 'n20', 'n21p0', 'A' );
336$c->add_path( 'n21p0', 'n21', 'A' );
337$c->flatten_ranks();
338ok( $c->reading( 'n21p0' ), "New reading exists" );
339is( scalar $c->readings, $rno, "Reading add offset by flatten_ranks" );
340
341# Combine n3 and n4
342$c->merge_readings( 'n3', 'n4', 1 );
343ok( !$c->reading('n4'), "Reading n4 is gone" );
344is( $c->reading('n3')->text, 'with his', "Reading n3 has both words" );
345
346# Collapse n25 and n26
347$c->merge_readings( 'n25', 'n26' );
348ok( !$c->reading('n26'), "Reading n26 is gone" );
349is( $c->reading('n25')->text, 'rood', "Reading n25 has an unchanged word" );
350
351# Combine n21 and n21p0
352my $remaining = $c->reading('n21');
353$remaining ||= $c->reading('n22'); # one of these should still exist
354$c->merge_readings( 'n21p0', $remaining, 1 );
355ok( !$c->reading('n21'), "Reading $remaining is gone" );
356is( $c->reading('n21p0')->text, 'unto', "Reading n21p0 merged correctly" );
357
358=end testing
359
360=cut
7854e12e 361
3a2ebbf4 362sub merge_readings {
363 my $self = shift;
364
365 # We only need the IDs for adding paths to the graph, not the reading
366 # objects themselves.
4e483aa5 367 my( $kept, $deleted, $combine, $combine_char ) = $self->_stringify_args( @_ );
c1915ab9 368 $self->_graphcalc_done(0);
3a2ebbf4 369
370 # The kept reading should inherit the paths and the relationships
371 # of the deleted reading.
372 foreach my $path ( $self->sequence->edges_at( $deleted ) ) {
373 my @vector = ( $kept );
374 push( @vector, $path->[1] ) if $path->[0] eq $deleted;
375 unshift( @vector, $path->[0] ) if $path->[1] eq $deleted;
49d4f2ac 376 next if $vector[0] eq $vector[1]; # Don't add a self loop
3a2ebbf4 377 my %wits = %{$self->sequence->get_edge_attributes( @$path )};
378 $self->sequence->add_edge( @vector );
379 my $fwits = $self->sequence->get_edge_attributes( @vector );
380 @wits{keys %$fwits} = values %$fwits;
381 $self->sequence->set_edge_attributes( @vector, \%wits );
382 }
22222af9 383 $self->relations->merge_readings( $kept, $deleted, $combine_char );
3a2ebbf4 384
385 # Do the deletion deed.
4e483aa5 386 if( $combine ) {
49d4f2ac 387 my $kept_obj = $self->reading( $kept );
4e483aa5 388 my $del_obj = $self->reading( $deleted );
389 my $joinstr = $combine_char;
390 unless( defined $joinstr ) {
391 $joinstr = '' if $kept_obj->join_next || $del_obj->join_prior;
392 $joinstr = $self->wordsep unless defined $joinstr;
393 }
394 $kept_obj->alter_text( join( $joinstr, $kept_obj->text, $del_obj->text ) );
49d4f2ac 395 }
3a2ebbf4 396 $self->del_reading( $deleted );
397}
7854e12e 398
3265b0ce 399
3a2ebbf4 400# Helper function for manipulating the graph.
401sub _stringify_args {
4e483aa5 402 my( $self, $first, $second, @args ) = @_;
3a2ebbf4 403 $first = $first->id
404 if ref( $first ) eq 'Text::Tradition::Collation::Reading';
405 $second = $second->id
406 if ref( $second ) eq 'Text::Tradition::Collation::Reading';
4e483aa5 407 return( $first, $second, @args );
3a2ebbf4 408}
df6d9812 409
4e5a7b2c 410# Helper function for manipulating the graph.
411sub _objectify_args {
412 my( $self, $first, $second, $arg ) = @_;
413 $first = $self->reading( $first )
414 unless ref( $first ) eq 'Text::Tradition::Collation::Reading';
415 $second = $self->reading( $second )
416 unless ref( $second ) eq 'Text::Tradition::Collation::Reading';
417 return( $first, $second, $arg );
418}
3a2ebbf4 419### Path logic
420
421sub add_path {
422 my $self = shift;
423
424 # We only need the IDs for adding paths to the graph, not the reading
425 # objects themselves.
426 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
427
c1915ab9 428 $self->_graphcalc_done(0);
3a2ebbf4 429 # Connect the readings
430 $self->sequence->add_edge( $source, $target );
431 # Note the witness in question
432 $self->sequence->set_edge_attribute( $source, $target, $wit, 1 );
b15511bf 433};
434
3a2ebbf4 435sub del_path {
436 my $self = shift;
49d4f2ac 437 my @args;
438 if( ref( $_[0] ) eq 'ARRAY' ) {
439 my $e = shift @_;
440 @args = ( @$e, @_ );
441 } else {
442 @args = @_;
443 }
3a2ebbf4 444
445 # We only need the IDs for adding paths to the graph, not the reading
446 # objects themselves.
49d4f2ac 447 my( $source, $target, $wit ) = $self->_stringify_args( @args );
3a2ebbf4 448
c1915ab9 449 $self->_graphcalc_done(0);
3a2ebbf4 450 if( $self->sequence->has_edge_attribute( $source, $target, $wit ) ) {
49d4f2ac 451 $self->sequence->delete_edge_attribute( $source, $target, $wit );
3a2ebbf4 452 }
453 unless( keys %{$self->sequence->get_edge_attributes( $source, $target )} ) {
454 $self->sequence->delete_edge( $source, $target );
455 }
784877d9 456}
457
3a2ebbf4 458
15d2d3df 459# Extra graph-alike utility
460sub has_path {
3a2ebbf4 461 my $self = shift;
462 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
463 return undef unless $self->sequence->has_edge( $source, $target );
464 return $self->sequence->has_edge_attribute( $source, $target, $wit );
b15511bf 465}
466
4e5a7b2c 467=head2 clear_witness( @sigil_list )
3a2ebbf4 468
4e5a7b2c 469Clear the given witnesses out of the collation entirely, removing references
470to them in paths, and removing readings that belong only to them. Should only
471be called via $tradition->del_witness.
3a2ebbf4 472
473=cut
474
4e5a7b2c 475sub clear_witness {
476 my( $self, @sigils ) = @_;
477
c1915ab9 478 $self->_graphcalc_done(0);
4e5a7b2c 479 # Clear the witness(es) out of the paths
480 foreach my $e ( $self->paths ) {
481 foreach my $sig ( @sigils ) {
482 $self->del_path( $e, $sig );
483 }
484 }
485
486 # Clear out the newly unused readings
487 foreach my $r ( $self->readings ) {
488 unless( $self->reading_witnesses( $r ) ) {
489 $self->del_reading( $r );
490 }
491 }
492}
3a2ebbf4 493
494sub add_relationship {
495 my $self = shift;
22222af9 496 my( $source, $target, $opts ) = $self->_stringify_args( @_ );
63778331 497 my( @vectors ) = $self->relations->add_relationship( $source,
64ae6270 498 $self->reading( $source ), $target, $self->reading( $target ), $opts );
c1915ab9 499 $self->_graphcalc_done(0);
63778331 500 return @vectors;
22222af9 501}
ef9d481f 502
ca6e6095 503around qw/ get_relationship del_relationship / => sub {
504 my $orig = shift;
505 my $self = shift;
506 my @args = @_;
507 if( @args == 1 && ref( $args[0] ) eq 'ARRAY' ) {
508 @args = @{$_[0]};
509 }
510 my( $source, $target ) = $self->_stringify_args( @args );
511 $self->$orig( $source, $target );
512};
513
22222af9 514=head2 reading_witnesses( $reading )
910a0a6d 515
22222af9 516Return a list of sigils corresponding to the witnesses in which the reading appears.
3265b0ce 517
22222af9 518=cut
1d310495 519
1d310495 520sub reading_witnesses {
521 my( $self, $reading ) = @_;
522 # We need only check either the incoming or the outgoing edges; I have
96dc90ec 523 # arbitrarily chosen "incoming". Thus, special-case the start node.
524 if( $reading eq $self->start ) {
525 return map { $_->sigil } $self->tradition->witnesses;
526 }
1d310495 527 my %all_witnesses;
528 foreach my $e ( $self->sequence->edges_to( $reading ) ) {
529 my $wits = $self->sequence->get_edge_attributes( @$e );
530 @all_witnesses{ keys %$wits } = 1;
531 }
c12bb878 532 my $acstr = $self->ac_label;
533 foreach my $acwit ( grep { $_ =~ s/^(.*)\Q$acstr\E$/$1/ } keys %all_witnesses ) {
534 delete $all_witnesses{$acwit.$acstr} if exists $all_witnesses{$acwit};
535 }
1d310495 536 return keys %all_witnesses;
910a0a6d 537}
538
4e5a7b2c 539=head1 OUTPUT METHODS
8e1394aa 540
0ecb975c 541=head2 as_svg( \%options )
8e1394aa 542
0068967c 543Returns an SVG string that represents the graph, via as_dot and graphviz.
bfcbcecb 544See as_dot for a list of options. Must have GraphViz (dot) installed to run.
8e1394aa 545
546=cut
547
548sub as_svg {
0ecb975c 549 my( $self, $opts ) = @_;
bfcbcecb 550 throw( "Need GraphViz installed to output SVG" )
551 unless File::Which::which( 'dot' );
e247aad1 552 my $want_subgraph = exists $opts->{'from'} || exists $opts->{'to'};
c1915ab9 553 $self->calculate_ranks() unless $self->_graphcalc_done;
e247aad1 554 if( !$self->has_cached_svg || $opts->{'recalc'} || $want_subgraph ) {
b365fbae 555 my @cmd = qw/dot -Tsvg/;
556 my( $svg, $err );
557 my $dotfile = File::Temp->new();
558 ## USE FOR DEBUGGING
559 # $dotfile->unlink_on_destroy(0);
560 binmode $dotfile, ':utf8';
561 print $dotfile $self->as_dot( $opts );
562 push( @cmd, $dotfile->filename );
563 run( \@cmd, ">", binary(), \$svg );
e247aad1 564 $svg = decode_utf8( $svg );
565 $self->cached_svg( $svg ) unless $want_subgraph;
566 return $svg;
567 } else {
568 return $self->cached_svg;
b365fbae 569 }
8e1394aa 570}
571
b22576c6 572
0ecb975c 573=head2 as_dot( \%options )
b22576c6 574
0ecb975c 575Returns a string that is the collation graph expressed in dot
576(i.e. GraphViz) format. Options include:
b22576c6 577
0ecb975c 578=over 4
b22576c6 579
0ecb975c 580=item * from
b22576c6 581
0ecb975c 582=item * to
df6d9812 583
0ecb975c 584=item * color_common
585
586=back
df6d9812 587
588=cut
589
590sub as_dot {
0ecb975c 591 my( $self, $opts ) = @_;
592 my $startrank = $opts->{'from'} if $opts;
593 my $endrank = $opts->{'to'} if $opts;
594 my $color_common = $opts->{'color_common'} if $opts;
b365fbae 595 my $STRAIGHTENHACK = !$startrank && !$endrank && $self->end->rank
596 && $self->end->rank > 100;
597
b22576c6 598 # Check the arguments
599 if( $startrank ) {
600 return if $endrank && $startrank > $endrank;
601 return if $startrank > $self->end->rank;
602 }
603 if( defined $endrank ) {
604 return if $endrank < 0;
f1b3b33a 605 $endrank = undef if $endrank == $self->end->rank;
b22576c6 606 }
607
67da8d6c 608 my $graph_name = $self->tradition->name;
609 $graph_name =~ s/[^\w\s]//g;
610 $graph_name = join( '_', split( /\s+/, $graph_name ) );
f13b5582 611
612 my %graph_attrs = (
613 'rankdir' => 'LR',
614 'bgcolor' => 'none',
615 );
616 my %node_attrs = (
b8990398 617 'fontsize' => 14,
f13b5582 618 'fillcolor' => 'white',
619 'style' => 'filled',
620 'shape' => 'ellipse'
621 );
622 my %edge_attrs = (
623 'arrowhead' => 'open',
624 'color' => '#000000',
625 'fontcolor' => '#000000',
626 );
627
67da8d6c 628 my $dot = sprintf( "digraph %s {\n", $graph_name );
f13b5582 629 $dot .= "\tgraph " . _dot_attr_string( \%graph_attrs ) . ";\n";
630 $dot .= "\tnode " . _dot_attr_string( \%node_attrs ) . ";\n";
df6d9812 631
b22576c6 632 # Output substitute start/end readings if necessary
633 if( $startrank ) {
634 $dot .= "\t\"#SUBSTART#\" [ label=\"...\" ];\n";
635 }
636 if( $endrank ) {
637 $dot .= "\t\"#SUBEND#\" [ label=\"...\" ];\n";
638 }
b365fbae 639 if( $STRAIGHTENHACK ) {
640 ## HACK part 1
641 $dot .= "\tsubgraph { rank=same \"#START#\" \"#SILENT#\" }\n";
642 $dot .= "\t\"#SILENT#\" [ shape=diamond,color=white,penwidth=0,label=\"\" ];"
643 }
b22576c6 644 my %used; # Keep track of the readings that actually appear in the graph
30ddc24c 645 # Sort the readings by rank if we have ranks; this speeds layout.
646 my @all_readings = $self->end->has_rank
647 ? sort { $a->rank <=> $b->rank } $self->readings
648 : $self->readings;
4633f9e4 649 # TODO Refrain from outputting lacuna nodes - just grey out the edges.
30ddc24c 650 foreach my $reading ( @all_readings ) {
b22576c6 651 # Only output readings within our rank range.
652 next if $startrank && $reading->rank < $startrank;
653 next if $endrank && $reading->rank > $endrank;
654 $used{$reading->id} = 1;
910a0a6d 655 # Need not output nodes without separate labels
3a2ebbf4 656 next if $reading->id eq $reading->text;
d4b75f44 657 my $rattrs;
30f0df34 658 my $label = $reading->text;
629e27b0 659 $label .= '-' if $reading->join_next;
660 $label = "-$label" if $reading->join_prior;
8f9cab7b 661 $label =~ s/\"/\\\"/g;
d4b75f44 662 $rattrs->{'label'} = $label;
0ecb975c 663 $rattrs->{'fillcolor'} = '#b3f36d' if $reading->is_common && $color_common;
d4b75f44 664 $dot .= sprintf( "\t\"%s\" %s;\n", $reading->id, _dot_attr_string( $rattrs ) );
df6d9812 665 }
3a2ebbf4 666
30ddc24c 667 # Add the real edges. Need to weight one edge per rank jump, in a
668 # continuous line.
b365fbae 669 # my $weighted = $self->_add_edge_weights;
b22576c6 670 my @edges = $self->paths;
3bdec618 671 my( %substart, %subend );
b22576c6 672 foreach my $edge ( @edges ) {
673 # Do we need to output this edge?
508fd430 674 if( $used{$edge->[0]} && $used{$edge->[1]} ) {
027d819c 675 my $label = $self->_path_display_label( $self->path_witnesses( $edge ) );
f13b5582 676 my $variables = { %edge_attrs, 'label' => $label };
30ddc24c 677
b22576c6 678 # Account for the rank gap if necessary
30ddc24c 679 my $rank0 = $self->reading( $edge->[0] )->rank
680 if $self->reading( $edge->[0] )->has_rank;
681 my $rank1 = $self->reading( $edge->[1] )->rank
682 if $self->reading( $edge->[1] )->has_rank;
683 if( defined $rank0 && defined $rank1 && $rank1 - $rank0 > 1 ) {
684 $variables->{'minlen'} = $rank1 - $rank0;
685 }
686
687 # Add the calculated edge weights
b365fbae 688 # if( exists $weighted->{$edge->[0]}
e247aad1 689 # && $weighted->{$edge->[0]} eq $edge->[1] ) {
690 # # $variables->{'color'} = 'red';
691 # $variables->{'weight'} = 3.0;
692 # }
30ddc24c 693
508fd430 694 # EXPERIMENTAL: make edge width reflect no. of witnesses
695 my $extrawidth = scalar( $self->path_witnesses( $edge ) ) * 0.2;
696 $variables->{'penwidth'} = $extrawidth + 0.8; # gives 1 for a single wit
697
f13b5582 698 my $varopts = _dot_attr_string( $variables );
699 $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
700 $edge->[0], $edge->[1], $varopts );
3bdec618 701 } elsif( $used{$edge->[0]} ) {
702 $subend{$edge->[0]} = 1;
703 } elsif( $used{$edge->[1]} ) {
704 $substart{$edge->[1]} = 1;
b22576c6 705 }
df6d9812 706 }
3bdec618 707 # Add substitute start and end edges if necessary
708 foreach my $node ( keys %substart ) {
027d819c 709 my $witstr = $self->_path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) );
f13b5582 710 my $variables = { %edge_attrs, 'label' => $witstr };
711 my $varopts = _dot_attr_string( $variables );
712 $dot .= "\t\"#SUBSTART#\" -> \"$node\" $varopts;";
3bdec618 713 }
714 foreach my $node ( keys %subend ) {
027d819c 715 my $witstr = $self->_path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) );
f13b5582 716 my $variables = { %edge_attrs, 'label' => $witstr };
717 my $varopts = _dot_attr_string( $variables );
718 $dot .= "\t\"$node\" -> \"#SUBEND#\" $varopts;";
3bdec618 719 }
b365fbae 720 # HACK part 2
721 if( $STRAIGHTENHACK ) {
722 $dot .= "\t\"#END#\" -> \"#SILENT#\" [ color=white,penwidth=0 ];\n";
723 }
30ddc24c 724
df6d9812 725 $dot .= "}\n";
726 return $dot;
727}
728
f13b5582 729sub _dot_attr_string {
730 my( $hash ) = @_;
731 my @attrs;
732 foreach my $k ( sort keys %$hash ) {
733 my $v = $hash->{$k};
734 push( @attrs, $k.'="'.$v.'"' );
735 }
736 return( '[ ' . join( ', ', @attrs ) . ' ]' );
737}
738
30ddc24c 739sub _add_edge_weights {
740 my $self = shift;
741 # Walk the graph from START to END, choosing the successor node with
742 # the largest number of witness paths each time.
743 my $weighted = {};
744 my $curr = $self->start->id;
008fc8a6 745 my $ranked = $self->end->has_rank;
30ddc24c 746 while( $curr ne $self->end->id ) {
008fc8a6 747 my $rank = $ranked ? $self->reading( $curr )->rank : 0;
30ddc24c 748 my @succ = sort { $self->path_witnesses( $curr, $a )
749 <=> $self->path_witnesses( $curr, $b ) }
750 $self->sequence->successors( $curr );
751 my $next = pop @succ;
008fc8a6 752 my $nextrank = $ranked ? $self->reading( $next )->rank : 0;
30ddc24c 753 # Try to avoid lacunae in the weighted path.
008fc8a6 754 while( @succ &&
755 ( $self->reading( $next )->is_lacuna ||
756 $nextrank - $rank > 1 ) ){
30ddc24c 757 $next = pop @succ;
758 }
759 $weighted->{$curr} = $next;
760 $curr = $next;
761 }
762 return $weighted;
763}
764
027d819c 765=head2 path_witnesses( $edge )
766
767Returns the list of sigils whose witnesses are associated with the given edge.
768The edge can be passed as either an array or an arrayref of ( $source, $target ).
769
770=cut
771
3a2ebbf4 772sub path_witnesses {
773 my( $self, @edge ) = @_;
774 # If edge is an arrayref, cope.
775 if( @edge == 1 && ref( $edge[0] ) eq 'ARRAY' ) {
776 my $e = shift @edge;
777 @edge = @$e;
778 }
779 my @wits = keys %{$self->sequence->get_edge_attributes( @edge )};
508fd430 780 return @wits;
3a2ebbf4 781}
782
027d819c 783sub _path_display_label {
508fd430 784 my $self = shift;
785 my @wits = sort @_;
8f9cab7b 786 my $maj = scalar( $self->tradition->witnesses ) * 0.6;
787 if( scalar @wits > $maj ) {
f13b5582 788 # TODO break out a.c. wits
8f9cab7b 789 return 'majority';
790 } else {
791 return join( ', ', @wits );
792 }
793}
1dd07bda 794
795=head2 witnesses_at_rank
796
797Returns a list of witnesses that are not lacunose, for a given rank.
798
799=cut
800
801sub witnesses_at_rank {
802 my( $self, $rank ) = @_;
803}
8f9cab7b 804
4e5a7b2c 805=head2 as_graphml
8e1394aa 806
4e5a7b2c 807Returns a GraphML representation of the collation. The GraphML will contain
808two graphs. The first expresses the attributes of the readings and the witness
809paths that link them; the second expresses the relationships that link the
810readings. This is the native transfer format for a tradition.
8e1394aa 811
56eefa04 812=begin testing
813
814use Text::Tradition;
815
816my $READINGS = 311;
817my $PATHS = 361;
818
819my $datafile = 't/data/florilegium_tei_ps.xml';
820my $tradition = Text::Tradition->new( 'input' => 'TEI',
821 'name' => 'test0',
822 'file' => $datafile,
823 'linear' => 1 );
824
825ok( $tradition, "Got a tradition object" );
826is( scalar $tradition->witnesses, 13, "Found all witnesses" );
827ok( $tradition->collation, "Tradition has a collation" );
828
829my $c = $tradition->collation;
830is( scalar $c->readings, $READINGS, "Collation has all readings" );
831is( scalar $c->paths, $PATHS, "Collation has all paths" );
832is( scalar $c->relationships, 0, "Collation has all relationships" );
833
834# Add a few relationships
835$c->add_relationship( 'w123', 'w125', { 'type' => 'collated' } );
836$c->add_relationship( 'w193', 'w196', { 'type' => 'collated' } );
837$c->add_relationship( 'w257', 'w262', { 'type' => 'transposition' } );
838
839# Now write it to GraphML and parse it again.
840
841my $graphml = $c->as_graphml;
842my $st = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml );
843is( scalar $st->collation->readings, $READINGS, "Reparsed collation has all readings" );
844is( scalar $st->collation->paths, $PATHS, "Reparsed collation has all paths" );
845is( scalar $st->collation->relationships, 3, "Reparsed collation has new relationships" );
846
847=end testing
848
8e1394aa 849=cut
850
851sub as_graphml {
3a2ebbf4 852 my( $self ) = @_;
3d14b48e 853 $self->calculate_ranks unless $self->_graphcalc_done;
854
8e1394aa 855 # Some namespaces
856 my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns';
857 my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance';
858 my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' .
910a0a6d 859 'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
8e1394aa 860
861 # Create the document and root node
862 my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
863 my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
864 $graphml->setDocumentElement( $root );
865 $root->setNamespace( $xsi_ns, 'xsi', 0 );
866 $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
bbd064a9 867
868 # List of attribute types to save on our objects and their corresponding
869 # GraphML types
870 my %save_types = (
871 'Str' => 'string',
872 'Int' => 'int',
873 'Bool' => 'boolean',
874 'RelationshipType' => 'string',
875 'RelationshipScope' => 'string',
876 );
877
878 # List of attribute names *not* to save on our objects.
879 # We will also not save any attribute beginning with _.
880 my %skipsave;
881 map { $skipsave{$_} = 1 } qw/ cached_svg /;
8e1394aa 882
bbd064a9 883 # Add the data keys for the graph. Include an extra key 'version' for the
884 # GraphML output version.
e309421a 885 my %graph_data_keys;
886 my $gdi = 0;
bbd064a9 887 my %graph_attributes = ( 'version' => 'string' );
888 # Graph attributes include those of Tradition and those of Collation.
889 my %gattr_from;
890 my $tmeta = $self->tradition->meta;
891 my $cmeta = $self->meta;
892 map { $gattr_from{$_->name} = 'Tradition' } $tmeta->get_all_attributes;
893 map { $gattr_from{$_->name} = 'Collation' } $cmeta->get_all_attributes;
894 foreach my $attr ( ( $tmeta->get_all_attributes, $cmeta->get_all_attributes ) ) {
895 next if $attr->name =~ /^_/;
896 next if $skipsave{$attr->name};
897 next unless $save_types{$attr->type_constraint->name};
898 $graph_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
899 }
900
901 foreach my $datum ( sort keys %graph_attributes ) {
e309421a 902 $graph_data_keys{$datum} = 'dg'.$gdi++;
903 my $key = $root->addNewChild( $graphml_ns, 'key' );
904 $key->setAttribute( 'attr.name', $datum );
bbd064a9 905 $key->setAttribute( 'attr.type', $graph_attributes{$datum} );
e309421a 906 $key->setAttribute( 'for', 'graph' );
907 $key->setAttribute( 'id', $graph_data_keys{$datum} );
908 }
f6066bac 909
bbd064a9 910 # Add the data keys for reading nodes
911 my %reading_attributes;
912 my $rmeta = Text::Tradition::Collation::Reading->meta;
913 foreach my $attr( $rmeta->get_all_attributes ) {
914 next if $attr->name =~ /^_/;
915 next if $skipsave{$attr->name};
916 next unless $save_types{$attr->type_constraint->name};
917 $reading_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
918 }
ef9d481f 919 my %node_data_keys;
920 my $ndi = 0;
bbd064a9 921 foreach my $datum ( sort keys %reading_attributes ) {
910a0a6d 922 $node_data_keys{$datum} = 'dn'.$ndi++;
923 my $key = $root->addNewChild( $graphml_ns, 'key' );
924 $key->setAttribute( 'attr.name', $datum );
bbd064a9 925 $key->setAttribute( 'attr.type', $reading_attributes{$datum} );
910a0a6d 926 $key->setAttribute( 'for', 'node' );
927 $key->setAttribute( 'id', $node_data_keys{$datum} );
8e1394aa 928 }
929
bbd064a9 930 # Add the data keys for edges, that is, paths and relationships. Path
931 # data does not come from a Moose class so is here manually.
ef9d481f 932 my $edi = 0;
933 my %edge_data_keys;
bbd064a9 934 my %edge_attributes = (
3a2ebbf4 935 witness => 'string', # ID/label for a path
3a2ebbf4 936 extra => 'boolean', # Path key
3a2ebbf4 937 );
bbd064a9 938 my @path_attributes = keys %edge_attributes; # track our manual additions
939 my $pmeta = Text::Tradition::Collation::Relationship->meta;
940 foreach my $attr( $pmeta->get_all_attributes ) {
941 next if $attr->name =~ /^_/;
942 next if $skipsave{$attr->name};
943 next unless $save_types{$attr->type_constraint->name};
944 $edge_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
945 }
946 foreach my $datum ( sort keys %edge_attributes ) {
3a2ebbf4 947 $edge_data_keys{$datum} = 'de'.$edi++;
910a0a6d 948 my $key = $root->addNewChild( $graphml_ns, 'key' );
3a2ebbf4 949 $key->setAttribute( 'attr.name', $datum );
bbd064a9 950 $key->setAttribute( 'attr.type', $edge_attributes{$datum} );
910a0a6d 951 $key->setAttribute( 'for', 'edge' );
3a2ebbf4 952 $key->setAttribute( 'id', $edge_data_keys{$datum} );
8e1394aa 953 }
3a2ebbf4 954
22222af9 955 # Add the collation graph itself
2c669bca 956 my $sgraph = $root->addNewChild( $graphml_ns, 'graph' );
957 $sgraph->setAttribute( 'edgedefault', 'directed' );
958 $sgraph->setAttribute( 'id', $self->tradition->name );
959 $sgraph->setAttribute( 'parse.edgeids', 'canonical' );
960 $sgraph->setAttribute( 'parse.edges', scalar($self->paths) );
961 $sgraph->setAttribute( 'parse.nodeids', 'canonical' );
962 $sgraph->setAttribute( 'parse.nodes', scalar($self->readings) );
963 $sgraph->setAttribute( 'parse.order', 'nodesfirst' );
22222af9 964
e309421a 965 # Collation attribute data
bbd064a9 966 foreach my $datum ( keys %graph_attributes ) {
967 my $value;
968 if( $datum eq 'version' ) {
969 $value = '3.1';
970 } elsif( $gattr_from{$datum} eq 'Tradition' ) {
971 $value = $self->tradition->$datum;
972 } else {
973 $value = $self->$datum;
974 }
2c669bca 975 _add_graphml_data( $sgraph, $graph_data_keys{$datum}, $value );
e309421a 976 }
8e1394aa 977
978 my $node_ctr = 0;
979 my %node_hash;
22222af9 980 # Add our readings to the graph
3a2ebbf4 981 foreach my $n ( sort { $a->id cmp $b->id } $self->readings ) {
2c669bca 982 # Add to the main graph
983 my $node_el = $sgraph->addNewChild( $graphml_ns, 'node' );
910a0a6d 984 my $node_xmlid = 'n' . $node_ctr++;
3a2ebbf4 985 $node_hash{ $n->id } = $node_xmlid;
910a0a6d 986 $node_el->setAttribute( 'id', $node_xmlid );
bbd064a9 987 foreach my $d ( keys %reading_attributes ) {
255875b8 988 my $nval = $n->$d;
989 _add_graphml_data( $node_el, $node_data_keys{$d}, $nval )
990 if defined $nval;
991 }
b15511bf 992 }
993
2c669bca 994 # Add the path edges to the sequence graph
df6d9812 995 my $edge_ctr = 0;
3a2ebbf4 996 foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->sequence->edges() ) {
997 # We add an edge in the graphml for every witness in $e.
508fd430 998 foreach my $wit ( sort $self->path_witnesses( $e ) ) {
3a2ebbf4 999 my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
1000 $node_hash{ $e->[0] },
1001 $node_hash{ $e->[1] } );
2c669bca 1002 my $edge_el = $sgraph->addNewChild( $graphml_ns, 'edge' );
3a2ebbf4 1003 $edge_el->setAttribute( 'source', $from );
1004 $edge_el->setAttribute( 'target', $to );
1005 $edge_el->setAttribute( 'id', $id );
3a2ebbf4 1006
1007 # It's a witness path, so add the witness
1008 my $base = $wit;
1009 my $key = $edge_data_keys{'witness'};
1010 # Is this an ante-corr witness?
1011 my $aclabel = $self->ac_label;
1012 if( $wit =~ /^(.*)\Q$aclabel\E$/ ) {
1013 # Keep the base witness
1014 $base = $1;
1015 # ...and record that this is an 'extra' reading path
1016 _add_graphml_data( $edge_el, $edge_data_keys{'extra'}, $aclabel );
1017 }
1018 _add_graphml_data( $edge_el, $edge_data_keys{'witness'}, $base );
1019 }
1020 }
1021
22222af9 1022 # Add the relationship graph to the XML
bbd064a9 1023 map { delete $edge_data_keys{$_} } @path_attributes;
027d819c 1024 $self->relations->_as_graphml( $graphml_ns, $root, \%node_hash,
2626f709 1025 $node_data_keys{'id'}, \%edge_data_keys );
8e1394aa 1026
94c00c71 1027 # Save and return the thing
1028 my $result = decode_utf8( $graphml->toString(1) );
94c00c71 1029 return $result;
df6d9812 1030}
1031
b15511bf 1032sub _add_graphml_data {
1033 my( $el, $key, $value ) = @_;
b15511bf 1034 return unless defined $value;
c9bf3dbf 1035 my $data_el = $el->addNewChild( $el->namespaceURI, 'data' );
b15511bf 1036 $data_el->setAttribute( 'key', $key );
1037 $data_el->appendText( $value );
8e1394aa 1038}
1039
4e5a7b2c 1040=head2 as_csv
910a0a6d 1041
1042Returns a CSV alignment table representation of the collation graph, one
2c669bca 1043row per witness (or witness uncorrected.)
910a0a6d 1044
1045=cut
1046
1047sub as_csv {
3a2ebbf4 1048 my( $self ) = @_;
1dd07bda 1049 my $table = $self->alignment_table;
910a0a6d 1050 my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } );
1051 my @result;
2c669bca 1052 # Make the header row
1053 $csv->combine( map { $_->{'witness'} } @{$table->{'alignment'}} );
1054 push( @result, decode_utf8( $csv->string ) );
1055 # Make the rest of the rows
1056 foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
566f4595 1057 my @rowobjs = map { $_->{'tokens'}->[$idx] } @{$table->{'alignment'}};
1dd07bda 1058 my @row = map { $_ ? $_->{'t'}->text : $_ } @rowobjs;
2c669bca 1059 $csv->combine( @row );
910a0a6d 1060 push( @result, decode_utf8( $csv->string ) );
1061 }
3a2ebbf4 1062 return join( "\n", @result );
910a0a6d 1063}
1064
1dd07bda 1065=head2 alignment_table( $use_refs, $include_witnesses )
2c669bca 1066
566f4595 1067Return a reference to an alignment table, in a slightly enhanced CollateX
1068format which looks like this:
1069
1070 $table = { alignment => [ { witness => "SIGIL",
4e5a7b2c 1071 tokens => [ { t => "TEXT" }, ... ] },
566f4595 1072 { witness => "SIG2",
4e5a7b2c 1073 tokens => [ { t => "TEXT" }, ... ] },
566f4595 1074 ... ],
1075 length => TEXTLEN };
1076
1077If $use_refs is set to 1, the reading object is returned in the table
1078instead of READINGTEXT; if not, the text of the reading is returned.
4e5a7b2c 1079
1080If $include_witnesses is set to a hashref, only the witnesses whose sigil
566f4595 1081keys have a true hash value will be included.
2c669bca 1082
1083=cut
9f3ba6f7 1084
1dd07bda 1085sub alignment_table {
1086 my( $self ) = @_;
c1915ab9 1087 $self->calculate_ranks() unless $self->_graphcalc_done;
1dd07bda 1088 return $self->cached_table if $self->has_cached_table;
1089
0ecb975c 1090 # Make sure we can do this
1091 throw( "Need a linear graph in order to make an alignment table" )
1092 unless $self->linear;
1093 $self->calculate_ranks unless $self->end->has_rank;
1094
2c669bca 1095 my $table = { 'alignment' => [], 'length' => $self->end->rank - 1 };
3a2ebbf4 1096 my @all_pos = ( 1 .. $self->end->rank - 1 );
68454b71 1097 foreach my $wit ( sort { $a->sigil cmp $b->sigil } $self->tradition->witnesses ) {
eca16057 1098 # print STDERR "Making witness row(s) for " . $wit->sigil . "\n";
1f7aa795 1099 my @wit_path = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
1dd07bda 1100 my @row = _make_witness_row( \@wit_path, \@all_pos );
2c669bca 1101 push( @{$table->{'alignment'}},
1102 { 'witness' => $wit->sigil, 'tokens' => \@row } );
1f7aa795 1103 if( $wit->is_layered ) {
1104 my @wit_ac_path = $self->reading_sequence( $self->start, $self->end,
861c3e27 1105 $wit->sigil.$self->ac_label );
1dd07bda 1106 my @ac_row = _make_witness_row( \@wit_ac_path, \@all_pos );
2c669bca 1107 push( @{$table->{'alignment'}},
1108 { 'witness' => $wit->sigil.$self->ac_label, 'tokens' => \@ac_row } );
910a0a6d 1109 }
1110 }
1dd07bda 1111 $self->cached_table( $table );
1112 return $table;
910a0a6d 1113}
1114
1115sub _make_witness_row {
1dd07bda 1116 my( $path, $positions ) = @_;
910a0a6d 1117 my %char_hash;
1118 map { $char_hash{$_} = undef } @$positions;
2c669bca 1119 my $debug = 0;
910a0a6d 1120 foreach my $rdg ( @$path ) {
eca16057 1121 my $rtext = $rdg->text;
1122 $rtext = '#LACUNA#' if $rdg->is_lacuna;
2c669bca 1123 print STDERR "rank " . $rdg->rank . "\n" if $debug;
3a2ebbf4 1124 # print STDERR "No rank for " . $rdg->id . "\n" unless defined $rdg->rank;
1dd07bda 1125 $char_hash{$rdg->rank} = { 't' => $rdg };
910a0a6d 1126 }
1127 my @row = map { $char_hash{$_} } @$positions;
eca16057 1128 # Fill in lacuna markers for undef spots in the row
1129 my $last_el = shift @row;
1130 my @filled_row = ( $last_el );
1131 foreach my $el ( @row ) {
0e476982 1132 # If we are using node reference, make the lacuna node appear many times
1133 # in the table. If not, use the lacuna tag.
1dd07bda 1134 if( $last_el && $last_el->{'t'}->is_lacuna && !defined $el ) {
1135 $el = $last_el;
eca16057 1136 }
1137 push( @filled_row, $el );
1138 $last_el = $el;
1139 }
1140 return @filled_row;
910a0a6d 1141}
1142
4e5a7b2c 1143=head1 NAVIGATION METHODS
910a0a6d 1144
4e5a7b2c 1145=head2 reading_sequence( $first, $last, $sigil, $backup )
e2902068 1146
1147Returns the ordered list of readings, starting with $first and ending
4e5a7b2c 1148with $last, for the witness given in $sigil. If a $backup sigil is
1149specified (e.g. when walking a layered witness), it will be used wherever
1150no $sigil path exists. If there is a base text reading, that will be
1151used wherever no path exists for $sigil or $backup.
e2902068 1152
1153=cut
1154
910a0a6d 1155# TODO Think about returning some lazy-eval iterator.
b0b4421a 1156# TODO Get rid of backup; we should know from what witness is whether we need it.
910a0a6d 1157
e2902068 1158sub reading_sequence {
861c3e27 1159 my( $self, $start, $end, $witness ) = @_;
e2902068 1160
930ff666 1161 $witness = $self->baselabel unless $witness;
e2902068 1162 my @readings = ( $start );
1163 my %seen;
1164 my $n = $start;
3a2ebbf4 1165 while( $n && $n->id ne $end->id ) {
1166 if( exists( $seen{$n->id} ) ) {
63778331 1167 throw( "Detected loop for $witness at " . $n->id );
910a0a6d 1168 }
3a2ebbf4 1169 $seen{$n->id} = 1;
910a0a6d 1170
861c3e27 1171 my $next = $self->next_reading( $n, $witness );
44771cf2 1172 unless( $next ) {
63778331 1173 throw( "Did not find any path for $witness from reading " . $n->id );
44771cf2 1174 }
910a0a6d 1175 push( @readings, $next );
1176 $n = $next;
e2902068 1177 }
1178 # Check that the last reading is our end reading.
1179 my $last = $readings[$#readings];
63778331 1180 throw( "Last reading found from " . $start->text .
1181 " for witness $witness is not the end!" ) # TODO do we get this far?
3a2ebbf4 1182 unless $last->id eq $end->id;
e2902068 1183
1184 return @readings;
1185}
1186
4e5a7b2c 1187=head2 next_reading( $reading, $sigil );
8e1394aa 1188
4a8828f0 1189Returns the reading that follows the given reading along the given witness
930ff666 1190path.
8e1394aa 1191
1192=cut
1193
4a8828f0 1194sub next_reading {
e2902068 1195 # Return the successor via the corresponding path.
8e1394aa 1196 my $self = shift;
3a2ebbf4 1197 my $answer = $self->_find_linked_reading( 'next', @_ );
2c669bca 1198 return undef unless $answer;
3a2ebbf4 1199 return $self->reading( $answer );
8e1394aa 1200}
1201
4e5a7b2c 1202=head2 prior_reading( $reading, $sigil )
8e1394aa 1203
4a8828f0 1204Returns the reading that precedes the given reading along the given witness
930ff666 1205path.
8e1394aa 1206
1207=cut
1208
4a8828f0 1209sub prior_reading {
e2902068 1210 # Return the predecessor via the corresponding path.
8e1394aa 1211 my $self = shift;
3a2ebbf4 1212 my $answer = $self->_find_linked_reading( 'prior', @_ );
1213 return $self->reading( $answer );
8e1394aa 1214}
1215
4a8828f0 1216sub _find_linked_reading {
861c3e27 1217 my( $self, $direction, $node, $path ) = @_;
1218
1219 # Get a backup if we are dealing with a layered witness
1220 my $alt_path;
1221 my $aclabel = $self->ac_label;
1222 if( $path && $path =~ /^(.*)\Q$aclabel\E$/ ) {
1223 $alt_path = $1;
1224 }
1225
e2902068 1226 my @linked_paths = $direction eq 'next'
3a2ebbf4 1227 ? $self->sequence->edges_from( $node )
1228 : $self->sequence->edges_to( $node );
e2902068 1229 return undef unless scalar( @linked_paths );
8e1394aa 1230
e2902068 1231 # We have to find the linked path that contains all of the
1232 # witnesses supplied in $path.
1233 my( @path_wits, @alt_path_wits );
4e5a7b2c 1234 @path_wits = sort( $self->_witnesses_of_label( $path ) ) if $path;
1235 @alt_path_wits = sort( $self->_witnesses_of_label( $alt_path ) ) if $alt_path;
e2902068 1236 my $base_le;
1237 my $alt_le;
1238 foreach my $le ( @linked_paths ) {
3a2ebbf4 1239 if( $self->sequence->has_edge_attribute( @$le, $self->baselabel ) ) {
910a0a6d 1240 $base_le = $le;
910a0a6d 1241 }
508fd430 1242 my @le_wits = sort $self->path_witnesses( $le );
3a2ebbf4 1243 if( _is_within( \@path_wits, \@le_wits ) ) {
1244 # This is the right path.
1245 return $direction eq 'next' ? $le->[1] : $le->[0];
1246 } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) {
1247 $alt_le = $le;
1248 }
8e1394aa 1249 }
e2902068 1250 # Got this far? Return the alternate path if it exists.
3a2ebbf4 1251 return $direction eq 'next' ? $alt_le->[1] : $alt_le->[0]
910a0a6d 1252 if $alt_le;
e2902068 1253
1254 # Got this far? Return the base path if it exists.
3a2ebbf4 1255 return $direction eq 'next' ? $base_le->[1] : $base_le->[0]
910a0a6d 1256 if $base_le;
e2902068 1257
1258 # Got this far? We have no appropriate path.
2c669bca 1259 warn "Could not find $direction node from " . $node->id
910a0a6d 1260 . " along path $path";
8e1394aa 1261 return undef;
1262}
1263
4a8828f0 1264# Some set logic.
1265sub _is_within {
1266 my( $set1, $set2 ) = @_;
7854e12e 1267 my $ret = @$set1; # will be 0, i.e. false, if set1 is empty
4a8828f0 1268 foreach my $el ( @$set1 ) {
910a0a6d 1269 $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
4a8828f0 1270 }
1271 return $ret;
1272}
1273
4e5a7b2c 1274# Return the string that joins together a list of witnesses for
1275# display on a single path.
1276sub _witnesses_of_label {
1277 my( $self, $label ) = @_;
1278 my $regex = $self->wit_list_separator;
1279 my @answer = split( /\Q$regex\E/, $label );
1280 return @answer;
b0b4421a 1281}
1282
d4b75f44 1283=head2 common_readings
1284
1285Returns the list of common readings in the graph (i.e. those readings that are
1286shared by all non-lacunose witnesses.)
1287
1288=cut
1289
1290sub common_readings {
1291 my $self = shift;
1292 my @common = grep { $_->is_common } $self->readings;
1293 return @common;
1294}
1295
b0b4421a 1296=head2 path_text( $sigil, $mainsigil [, $start, $end ] )
1297
1298Returns the text of a witness (plus its backup, if we are using a layer)
1299as stored in the collation. The text is returned as a string, where the
1300individual readings are joined with spaces and the meta-readings (e.g.
1301lacunae) are omitted. Optional specification of $start and $end allows
1302the generation of a subset of the witness text.
4e5a7b2c 1303
b0b4421a 1304=cut
1305
1306sub path_text {
861c3e27 1307 my( $self, $wit, $start, $end ) = @_;
b0b4421a 1308 $start = $self->start unless $start;
1309 $end = $self->end unless $end;
861c3e27 1310 my @path = grep { !$_->is_meta } $self->reading_sequence( $start, $end, $wit );
629e27b0 1311 my $pathtext = '';
1312 my $last;
1313 foreach my $r ( @path ) {
1314 if( $r->join_prior || !$last || $last->join_next ) {
1315 $pathtext .= $r->text;
1316 } else {
1317 $pathtext .= ' ' . $r->text;
1318 }
1319 $last = $r;
1320 }
1321 return $pathtext;
b0b4421a 1322}
4e5a7b2c 1323
1324=head1 INITIALIZATION METHODS
1325
1326These are mostly for use by parsers.
1327
1328=head2 make_witness_path( $witness )
1329
1330Link the array of readings contained in $witness->path (and in
1331$witness->uncorrected_path if it exists) into collation paths.
1332Clear out the arrays when finished.
de51424a 1333
4e5a7b2c 1334=head2 make_witness_paths
1335
1336Call make_witness_path for all witnesses in the tradition.
1337
1338=cut
930ff666 1339
7e450e44 1340# For use when a collation is constructed from a base text and an apparatus.
1341# We have the sequences of readings and just need to add path edges.
1f7aa795 1342# When we are done, clear out the witness path attributes, as they are no
1343# longer needed.
1344# TODO Find a way to replace the witness path attributes with encapsulated functions?
e2902068 1345
6a222840 1346sub make_witness_paths {
1347 my( $self ) = @_;
910a0a6d 1348 foreach my $wit ( $self->tradition->witnesses ) {
0068967c 1349 # print STDERR "Making path for " . $wit->sigil . "\n";
910a0a6d 1350 $self->make_witness_path( $wit );
7854e12e 1351 }
7854e12e 1352}
1353
6a222840 1354sub make_witness_path {
7854e12e 1355 my( $self, $wit ) = @_;
1356 my @chain = @{$wit->path};
15d2d3df 1357 my $sig = $wit->sigil;
7854e12e 1358 foreach my $idx ( 0 .. $#chain-1 ) {
910a0a6d 1359 $self->add_path( $chain[$idx], $chain[$idx+1], $sig );
7854e12e 1360 }
1f7aa795 1361 if( $wit->is_layered ) {
d9e873d0 1362 @chain = @{$wit->uncorrected_path};
1363 foreach my $idx( 0 .. $#chain-1 ) {
1364 my $source = $chain[$idx];
1365 my $target = $chain[$idx+1];
1366 $self->add_path( $source, $target, $sig.$self->ac_label )
1367 unless $self->has_path( $source, $target, $sig );
1368 }
15d2d3df 1369 }
1f7aa795 1370 $wit->clear_path;
1371 $wit->clear_uncorrected_path;
e2902068 1372}
1373
4e5a7b2c 1374=head2 calculate_ranks
1375
1376Calculate the reading ranks (that is, their aligned positions relative
1377to each other) for the graph. This can only be called on linear collations.
1378
b365fbae 1379=begin testing
1380
1381use Text::Tradition;
1382
1383my $cxfile = 't/data/Collatex-16.xml';
1384my $t = Text::Tradition->new(
1385 'name' => 'inline',
1386 'input' => 'CollateX',
1387 'file' => $cxfile,
1388 );
1389my $c = $t->collation;
1390
1391# Make an svg
bfcbcecb 1392my $table = $c->alignment_table;
1393ok( $c->has_cached_table, "Alignment table was cached" );
1394is( $c->alignment_table, $table, "Cached table returned upon second call" );
b365fbae 1395$c->calculate_ranks;
bfcbcecb 1396is( $c->alignment_table, $table, "Cached table retained with no rank change" );
b365fbae 1397$c->add_relationship( 'n9', 'n23', { 'type' => 'spelling' } );
bfcbcecb 1398isnt( $c->alignment_table, $table, "Alignment table changed after relationship add" );
b365fbae 1399
1400=end testing
1401
4e5a7b2c 1402=cut
1403
910a0a6d 1404sub calculate_ranks {
1405 my $self = shift;
b365fbae 1406 # Save the existing ranks, in case we need to invalidate the cached SVG.
1407 my %existing_ranks;
910a0a6d 1408 # Walk a version of the graph where every node linked by a relationship
1409 # edge is fundamentally the same node, and do a topological ranking on
1410 # the nodes in this graph.
c9bf3dbf 1411 my $topo_graph = Graph->new();
910a0a6d 1412 my %rel_containers;
1413 my $rel_ctr = 0;
1414 # Add the nodes
1415 foreach my $r ( $self->readings ) {
3a2ebbf4 1416 next if exists $rel_containers{$r->id};
910a0a6d 1417 my @rels = $r->related_readings( 'colocated' );
1418 if( @rels ) {
1419 # Make a relationship container.
1420 push( @rels, $r );
c9bf3dbf 1421 my $rn = 'rel_container_' . $rel_ctr++;
1422 $topo_graph->add_vertex( $rn );
910a0a6d 1423 foreach( @rels ) {
3a2ebbf4 1424 $rel_containers{$_->id} = $rn;
910a0a6d 1425 }
1426 } else {
1427 # Add a new node to mirror the old node.
3a2ebbf4 1428 $rel_containers{$r->id} = $r->id;
1429 $topo_graph->add_vertex( $r->id );
910a0a6d 1430 }
4a8828f0 1431 }
3a1f2523 1432
3a2ebbf4 1433 # Add the edges.
910a0a6d 1434 foreach my $r ( $self->readings ) {
b365fbae 1435 $existing_ranks{$r} = $r->rank;
3a2ebbf4 1436 foreach my $n ( $self->sequence->successors( $r->id ) ) {
1437 my( $tfrom, $tto ) = ( $rel_containers{$r->id},
1438 $rel_containers{$n} );
4e5a7b2c 1439 # $DB::single = 1 unless $tfrom && $tto;
3a2ebbf4 1440 $topo_graph->add_edge( $tfrom, $tto );
910a0a6d 1441 }
1442 }
1443
1444 # Now do the rankings, starting with the start node.
3a2ebbf4 1445 my $topo_start = $rel_containers{$self->start->id};
c9bf3dbf 1446 my $node_ranks = { $topo_start => 0 };
910a0a6d 1447 my @curr_origin = ( $topo_start );
1448 # A little iterative function.
1449 while( @curr_origin ) {
c9bf3dbf 1450 @curr_origin = _assign_rank( $topo_graph, $node_ranks, @curr_origin );
910a0a6d 1451 }
1452 # Transfer our rankings from the topological graph to the real one.
1453 foreach my $r ( $self->readings ) {
3a2ebbf4 1454 if( defined $node_ranks->{$rel_containers{$r->id}} ) {
1455 $r->rank( $node_ranks->{$rel_containers{$r->id}} );
67da8d6c 1456 } else {
63778331 1457 # Die. Find the last rank we calculated.
1458 my @all_defined = sort { $node_ranks->{$rel_containers{$a->id}}
1459 <=> $node_ranks->{$rel_containers{$b->id}} }
1460 $self->readings;
1461 my $last = pop @all_defined;
1462 throw( "Ranks not calculated after $last - do you have a cycle in the graph?" );
67da8d6c 1463 }
de51424a 1464 }
bfcbcecb 1465 # Do we need to invalidate the cached data?
1466 if( $self->has_cached_svg || $self->has_cached_table ) {
b365fbae 1467 foreach my $r ( $self->readings ) {
7c293912 1468 next if defined( $existing_ranks{$r} )
1469 && $existing_ranks{$r} == $r->rank;
c1915ab9 1470 # Something has changed, so clear the cache
bfcbcecb 1471 $self->_clear_cache;
c1915ab9 1472 # ...and recalculate the common readings.
1473 $self->calculate_common_readings();
b365fbae 1474 last;
1475 }
1476 }
c1915ab9 1477 # The graph calculation information is now up to date.
1478 $self->_graphcalc_done(1);
8e1394aa 1479}
3a1f2523 1480
910a0a6d 1481sub _assign_rank {
c9bf3dbf 1482 my( $graph, $node_ranks, @current_nodes ) = @_;
910a0a6d 1483 # Look at each of the children of @current_nodes. If all the child's
1484 # parents have a rank, assign it the highest rank + 1 and add it to
c9bf3dbf 1485 # @next_nodes. Otherwise skip it; we will return when the highest-ranked
1486 # parent gets a rank.
910a0a6d 1487 my @next_nodes;
1488 foreach my $c ( @current_nodes ) {
c9bf3dbf 1489 warn "Current reading $c has no rank!"
1490 unless exists $node_ranks->{$c};
1491 # print STDERR "Looking at child of node $c, rank "
1492 # . $node_ranks->{$c} . "\n";
1493 foreach my $child ( $graph->successors( $c ) ) {
1494 next if exists $node_ranks->{$child};
910a0a6d 1495 my $highest_rank = -1;
1496 my $skip = 0;
c9bf3dbf 1497 foreach my $parent ( $graph->predecessors( $child ) ) {
1498 if( exists $node_ranks->{$parent} ) {
1499 $highest_rank = $node_ranks->{$parent}
1500 if $highest_rank <= $node_ranks->{$parent};
910a0a6d 1501 } else {
1502 $skip = 1;
1503 last;
1504 }
1505 }
1506 next if $skip;
c9bf3dbf 1507 my $c_rank = $highest_rank + 1;
1508 # print STDERR "Assigning rank $c_rank to node $child \n";
1509 $node_ranks->{$child} = $c_rank;
910a0a6d 1510 push( @next_nodes, $child );
1511 }
1512 }
1513 return @next_nodes;
4cdd82f1 1514}
910a0a6d 1515
c1915ab9 1516sub _clear_cache {
1517 my $self = shift;
1518 $self->wipe_svg if $self->has_cached_svg;
1519 $self->wipe_table if $self->has_cached_table;
1520}
1521
1522
4e5a7b2c 1523=head2 flatten_ranks
1524
1525A convenience method for parsing collation data. Searches the graph for readings
1526with the same text at the same rank, and merges any that are found.
1527
1528=cut
1529
0e476982 1530sub flatten_ranks {
1531 my $self = shift;
1532 my %unique_rank_rdg;
1533 foreach my $rdg ( $self->readings ) {
1534 next unless $rdg->has_rank;
1535 my $key = $rdg->rank . "||" . $rdg->text;
1536 if( exists $unique_rank_rdg{$key} ) {
1537 # Combine!
56eefa04 1538 # print STDERR "Combining readings at same rank: $key\n";
0e476982 1539 $self->merge_readings( $unique_rank_rdg{$key}, $rdg );
0ecb975c 1540 # TODO see if this now makes a common point.
0e476982 1541 } else {
1542 $unique_rank_rdg{$key} = $rdg;
1543 }
1544 }
1545}
4633f9e4 1546
1547
d4b75f44 1548=head2 calculate_common_readings
1549
1550Goes through the graph identifying the readings that appear in every witness
1551(apart from those with lacunae at that spot.) Marks them as common and returns
1552the list.
1553
1554=begin testing
1555
1556use Text::Tradition;
1557
1558my $cxfile = 't/data/Collatex-16.xml';
1559my $t = Text::Tradition->new(
1560 'name' => 'inline',
1561 'input' => 'CollateX',
1562 'file' => $cxfile,
1563 );
1564my $c = $t->collation;
1565
1566my @common = $c->calculate_common_readings();
1567is( scalar @common, 8, "Found correct number of common readings" );
1568my @marked = sort $c->common_readings();
1569is( scalar @common, 8, "All common readings got marked as such" );
1570my @expected = qw/ n1 n12 n16 n19 n20 n5 n6 n7 /;
1571is_deeply( \@marked, \@expected, "Found correct list of common readings" );
1572
1573=end testing
1574
1575=cut
1576
1577sub calculate_common_readings {
1578 my $self = shift;
1579 my @common;
c1915ab9 1580 map { $_->is_common( 0 ) } $self->readings;
1581 # Implicitly calls calculate_ranks
1dd07bda 1582 my $table = $self->alignment_table;
d4b75f44 1583 foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
7f52eac8 1584 my @row = map { $_->{'tokens'}->[$idx]
1585 ? $_->{'tokens'}->[$idx]->{'t'} : '' }
1586 @{$table->{'alignment'}};
d4b75f44 1587 my %hash;
1588 foreach my $r ( @row ) {
1589 if( $r ) {
1590 $hash{$r->id} = $r unless $r->is_meta;
1591 } else {
1592 $hash{'UNDEF'} = $r;
1593 }
1594 }
1595 if( keys %hash == 1 && !exists $hash{'UNDEF'} ) {
1596 my( $r ) = values %hash;
1597 $r->is_common( 1 );
1598 push( @common, $r );
1599 }
1600 }
1601 return @common;
1602}
1603
861c3e27 1604=head2 text_from_paths
1605
1606Calculate the text array for all witnesses from the path, for later consistency
1607checking. Only to be used if there is no non-graph-based way to know the
1608original texts.
1609
1610=cut
1611
1612sub text_from_paths {
1613 my $self = shift;
1614 foreach my $wit ( $self->tradition->witnesses ) {
1615 my @text = split( /\s+/,
1616 $self->reading_sequence( $self->start, $self->end, $wit->sigil ) );
1617 $wit->text( \@text );
1618 if( $wit->is_layered ) {
1619 my @uctext = split( /\s+/,
1620 $self->reading_sequence( $self->start, $self->end,
1621 $wit->sigil.$self->ac_label ) );
1622 $wit->text( \@uctext );
1623 }
1624 }
1625}
0e476982 1626
4e5a7b2c 1627=head1 UTILITY FUNCTIONS
1628
1629=head2 common_predecessor( $reading_a, $reading_b )
8e1394aa 1630
4e5a7b2c 1631Find the last reading that occurs in sequence before both the given readings.
1632
1633=head2 common_successor( $reading_a, $reading_b )
1634
1635Find the first reading that occurs in sequence after both the given readings.
1636
22222af9 1637=begin testing
1638
1639use Text::Tradition;
1640
1641my $cxfile = 't/data/Collatex-16.xml';
1642my $t = Text::Tradition->new(
1643 'name' => 'inline',
1644 'input' => 'CollateX',
1645 'file' => $cxfile,
1646 );
1647my $c = $t->collation;
1648
4e5a7b2c 1649is( $c->common_predecessor( 'n9', 'n23' )->id,
22222af9 1650 'n20', "Found correct common predecessor" );
4e5a7b2c 1651is( $c->common_successor( 'n9', 'n23' )->id,
22222af9 1652 '#END#', "Found correct common successor" );
1653
4e5a7b2c 1654is( $c->common_predecessor( 'n19', 'n17' )->id,
22222af9 1655 'n16', "Found correct common predecessor for readings on same path" );
4e5a7b2c 1656is( $c->common_successor( 'n21', 'n26' )->id,
22222af9 1657 '#END#', "Found correct common successor for readings on same path" );
1658
1659=end testing
1660
1661=cut
1662
1663## Return the closest reading that is a predecessor of both the given readings.
1664sub common_predecessor {
1665 my $self = shift;
4e5a7b2c 1666 my( $r1, $r2 ) = $self->_objectify_args( @_ );
027d819c 1667 return $self->_common_in_path( $r1, $r2, 'predecessors' );
22222af9 1668}
1669
1670sub common_successor {
1671 my $self = shift;
4e5a7b2c 1672 my( $r1, $r2 ) = $self->_objectify_args( @_ );
027d819c 1673 return $self->_common_in_path( $r1, $r2, 'successors' );
22222af9 1674}
1675
027d819c 1676sub _common_in_path {
22222af9 1677 my( $self, $r1, $r2, $dir ) = @_;
1678 my $iter = $r1->rank > $r2->rank ? $r1->rank : $r2->rank;
1679 $iter = $self->end->rank - $iter if $dir eq 'successors';
1680 my @candidates;
1681 my @last_checked = ( $r1, $r2 );
1682 my %all_seen;
1683 while( !@candidates ) {
1684 my @new_lc;
1685 foreach my $lc ( @last_checked ) {
1686 foreach my $p ( $lc->$dir ) {
1687 if( $all_seen{$p->id} ) {
1688 push( @candidates, $p );
1689 } else {
1690 $all_seen{$p->id} = 1;
1691 push( @new_lc, $p );
1692 }
1693 }
1694 }
1695 @last_checked = @new_lc;
1696 }
1697 my @answer = sort { $a->rank <=> $b->rank } @candidates;
1698 return $dir eq 'predecessors' ? pop( @answer ) : shift ( @answer );
1699}
1700
63778331 1701sub throw {
1702 Text::Tradition::Error->throw(
1703 'ident' => 'Collation error',
1704 'message' => $_[0],
1705 );
1706}
1707
dd3b58b0 1708no Moose;
1709__PACKAGE__->meta->make_immutable;
e867486f 1710
027d819c 1711=head1 LICENSE
e867486f 1712
027d819c 1713This package is free software and is provided "as is" without express
1714or implied warranty. You can redistribute it and/or modify it under
1715the same terms as Perl itself.
e867486f 1716
027d819c 1717=head1 AUTHOR
e867486f 1718
027d819c 1719Tara L Andrews E<lt>aurum@cpan.orgE<gt>