only run the hack for larger graphs
[scpubgit/stemmatology.git] / lib / Text / Tradition / Collation.pm
CommitLineData
dd3b58b0 1package Text::Tradition::Collation;
d047cd52 2
910a0a6d 3use Encode qw( decode_utf8 );
4use File::Temp;
c9bf3dbf 5use Graph;
8e1394aa 6use IPC::Run qw( run binary );
910a0a6d 7use Text::CSV_XS;
b15511bf 8use Text::Tradition::Collation::Reading;
22222af9 9use Text::Tradition::Collation::RelationshipStore;
63778331 10use Text::Tradition::Error;
df6d9812 11use XML::LibXML;
a344ffcf 12use XML::LibXML::XPathContext;
dd3b58b0 13use Moose;
14
3a2ebbf4 15has 'sequence' => (
d047cd52 16 is => 'ro',
3a2ebbf4 17 isa => 'Graph',
18 default => sub { Graph->new() },
d047cd52 19 handles => {
3a2ebbf4 20 paths => 'edges',
d047cd52 21 },
d047cd52 22 );
3a2ebbf4 23
24has 'relations' => (
25 is => 'ro',
22222af9 26 isa => 'Text::Tradition::Collation::RelationshipStore',
27 handles => {
28 relationships => 'relationships',
29 related_readings => 'related_readings',
30 },
31 writer => '_set_relations',
3a2ebbf4 32 );
dd3b58b0 33
3a2ebbf4 34has 'tradition' => (
35 is => 'ro',
d047cd52 36 isa => 'Text::Tradition',
8d9a1cd8 37 weak_ref => 1,
d047cd52 38 );
dd3b58b0 39
3a2ebbf4 40has 'readings' => (
41 isa => 'HashRef[Text::Tradition::Collation::Reading]',
42 traits => ['Hash'],
43 handles => {
44 reading => 'get',
45 _add_reading => 'set',
46 del_reading => 'delete',
47 has_reading => 'exists',
48 readings => 'values',
49 },
50 default => sub { {} },
51 );
910a0a6d 52
4a8828f0 53has 'wit_list_separator' => (
7854e12e 54 is => 'rw',
55 isa => 'Str',
56 default => ', ',
57 );
58
59has 'baselabel' => (
60 is => 'rw',
61 isa => 'Str',
62 default => 'base text',
63 );
4a8828f0 64
15d2d3df 65has 'linear' => (
66 is => 'rw',
67 isa => 'Bool',
68 default => 1,
69 );
c84275ff 70
ef9d481f 71has 'ac_label' => (
72 is => 'rw',
73 isa => 'Str',
74 default => ' (a.c.)',
75 );
3a2ebbf4 76
77has 'start' => (
78 is => 'ro',
79 isa => 'Text::Tradition::Collation::Reading',
80 writer => '_set_start',
81 weak_ref => 1,
82 );
83
84has 'end' => (
85 is => 'ro',
86 isa => 'Text::Tradition::Collation::Reading',
87 writer => '_set_end',
88 weak_ref => 1,
89 );
1f563ac3 90
4e5a7b2c 91=head1 NAME
92
93Text::Tradition::Collation - a software model for a text collation
94
95=head1 SYNOPSIS
96
97 use Text::Tradition;
98 my $t = Text::Tradition->new(
99 'name' => 'this is a text',
100 'input' => 'TEI',
101 'file' => '/path/to/tei_parallel_seg_file.xml' );
102
103 my $c = $t->collation;
104 my @readings = $c->readings;
105 my @paths = $c->paths;
106 my @relationships = $c->relationships;
107
108 my $svg_variant_graph = $t->collation->as_svg();
109
110=head1 DESCRIPTION
111
112Text::Tradition is a library for representation and analysis of collated
113texts, particularly medieval ones. The Collation is the central feature of
114a Tradition, where the text, its sequence of readings, and its relationships
115between readings are actually kept.
116
117=head1 CONSTRUCTOR
118
119=head2 new
120
121The constructor. Takes a hash or hashref of the following arguments:
122
123=over
124
125=item * tradition - The Text::Tradition object to which the collation
126belongs. Required.
127
128=item * linear - Whether the collation should be linear; that is, whether
129transposed readings should be treated as two linked readings rather than one,
130and therefore whether the collation graph is acyclic. Defaults to true.
131
4e5a7b2c 132=item * baselabel - The default label for the path taken by a base text
133(if any). Defaults to 'base text'.
134
135=item * wit_list_separator - The string to join a list of witnesses for
136purposes of making labels in display graphs. Defaults to ', '.
137
138=item * ac_label - The extra label to tack onto a witness sigil when
139representing another layer of path for the given witness - that is, when
140a text has more than one possible reading due to scribal corrections or
141the like. Defaults to ' (a.c.)'.
142
143=back
144
145=head1 ACCESSORS
146
147=head2 tradition
148
149=head2 linear
150
4e5a7b2c 151=head2 wit_list_separator
152
153=head2 baselabel
154
155=head2 ac_label
156
157Simple accessors for collation attributes.
158
159=head2 start
160
161The meta-reading at the start of every witness path.
162
163=head2 end
164
165The meta-reading at the end of every witness path.
166
167=head2 readings
168
169Returns all Reading objects in the graph.
170
171=head2 reading( $id )
172
173Returns the Reading object corresponding to the given ID.
174
175=head2 add_reading( $reading_args )
176
177Adds a new reading object to the collation.
178See L<Text::Tradition::Collation::Reading> for the available arguments.
179
180=head2 del_reading( $object_or_id )
181
182Removes the given reading from the collation, implicitly removing its
183paths and relationships.
184
185=head2 merge_readings( $main, $second )
186
187Merges the $second reading into the $main one.
188The arguments may be either readings or reading IDs.
189
190=head2 has_reading( $id )
191
192Predicate to see whether a given reading ID is in the graph.
193
194=head2 reading_witnesses( $object_or_id )
195
196Returns a list of sigils whose witnesses contain the reading.
197
198=head2 paths
199
200Returns all reading paths within the document - that is, all edges in the
201collation graph. Each path is an arrayref of [ $source, $target ] reading IDs.
202
203=head2 add_path( $source, $target, $sigil )
204
205Links the given readings in the collation in sequence, under the given witness
206sigil. The readings may be specified by object or ID.
207
208=head2 del_path( $source, $target, $sigil )
209
210Links the given readings in the collation in sequence, under the given witness
211sigil. The readings may be specified by object or ID.
212
213=head2 has_path( $source, $target );
214
215Returns true if the two readings are linked in sequence in any witness.
216The readings may be specified by object or ID.
217
218=head2 relationships
219
220Returns all Relationship objects in the collation.
221
222=head2 add_relationship( $reading, $other_reading, $options )
223
224Adds a new relationship of the type given in $options between the two readings,
225which may be specified by object or ID. Returns a value of ( $status, @vectors)
226where $status is true on success, and @vectors is a list of relationship edges
227that were ultimately added.
228See L<Text::Tradition::Collation::Relationship> for the available options.
229
230=cut
dd3b58b0 231
d047cd52 232sub BUILD {
3a2ebbf4 233 my $self = shift;
22222af9 234 $self->_set_relations( Text::Tradition::Collation::RelationshipStore->new( 'collation' => $self ) );
3a2ebbf4 235 $self->_set_start( $self->add_reading( { 'collation' => $self, 'is_start' => 1 } ) );
236 $self->_set_end( $self->add_reading( { 'collation' => $self, 'is_end' => 1 } ) );
d047cd52 237}
784877d9 238
3a2ebbf4 239### Reading construct/destruct functions
240
241sub add_reading {
242 my( $self, $reading ) = @_;
243 unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
244 my %args = %$reading;
245 $reading = Text::Tradition::Collation::Reading->new(
246 'collation' => $self,
247 %args );
248 }
249 # First check to see if a reading with this ID exists.
250 if( $self->reading( $reading->id ) ) {
63778331 251 throw( "Collation already has a reading with id " . $reading->id );
3a2ebbf4 252 }
253 $self->_add_reading( $reading->id => $reading );
254 # Once the reading has been added, put it in both graphs.
255 $self->sequence->add_vertex( $reading->id );
22222af9 256 $self->relations->add_reading( $reading->id );
3a2ebbf4 257 return $reading;
eca16057 258};
259
3a2ebbf4 260around del_reading => sub {
261 my $orig = shift;
262 my $self = shift;
263 my $arg = shift;
264
265 if( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
266 $arg = $arg->id;
267 }
3a2ebbf4 268 # Remove the reading from the graphs.
269 $self->sequence->delete_vertex( $arg );
22222af9 270 $self->relations->delete_reading( $arg );
3a2ebbf4 271
272 # Carry on.
273 $self->$orig( $arg );
274};
7854e12e 275
3a2ebbf4 276# merge_readings( $main, $to_be_deleted );
7854e12e 277
3a2ebbf4 278sub merge_readings {
279 my $self = shift;
280
281 # We only need the IDs for adding paths to the graph, not the reading
282 # objects themselves.
49d4f2ac 283 my( $kept, $deleted, $combine_char ) = $self->_stringify_args( @_ );
3a2ebbf4 284
285 # The kept reading should inherit the paths and the relationships
286 # of the deleted reading.
287 foreach my $path ( $self->sequence->edges_at( $deleted ) ) {
288 my @vector = ( $kept );
289 push( @vector, $path->[1] ) if $path->[0] eq $deleted;
290 unshift( @vector, $path->[0] ) if $path->[1] eq $deleted;
49d4f2ac 291 next if $vector[0] eq $vector[1]; # Don't add a self loop
3a2ebbf4 292 my %wits = %{$self->sequence->get_edge_attributes( @$path )};
293 $self->sequence->add_edge( @vector );
294 my $fwits = $self->sequence->get_edge_attributes( @vector );
295 @wits{keys %$fwits} = values %$fwits;
296 $self->sequence->set_edge_attributes( @vector, \%wits );
297 }
22222af9 298 $self->relations->merge_readings( $kept, $deleted, $combine_char );
3a2ebbf4 299
300 # Do the deletion deed.
49d4f2ac 301 if( $combine_char ) {
302 my $kept_obj = $self->reading( $kept );
303 my $new_text = join( $combine_char, $kept_obj->text,
304 $self->reading( $deleted )->text );
305 $kept_obj->alter_text( $new_text );
306 }
3a2ebbf4 307 $self->del_reading( $deleted );
308}
7854e12e 309
3265b0ce 310
3a2ebbf4 311# Helper function for manipulating the graph.
312sub _stringify_args {
313 my( $self, $first, $second, $arg ) = @_;
314 $first = $first->id
315 if ref( $first ) eq 'Text::Tradition::Collation::Reading';
316 $second = $second->id
317 if ref( $second ) eq 'Text::Tradition::Collation::Reading';
318 return( $first, $second, $arg );
319}
df6d9812 320
4e5a7b2c 321# Helper function for manipulating the graph.
322sub _objectify_args {
323 my( $self, $first, $second, $arg ) = @_;
324 $first = $self->reading( $first )
325 unless ref( $first ) eq 'Text::Tradition::Collation::Reading';
326 $second = $self->reading( $second )
327 unless ref( $second ) eq 'Text::Tradition::Collation::Reading';
328 return( $first, $second, $arg );
329}
3a2ebbf4 330### Path logic
331
332sub add_path {
333 my $self = shift;
334
335 # We only need the IDs for adding paths to the graph, not the reading
336 # objects themselves.
337 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
338
339 # Connect the readings
340 $self->sequence->add_edge( $source, $target );
341 # Note the witness in question
342 $self->sequence->set_edge_attribute( $source, $target, $wit, 1 );
b15511bf 343};
344
3a2ebbf4 345sub del_path {
346 my $self = shift;
49d4f2ac 347 my @args;
348 if( ref( $_[0] ) eq 'ARRAY' ) {
349 my $e = shift @_;
350 @args = ( @$e, @_ );
351 } else {
352 @args = @_;
353 }
3a2ebbf4 354
355 # We only need the IDs for adding paths to the graph, not the reading
356 # objects themselves.
49d4f2ac 357 my( $source, $target, $wit ) = $self->_stringify_args( @args );
3a2ebbf4 358
359 if( $self->sequence->has_edge_attribute( $source, $target, $wit ) ) {
49d4f2ac 360 $self->sequence->delete_edge_attribute( $source, $target, $wit );
3a2ebbf4 361 }
362 unless( keys %{$self->sequence->get_edge_attributes( $source, $target )} ) {
363 $self->sequence->delete_edge( $source, $target );
364 }
784877d9 365}
366
3a2ebbf4 367
15d2d3df 368# Extra graph-alike utility
369sub has_path {
3a2ebbf4 370 my $self = shift;
371 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
372 return undef unless $self->sequence->has_edge( $source, $target );
373 return $self->sequence->has_edge_attribute( $source, $target, $wit );
b15511bf 374}
375
4e5a7b2c 376=head2 clear_witness( @sigil_list )
3a2ebbf4 377
4e5a7b2c 378Clear the given witnesses out of the collation entirely, removing references
379to them in paths, and removing readings that belong only to them. Should only
380be called via $tradition->del_witness.
3a2ebbf4 381
382=cut
383
4e5a7b2c 384sub clear_witness {
385 my( $self, @sigils ) = @_;
386
387 # Clear the witness(es) out of the paths
388 foreach my $e ( $self->paths ) {
389 foreach my $sig ( @sigils ) {
390 $self->del_path( $e, $sig );
391 }
392 }
393
394 # Clear out the newly unused readings
395 foreach my $r ( $self->readings ) {
396 unless( $self->reading_witnesses( $r ) ) {
397 $self->del_reading( $r );
398 }
399 }
400}
3a2ebbf4 401
402sub add_relationship {
403 my $self = shift;
22222af9 404 my( $source, $target, $opts ) = $self->_stringify_args( @_ );
63778331 405 my( @vectors ) = $self->relations->add_relationship( $source,
64ae6270 406 $self->reading( $source ), $target, $self->reading( $target ), $opts );
407 # Force a full rank recalculation every time. Yuck.
63778331 408 $self->calculate_ranks() if $self->end->has_rank;
409 return @vectors;
22222af9 410}
ef9d481f 411
22222af9 412=head2 reading_witnesses( $reading )
910a0a6d 413
22222af9 414Return a list of sigils corresponding to the witnesses in which the reading appears.
3265b0ce 415
22222af9 416=cut
1d310495 417
1d310495 418sub reading_witnesses {
419 my( $self, $reading ) = @_;
420 # We need only check either the incoming or the outgoing edges; I have
96dc90ec 421 # arbitrarily chosen "incoming". Thus, special-case the start node.
422 if( $reading eq $self->start ) {
423 return map { $_->sigil } $self->tradition->witnesses;
424 }
1d310495 425 my %all_witnesses;
426 foreach my $e ( $self->sequence->edges_to( $reading ) ) {
427 my $wits = $self->sequence->get_edge_attributes( @$e );
428 @all_witnesses{ keys %$wits } = 1;
429 }
430 return keys %all_witnesses;
910a0a6d 431}
432
4e5a7b2c 433=head1 OUTPUT METHODS
8e1394aa 434
0ecb975c 435=head2 as_svg( \%options )
8e1394aa 436
0068967c 437Returns an SVG string that represents the graph, via as_dot and graphviz.
0ecb975c 438See as_dot for a list of options.
8e1394aa 439
440=cut
441
442sub as_svg {
0ecb975c 443 my( $self, $opts ) = @_;
3a2ebbf4 444
8e1394aa 445 my @cmd = qw/dot -Tsvg/;
446 my( $svg, $err );
910a0a6d 447 my $dotfile = File::Temp->new();
a344ffcf 448 ## USE FOR DEBUGGING
eca16057 449 # $dotfile->unlink_on_destroy(0);
910a0a6d 450 binmode $dotfile, ':utf8';
0ecb975c 451 print $dotfile $self->as_dot( $opts );
910a0a6d 452 push( @cmd, $dotfile->filename );
453 run( \@cmd, ">", binary(), \$svg );
a344ffcf 454 # HACK part 3 - remove silent node+edge
455 my $parser = XML::LibXML->new();
456 my $svgdom = $parser->parse_string( $svg );
457 my $xpc = XML::LibXML::XPathContext->new( $svgdom->documentElement );
458 $xpc->registerNs( 'svg', 'http://www.w3.org/2000/svg' );
459 my @hacknodes = $xpc->findnodes( '//svg:g[contains(child::svg:title, "#SILENT#")]' );
460 foreach my $h ( @hacknodes ) {
461 $h->parentNode->removeChild( $h );
462 }
463 return decode_utf8( $svgdom->toString() );
8e1394aa 464}
465
b22576c6 466
0ecb975c 467=head2 as_dot( \%options )
b22576c6 468
0ecb975c 469Returns a string that is the collation graph expressed in dot
470(i.e. GraphViz) format. Options include:
b22576c6 471
0ecb975c 472=over 4
b22576c6 473
0ecb975c 474=item * from
b22576c6 475
0ecb975c 476=item * to
df6d9812 477
0ecb975c 478=item * color_common
479
480=back
df6d9812 481
482=cut
483
484sub as_dot {
0ecb975c 485 my( $self, $opts ) = @_;
486 my $startrank = $opts->{'from'} if $opts;
487 my $endrank = $opts->{'to'} if $opts;
488 my $color_common = $opts->{'color_common'} if $opts;
8960ee08 489 my $STRAIGHTENHACK = !$startrank && !$endrank && $self->end->rank
490 && $self->end->rank > 100;
b22576c6 491
492 # Check the arguments
493 if( $startrank ) {
494 return if $endrank && $startrank > $endrank;
495 return if $startrank > $self->end->rank;
496 }
497 if( defined $endrank ) {
498 return if $endrank < 0;
f1b3b33a 499 $endrank = undef if $endrank == $self->end->rank;
b22576c6 500 }
501
df6d9812 502 # TODO consider making some of these things configurable
67da8d6c 503 my $graph_name = $self->tradition->name;
504 $graph_name =~ s/[^\w\s]//g;
505 $graph_name = join( '_', split( /\s+/, $graph_name ) );
f13b5582 506
507 my %graph_attrs = (
508 'rankdir' => 'LR',
509 'bgcolor' => 'none',
510 );
511 my %node_attrs = (
512 'fontsize' => 11,
513 'fillcolor' => 'white',
514 'style' => 'filled',
515 'shape' => 'ellipse'
516 );
517 my %edge_attrs = (
518 'arrowhead' => 'open',
519 'color' => '#000000',
520 'fontcolor' => '#000000',
521 );
522
67da8d6c 523 my $dot = sprintf( "digraph %s {\n", $graph_name );
f13b5582 524 $dot .= "\tgraph " . _dot_attr_string( \%graph_attrs ) . ";\n";
525 $dot .= "\tnode " . _dot_attr_string( \%node_attrs ) . ";\n";
df6d9812 526
b22576c6 527 # Output substitute start/end readings if necessary
528 if( $startrank ) {
529 $dot .= "\t\"#SUBSTART#\" [ label=\"...\" ];\n";
530 }
531 if( $endrank ) {
532 $dot .= "\t\"#SUBEND#\" [ label=\"...\" ];\n";
533 }
8960ee08 534 if( $STRAIGHTENHACK ) {
a344ffcf 535 ## HACK part 1
536 $dot .= "\tsubgraph { rank=same \"#START#\" \"#SILENT#\" }\n";
537 $dot .= "\t\"#SILENT#\" [ color=white,penwidth=0,label=\"\" ];"
538 }
b22576c6 539 my %used; # Keep track of the readings that actually appear in the graph
df6d9812 540 foreach my $reading ( $self->readings ) {
b22576c6 541 # Only output readings within our rank range.
542 next if $startrank && $reading->rank < $startrank;
543 next if $endrank && $reading->rank > $endrank;
544 $used{$reading->id} = 1;
910a0a6d 545 # Need not output nodes without separate labels
3a2ebbf4 546 next if $reading->id eq $reading->text;
d4b75f44 547 my $rattrs;
30f0df34 548 my $label = $reading->text;
8f9cab7b 549 $label =~ s/\"/\\\"/g;
d4b75f44 550 $rattrs->{'label'} = $label;
0ecb975c 551 $rattrs->{'fillcolor'} = '#b3f36d' if $reading->is_common && $color_common;
d4b75f44 552 $dot .= sprintf( "\t\"%s\" %s;\n", $reading->id, _dot_attr_string( $rattrs ) );
df6d9812 553 }
3a2ebbf4 554
b22576c6 555 # Add the real edges
556 my @edges = $self->paths;
3bdec618 557 my( %substart, %subend );
b22576c6 558 foreach my $edge ( @edges ) {
559 # Do we need to output this edge?
508fd430 560 if( $used{$edge->[0]} && $used{$edge->[1]} ) {
f13b5582 561 my $label = $self->path_display_label( $self->path_witnesses( $edge ) );
562 my $variables = { %edge_attrs, 'label' => $label };
b22576c6 563 # Account for the rank gap if necessary
b0b4421a 564 if( $self->reading( $edge->[1] )->has_rank
565 && $self->reading( $edge->[0] )->has_rank
566 && $self->reading( $edge->[1] )->rank
567 - $self->reading( $edge->[0] )->rank > 1 ) {
568 $variables->{'minlen'} = $self->reading( $edge->[1] )->rank
b22576c6 569 - $self->reading( $edge->[0] )->rank;
b0b4421a 570 }
508fd430 571 # EXPERIMENTAL: make edge width reflect no. of witnesses
572 my $extrawidth = scalar( $self->path_witnesses( $edge ) ) * 0.2;
573 $variables->{'penwidth'} = $extrawidth + 0.8; # gives 1 for a single wit
574
f13b5582 575 my $varopts = _dot_attr_string( $variables );
576 $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
577 $edge->[0], $edge->[1], $varopts );
3bdec618 578 } elsif( $used{$edge->[0]} ) {
579 $subend{$edge->[0]} = 1;
580 } elsif( $used{$edge->[1]} ) {
581 $substart{$edge->[1]} = 1;
b22576c6 582 }
df6d9812 583 }
3bdec618 584 # Add substitute start and end edges if necessary
585 foreach my $node ( keys %substart ) {
586 my $witstr = $self->path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) );
f13b5582 587 my $variables = { %edge_attrs, 'label' => $witstr };
588 my $varopts = _dot_attr_string( $variables );
589 $dot .= "\t\"#SUBSTART#\" -> \"$node\" $varopts;";
3bdec618 590 }
591 foreach my $node ( keys %subend ) {
592 my $witstr = $self->path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) );
f13b5582 593 my $variables = { %edge_attrs, 'label' => $witstr };
594 my $varopts = _dot_attr_string( $variables );
595 $dot .= "\t\"$node\" -> \"#SUBEND#\" $varopts;";
3bdec618 596 }
a344ffcf 597 # HACK part 2
8960ee08 598 if( $STRAIGHTENHACK ) {
a344ffcf 599 $dot .= "\t\"#END#\" -> \"#SILENT#\" [ color=white,penwidth=0 ];\n";
600 }
3bdec618 601
df6d9812 602 $dot .= "}\n";
603 return $dot;
604}
605
f13b5582 606sub _dot_attr_string {
607 my( $hash ) = @_;
608 my @attrs;
609 foreach my $k ( sort keys %$hash ) {
610 my $v = $hash->{$k};
611 push( @attrs, $k.'="'.$v.'"' );
612 }
613 return( '[ ' . join( ', ', @attrs ) . ' ]' );
614}
615
3a2ebbf4 616sub path_witnesses {
617 my( $self, @edge ) = @_;
618 # If edge is an arrayref, cope.
619 if( @edge == 1 && ref( $edge[0] ) eq 'ARRAY' ) {
620 my $e = shift @edge;
621 @edge = @$e;
622 }
623 my @wits = keys %{$self->sequence->get_edge_attributes( @edge )};
508fd430 624 return @wits;
3a2ebbf4 625}
626
8f9cab7b 627sub path_display_label {
508fd430 628 my $self = shift;
629 my @wits = sort @_;
8f9cab7b 630 my $maj = scalar( $self->tradition->witnesses ) * 0.6;
631 if( scalar @wits > $maj ) {
f13b5582 632 # TODO break out a.c. wits
8f9cab7b 633 return 'majority';
634 } else {
635 return join( ', ', @wits );
636 }
637}
638
639
4e5a7b2c 640=head2 as_graphml
8e1394aa 641
4e5a7b2c 642Returns a GraphML representation of the collation. The GraphML will contain
643two graphs. The first expresses the attributes of the readings and the witness
644paths that link them; the second expresses the relationships that link the
645readings. This is the native transfer format for a tradition.
8e1394aa 646
56eefa04 647=begin testing
648
649use Text::Tradition;
650
651my $READINGS = 311;
652my $PATHS = 361;
653
654my $datafile = 't/data/florilegium_tei_ps.xml';
655my $tradition = Text::Tradition->new( 'input' => 'TEI',
656 'name' => 'test0',
657 'file' => $datafile,
658 'linear' => 1 );
659
660ok( $tradition, "Got a tradition object" );
661is( scalar $tradition->witnesses, 13, "Found all witnesses" );
662ok( $tradition->collation, "Tradition has a collation" );
663
664my $c = $tradition->collation;
665is( scalar $c->readings, $READINGS, "Collation has all readings" );
666is( scalar $c->paths, $PATHS, "Collation has all paths" );
667is( scalar $c->relationships, 0, "Collation has all relationships" );
668
669# Add a few relationships
670$c->add_relationship( 'w123', 'w125', { 'type' => 'collated' } );
671$c->add_relationship( 'w193', 'w196', { 'type' => 'collated' } );
672$c->add_relationship( 'w257', 'w262', { 'type' => 'transposition' } );
673
674# Now write it to GraphML and parse it again.
675
676my $graphml = $c->as_graphml;
677my $st = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml );
678is( scalar $st->collation->readings, $READINGS, "Reparsed collation has all readings" );
679is( scalar $st->collation->paths, $PATHS, "Reparsed collation has all paths" );
680is( scalar $st->collation->relationships, 3, "Reparsed collation has new relationships" );
681
682=end testing
683
8e1394aa 684=cut
685
686sub as_graphml {
3a2ebbf4 687 my( $self ) = @_;
8e1394aa 688
689 # Some namespaces
690 my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns';
691 my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance';
692 my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' .
910a0a6d 693 'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
8e1394aa 694
695 # Create the document and root node
696 my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
697 my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
698 $graphml->setDocumentElement( $root );
699 $root->setNamespace( $xsi_ns, 'xsi', 0 );
700 $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
701
e309421a 702 # Add the data keys for the graph
703 my %graph_data_keys;
704 my $gdi = 0;
1d310495 705 my @graph_attributes = qw/ version wit_list_separator baselabel linear ac_label /;
e309421a 706 foreach my $datum ( @graph_attributes ) {
707 $graph_data_keys{$datum} = 'dg'.$gdi++;
708 my $key = $root->addNewChild( $graphml_ns, 'key' );
709 $key->setAttribute( 'attr.name', $datum );
710 $key->setAttribute( 'attr.type', $key eq 'linear' ? 'boolean' : 'string' );
711 $key->setAttribute( 'for', 'graph' );
712 $key->setAttribute( 'id', $graph_data_keys{$datum} );
713 }
f6066bac 714
8e1394aa 715 # Add the data keys for nodes
ef9d481f 716 my %node_data_keys;
717 my $ndi = 0;
3a2ebbf4 718 my %node_data = (
719 id => 'string',
255875b8 720 text => 'string',
3a2ebbf4 721 rank => 'string',
722 is_start => 'boolean',
723 is_end => 'boolean',
724 is_lacuna => 'boolean',
725 );
726 foreach my $datum ( keys %node_data ) {
910a0a6d 727 $node_data_keys{$datum} = 'dn'.$ndi++;
728 my $key = $root->addNewChild( $graphml_ns, 'key' );
729 $key->setAttribute( 'attr.name', $datum );
3a2ebbf4 730 $key->setAttribute( 'attr.type', $node_data{$datum} );
910a0a6d 731 $key->setAttribute( 'for', 'node' );
732 $key->setAttribute( 'id', $node_data_keys{$datum} );
8e1394aa 733 }
734
df6d9812 735 # Add the data keys for edges, i.e. witnesses
ef9d481f 736 my $edi = 0;
737 my %edge_data_keys;
3a2ebbf4 738 my %edge_data = (
f523c7a8 739 class => 'string', # Class, deprecated soon
3a2ebbf4 740 witness => 'string', # ID/label for a path
741 relationship => 'string', # ID/label for a relationship
742 extra => 'boolean', # Path key
c84275ff 743 scope => 'string', # Relationship key
3a2ebbf4 744 non_correctable => 'boolean', # Relationship key
745 non_independent => 'boolean', # Relationship key
746 );
747 foreach my $datum ( keys %edge_data ) {
748 $edge_data_keys{$datum} = 'de'.$edi++;
910a0a6d 749 my $key = $root->addNewChild( $graphml_ns, 'key' );
3a2ebbf4 750 $key->setAttribute( 'attr.name', $datum );
751 $key->setAttribute( 'attr.type', $edge_data{$datum} );
910a0a6d 752 $key->setAttribute( 'for', 'edge' );
3a2ebbf4 753 $key->setAttribute( 'id', $edge_data_keys{$datum} );
8e1394aa 754 }
3a2ebbf4 755
22222af9 756 # Add the collation graph itself
2c669bca 757 my $sgraph = $root->addNewChild( $graphml_ns, 'graph' );
758 $sgraph->setAttribute( 'edgedefault', 'directed' );
759 $sgraph->setAttribute( 'id', $self->tradition->name );
760 $sgraph->setAttribute( 'parse.edgeids', 'canonical' );
761 $sgraph->setAttribute( 'parse.edges', scalar($self->paths) );
762 $sgraph->setAttribute( 'parse.nodeids', 'canonical' );
763 $sgraph->setAttribute( 'parse.nodes', scalar($self->readings) );
764 $sgraph->setAttribute( 'parse.order', 'nodesfirst' );
22222af9 765
e309421a 766 # Collation attribute data
767 foreach my $datum ( @graph_attributes ) {
2c669bca 768 my $value = $datum eq 'version' ? '3.0' : $self->$datum;
769 _add_graphml_data( $sgraph, $graph_data_keys{$datum}, $value );
e309421a 770 }
8e1394aa 771
772 my $node_ctr = 0;
773 my %node_hash;
22222af9 774 # Add our readings to the graph
3a2ebbf4 775 foreach my $n ( sort { $a->id cmp $b->id } $self->readings ) {
2c669bca 776 # Add to the main graph
777 my $node_el = $sgraph->addNewChild( $graphml_ns, 'node' );
910a0a6d 778 my $node_xmlid = 'n' . $node_ctr++;
3a2ebbf4 779 $node_hash{ $n->id } = $node_xmlid;
910a0a6d 780 $node_el->setAttribute( 'id', $node_xmlid );
255875b8 781 foreach my $d ( keys %node_data ) {
782 my $nval = $n->$d;
783 _add_graphml_data( $node_el, $node_data_keys{$d}, $nval )
784 if defined $nval;
785 }
b15511bf 786 }
787
2c669bca 788 # Add the path edges to the sequence graph
df6d9812 789 my $edge_ctr = 0;
3a2ebbf4 790 foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->sequence->edges() ) {
791 # We add an edge in the graphml for every witness in $e.
508fd430 792 foreach my $wit ( sort $self->path_witnesses( $e ) ) {
3a2ebbf4 793 my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
794 $node_hash{ $e->[0] },
795 $node_hash{ $e->[1] } );
2c669bca 796 my $edge_el = $sgraph->addNewChild( $graphml_ns, 'edge' );
3a2ebbf4 797 $edge_el->setAttribute( 'source', $from );
798 $edge_el->setAttribute( 'target', $to );
799 $edge_el->setAttribute( 'id', $id );
3a2ebbf4 800
801 # It's a witness path, so add the witness
802 my $base = $wit;
803 my $key = $edge_data_keys{'witness'};
804 # Is this an ante-corr witness?
805 my $aclabel = $self->ac_label;
806 if( $wit =~ /^(.*)\Q$aclabel\E$/ ) {
807 # Keep the base witness
808 $base = $1;
809 # ...and record that this is an 'extra' reading path
810 _add_graphml_data( $edge_el, $edge_data_keys{'extra'}, $aclabel );
811 }
812 _add_graphml_data( $edge_el, $edge_data_keys{'witness'}, $base );
f523c7a8 813 _add_graphml_data( $edge_el, $edge_data_keys{'class'}, 'path' );
3a2ebbf4 814 }
815 }
816
22222af9 817 # Add the relationship graph to the XML
2626f709 818 $self->relations->as_graphml( $graphml_ns, $root, \%node_hash,
819 $node_data_keys{'id'}, \%edge_data_keys );
8e1394aa 820
94c00c71 821 # Save and return the thing
822 my $result = decode_utf8( $graphml->toString(1) );
94c00c71 823 return $result;
df6d9812 824}
825
b15511bf 826sub _add_graphml_data {
827 my( $el, $key, $value ) = @_;
b15511bf 828 return unless defined $value;
c9bf3dbf 829 my $data_el = $el->addNewChild( $el->namespaceURI, 'data' );
b15511bf 830 $data_el->setAttribute( 'key', $key );
831 $data_el->appendText( $value );
8e1394aa 832}
833
4e5a7b2c 834=head2 as_csv
910a0a6d 835
836Returns a CSV alignment table representation of the collation graph, one
2c669bca 837row per witness (or witness uncorrected.)
910a0a6d 838
839=cut
840
841sub as_csv {
3a2ebbf4 842 my( $self ) = @_;
910a0a6d 843 my $table = $self->make_alignment_table;
844 my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } );
845 my @result;
2c669bca 846 # Make the header row
847 $csv->combine( map { $_->{'witness'} } @{$table->{'alignment'}} );
848 push( @result, decode_utf8( $csv->string ) );
849 # Make the rest of the rows
850 foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
566f4595 851 my @rowobjs = map { $_->{'tokens'}->[$idx] } @{$table->{'alignment'}};
2c669bca 852 my @row = map { $_ ? $_->{'t'} : $_ } @rowobjs;
853 $csv->combine( @row );
910a0a6d 854 push( @result, decode_utf8( $csv->string ) );
855 }
3a2ebbf4 856 return join( "\n", @result );
910a0a6d 857}
858
4e5a7b2c 859=head2 make_alignment_table( $use_refs, $include_witnesses )
2c669bca 860
566f4595 861Return a reference to an alignment table, in a slightly enhanced CollateX
862format which looks like this:
863
864 $table = { alignment => [ { witness => "SIGIL",
4e5a7b2c 865 tokens => [ { t => "TEXT" }, ... ] },
566f4595 866 { witness => "SIG2",
4e5a7b2c 867 tokens => [ { t => "TEXT" }, ... ] },
566f4595 868 ... ],
869 length => TEXTLEN };
870
871If $use_refs is set to 1, the reading object is returned in the table
872instead of READINGTEXT; if not, the text of the reading is returned.
4e5a7b2c 873
874If $include_witnesses is set to a hashref, only the witnesses whose sigil
566f4595 875keys have a true hash value will be included.
2c669bca 876
877=cut
9f3ba6f7 878
910a0a6d 879sub make_alignment_table {
08e0fb85 880 my( $self, $noderefs, $include ) = @_;
0ecb975c 881 # Make sure we can do this
882 throw( "Need a linear graph in order to make an alignment table" )
883 unless $self->linear;
884 $self->calculate_ranks unless $self->end->has_rank;
885
2c669bca 886 my $table = { 'alignment' => [], 'length' => $self->end->rank - 1 };
3a2ebbf4 887 my @all_pos = ( 1 .. $self->end->rank - 1 );
68454b71 888 foreach my $wit ( sort { $a->sigil cmp $b->sigil } $self->tradition->witnesses ) {
2c669bca 889 if( $include ) {
566f4595 890 next unless $include->{$wit->sigil};
2c669bca 891 }
eca16057 892 # print STDERR "Making witness row(s) for " . $wit->sigil . "\n";
1f7aa795 893 my @wit_path = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
894 my @row = _make_witness_row( \@wit_path, \@all_pos, $noderefs );
2c669bca 895 push( @{$table->{'alignment'}},
896 { 'witness' => $wit->sigil, 'tokens' => \@row } );
1f7aa795 897 if( $wit->is_layered ) {
898 my @wit_ac_path = $self->reading_sequence( $self->start, $self->end,
861c3e27 899 $wit->sigil.$self->ac_label );
1f7aa795 900 my @ac_row = _make_witness_row( \@wit_ac_path, \@all_pos, $noderefs );
2c669bca 901 push( @{$table->{'alignment'}},
902 { 'witness' => $wit->sigil.$self->ac_label, 'tokens' => \@ac_row } );
910a0a6d 903 }
904 }
2c669bca 905 return $table;
910a0a6d 906}
907
908sub _make_witness_row {
0e476982 909 my( $path, $positions, $noderefs ) = @_;
910a0a6d 910 my %char_hash;
911 map { $char_hash{$_} = undef } @$positions;
2c669bca 912 my $debug = 0;
910a0a6d 913 foreach my $rdg ( @$path ) {
eca16057 914 my $rtext = $rdg->text;
915 $rtext = '#LACUNA#' if $rdg->is_lacuna;
2c669bca 916 print STDERR "rank " . $rdg->rank . "\n" if $debug;
3a2ebbf4 917 # print STDERR "No rank for " . $rdg->id . "\n" unless defined $rdg->rank;
2c669bca 918 $char_hash{$rdg->rank} = $noderefs ? { 't' => $rdg }
919 : { 't' => $rtext };
910a0a6d 920 }
921 my @row = map { $char_hash{$_} } @$positions;
eca16057 922 # Fill in lacuna markers for undef spots in the row
923 my $last_el = shift @row;
924 my @filled_row = ( $last_el );
925 foreach my $el ( @row ) {
0e476982 926 # If we are using node reference, make the lacuna node appear many times
927 # in the table. If not, use the lacuna tag.
928 if( $last_el && _el_is_lacuna( $last_el ) && !defined $el ) {
566f4595 929 $el = $noderefs ? $last_el : { 't' => '#LACUNA#' };
eca16057 930 }
931 push( @filled_row, $el );
932 $last_el = $el;
933 }
934 return @filled_row;
910a0a6d 935}
936
0e476982 937# Tiny utility function to say if a table element is a lacuna
938sub _el_is_lacuna {
939 my $el = shift;
2c669bca 940 return 1 if $el->{'t'} eq '#LACUNA#';
941 return 1 if ref( $el->{'t'} ) eq 'Text::Tradition::Collation::Reading'
942 && $el->{'t'}->is_lacuna;
0e476982 943 return 0;
944}
945
910a0a6d 946# Helper to turn the witnesses along columns rather than rows. Assumes
947# equal-sized rows.
948sub _turn_table {
949 my( $table ) = @_;
950 my $result = [];
951 return $result unless scalar @$table;
952 my $nrows = scalar @{$table->[0]};
953 foreach my $idx ( 0 .. $nrows - 1 ) {
954 foreach my $wit ( 0 .. $#{$table} ) {
955 $result->[$idx]->[$wit] = $table->[$wit]->[$idx];
956 }
957 }
958 return $result;
959}
960
4e5a7b2c 961=head1 NAVIGATION METHODS
910a0a6d 962
4e5a7b2c 963=head2 reading_sequence( $first, $last, $sigil, $backup )
e2902068 964
965Returns the ordered list of readings, starting with $first and ending
4e5a7b2c 966with $last, for the witness given in $sigil. If a $backup sigil is
967specified (e.g. when walking a layered witness), it will be used wherever
968no $sigil path exists. If there is a base text reading, that will be
969used wherever no path exists for $sigil or $backup.
e2902068 970
971=cut
972
910a0a6d 973# TODO Think about returning some lazy-eval iterator.
b0b4421a 974# TODO Get rid of backup; we should know from what witness is whether we need it.
910a0a6d 975
e2902068 976sub reading_sequence {
861c3e27 977 my( $self, $start, $end, $witness ) = @_;
e2902068 978
930ff666 979 $witness = $self->baselabel unless $witness;
e2902068 980 my @readings = ( $start );
981 my %seen;
982 my $n = $start;
3a2ebbf4 983 while( $n && $n->id ne $end->id ) {
984 if( exists( $seen{$n->id} ) ) {
63778331 985 throw( "Detected loop for $witness at " . $n->id );
910a0a6d 986 }
3a2ebbf4 987 $seen{$n->id} = 1;
910a0a6d 988
861c3e27 989 my $next = $self->next_reading( $n, $witness );
44771cf2 990 unless( $next ) {
63778331 991 throw( "Did not find any path for $witness from reading " . $n->id );
44771cf2 992 }
910a0a6d 993 push( @readings, $next );
994 $n = $next;
e2902068 995 }
996 # Check that the last reading is our end reading.
997 my $last = $readings[$#readings];
63778331 998 throw( "Last reading found from " . $start->text .
999 " for witness $witness is not the end!" ) # TODO do we get this far?
3a2ebbf4 1000 unless $last->id eq $end->id;
e2902068 1001
1002 return @readings;
1003}
1004
4e5a7b2c 1005=head2 next_reading( $reading, $sigil );
8e1394aa 1006
4a8828f0 1007Returns the reading that follows the given reading along the given witness
930ff666 1008path.
8e1394aa 1009
1010=cut
1011
4a8828f0 1012sub next_reading {
e2902068 1013 # Return the successor via the corresponding path.
8e1394aa 1014 my $self = shift;
3a2ebbf4 1015 my $answer = $self->_find_linked_reading( 'next', @_ );
2c669bca 1016 return undef unless $answer;
3a2ebbf4 1017 return $self->reading( $answer );
8e1394aa 1018}
1019
4e5a7b2c 1020=head2 prior_reading( $reading, $sigil )
8e1394aa 1021
4a8828f0 1022Returns the reading that precedes the given reading along the given witness
930ff666 1023path.
8e1394aa 1024
1025=cut
1026
4a8828f0 1027sub prior_reading {
e2902068 1028 # Return the predecessor via the corresponding path.
8e1394aa 1029 my $self = shift;
3a2ebbf4 1030 my $answer = $self->_find_linked_reading( 'prior', @_ );
1031 return $self->reading( $answer );
8e1394aa 1032}
1033
4a8828f0 1034sub _find_linked_reading {
861c3e27 1035 my( $self, $direction, $node, $path ) = @_;
1036
1037 # Get a backup if we are dealing with a layered witness
1038 my $alt_path;
1039 my $aclabel = $self->ac_label;
1040 if( $path && $path =~ /^(.*)\Q$aclabel\E$/ ) {
1041 $alt_path = $1;
1042 }
1043
e2902068 1044 my @linked_paths = $direction eq 'next'
3a2ebbf4 1045 ? $self->sequence->edges_from( $node )
1046 : $self->sequence->edges_to( $node );
e2902068 1047 return undef unless scalar( @linked_paths );
8e1394aa 1048
e2902068 1049 # We have to find the linked path that contains all of the
1050 # witnesses supplied in $path.
1051 my( @path_wits, @alt_path_wits );
4e5a7b2c 1052 @path_wits = sort( $self->_witnesses_of_label( $path ) ) if $path;
1053 @alt_path_wits = sort( $self->_witnesses_of_label( $alt_path ) ) if $alt_path;
e2902068 1054 my $base_le;
1055 my $alt_le;
1056 foreach my $le ( @linked_paths ) {
3a2ebbf4 1057 if( $self->sequence->has_edge_attribute( @$le, $self->baselabel ) ) {
910a0a6d 1058 $base_le = $le;
910a0a6d 1059 }
508fd430 1060 my @le_wits = sort $self->path_witnesses( $le );
3a2ebbf4 1061 if( _is_within( \@path_wits, \@le_wits ) ) {
1062 # This is the right path.
1063 return $direction eq 'next' ? $le->[1] : $le->[0];
1064 } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) {
1065 $alt_le = $le;
1066 }
8e1394aa 1067 }
e2902068 1068 # Got this far? Return the alternate path if it exists.
3a2ebbf4 1069 return $direction eq 'next' ? $alt_le->[1] : $alt_le->[0]
910a0a6d 1070 if $alt_le;
e2902068 1071
1072 # Got this far? Return the base path if it exists.
3a2ebbf4 1073 return $direction eq 'next' ? $base_le->[1] : $base_le->[0]
910a0a6d 1074 if $base_le;
e2902068 1075
1076 # Got this far? We have no appropriate path.
2c669bca 1077 warn "Could not find $direction node from " . $node->id
910a0a6d 1078 . " along path $path";
8e1394aa 1079 return undef;
1080}
1081
4a8828f0 1082# Some set logic.
1083sub _is_within {
1084 my( $set1, $set2 ) = @_;
7854e12e 1085 my $ret = @$set1; # will be 0, i.e. false, if set1 is empty
4a8828f0 1086 foreach my $el ( @$set1 ) {
910a0a6d 1087 $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
4a8828f0 1088 }
1089 return $ret;
1090}
1091
4e5a7b2c 1092# Return the string that joins together a list of witnesses for
1093# display on a single path.
1094sub _witnesses_of_label {
1095 my( $self, $label ) = @_;
1096 my $regex = $self->wit_list_separator;
1097 my @answer = split( /\Q$regex\E/, $label );
1098 return @answer;
b0b4421a 1099}
1100
d4b75f44 1101=head2 common_readings
1102
1103Returns the list of common readings in the graph (i.e. those readings that are
1104shared by all non-lacunose witnesses.)
1105
1106=cut
1107
1108sub common_readings {
1109 my $self = shift;
1110 my @common = grep { $_->is_common } $self->readings;
1111 return @common;
1112}
1113
b0b4421a 1114=head2 path_text( $sigil, $mainsigil [, $start, $end ] )
1115
1116Returns the text of a witness (plus its backup, if we are using a layer)
1117as stored in the collation. The text is returned as a string, where the
1118individual readings are joined with spaces and the meta-readings (e.g.
1119lacunae) are omitted. Optional specification of $start and $end allows
1120the generation of a subset of the witness text.
4e5a7b2c 1121
b0b4421a 1122=cut
1123
1124sub path_text {
861c3e27 1125 my( $self, $wit, $start, $end ) = @_;
b0b4421a 1126 $start = $self->start unless $start;
1127 $end = $self->end unless $end;
861c3e27 1128 my @path = grep { !$_->is_meta } $self->reading_sequence( $start, $end, $wit );
b0b4421a 1129 return join( ' ', map { $_->text } @path );
1130}
4e5a7b2c 1131
1132=head1 INITIALIZATION METHODS
1133
1134These are mostly for use by parsers.
1135
1136=head2 make_witness_path( $witness )
1137
1138Link the array of readings contained in $witness->path (and in
1139$witness->uncorrected_path if it exists) into collation paths.
1140Clear out the arrays when finished.
de51424a 1141
4e5a7b2c 1142=head2 make_witness_paths
1143
1144Call make_witness_path for all witnesses in the tradition.
1145
1146=cut
930ff666 1147
7e450e44 1148# For use when a collation is constructed from a base text and an apparatus.
1149# We have the sequences of readings and just need to add path edges.
1f7aa795 1150# When we are done, clear out the witness path attributes, as they are no
1151# longer needed.
1152# TODO Find a way to replace the witness path attributes with encapsulated functions?
e2902068 1153
6a222840 1154sub make_witness_paths {
1155 my( $self ) = @_;
910a0a6d 1156 foreach my $wit ( $self->tradition->witnesses ) {
0068967c 1157 # print STDERR "Making path for " . $wit->sigil . "\n";
910a0a6d 1158 $self->make_witness_path( $wit );
7854e12e 1159 }
7854e12e 1160}
1161
6a222840 1162sub make_witness_path {
7854e12e 1163 my( $self, $wit ) = @_;
1164 my @chain = @{$wit->path};
15d2d3df 1165 my $sig = $wit->sigil;
7854e12e 1166 foreach my $idx ( 0 .. $#chain-1 ) {
910a0a6d 1167 $self->add_path( $chain[$idx], $chain[$idx+1], $sig );
7854e12e 1168 }
1f7aa795 1169 if( $wit->is_layered ) {
d9e873d0 1170 @chain = @{$wit->uncorrected_path};
1171 foreach my $idx( 0 .. $#chain-1 ) {
1172 my $source = $chain[$idx];
1173 my $target = $chain[$idx+1];
1174 $self->add_path( $source, $target, $sig.$self->ac_label )
1175 unless $self->has_path( $source, $target, $sig );
1176 }
15d2d3df 1177 }
1f7aa795 1178 $wit->clear_path;
1179 $wit->clear_uncorrected_path;
e2902068 1180}
1181
4e5a7b2c 1182=head2 calculate_ranks
1183
1184Calculate the reading ranks (that is, their aligned positions relative
1185to each other) for the graph. This can only be called on linear collations.
1186
1187=cut
1188
910a0a6d 1189sub calculate_ranks {
1190 my $self = shift;
1191 # Walk a version of the graph where every node linked by a relationship
1192 # edge is fundamentally the same node, and do a topological ranking on
1193 # the nodes in this graph.
c9bf3dbf 1194 my $topo_graph = Graph->new();
910a0a6d 1195 my %rel_containers;
1196 my $rel_ctr = 0;
1197 # Add the nodes
1198 foreach my $r ( $self->readings ) {
3a2ebbf4 1199 next if exists $rel_containers{$r->id};
910a0a6d 1200 my @rels = $r->related_readings( 'colocated' );
1201 if( @rels ) {
1202 # Make a relationship container.
1203 push( @rels, $r );
c9bf3dbf 1204 my $rn = 'rel_container_' . $rel_ctr++;
1205 $topo_graph->add_vertex( $rn );
910a0a6d 1206 foreach( @rels ) {
3a2ebbf4 1207 $rel_containers{$_->id} = $rn;
910a0a6d 1208 }
1209 } else {
1210 # Add a new node to mirror the old node.
3a2ebbf4 1211 $rel_containers{$r->id} = $r->id;
1212 $topo_graph->add_vertex( $r->id );
910a0a6d 1213 }
4a8828f0 1214 }
3a1f2523 1215
3a2ebbf4 1216 # Add the edges.
910a0a6d 1217 foreach my $r ( $self->readings ) {
3a2ebbf4 1218 foreach my $n ( $self->sequence->successors( $r->id ) ) {
1219 my( $tfrom, $tto ) = ( $rel_containers{$r->id},
1220 $rel_containers{$n} );
4e5a7b2c 1221 # $DB::single = 1 unless $tfrom && $tto;
3a2ebbf4 1222 $topo_graph->add_edge( $tfrom, $tto );
910a0a6d 1223 }
1224 }
1225
1226 # Now do the rankings, starting with the start node.
3a2ebbf4 1227 my $topo_start = $rel_containers{$self->start->id};
c9bf3dbf 1228 my $node_ranks = { $topo_start => 0 };
910a0a6d 1229 my @curr_origin = ( $topo_start );
1230 # A little iterative function.
1231 while( @curr_origin ) {
c9bf3dbf 1232 @curr_origin = _assign_rank( $topo_graph, $node_ranks, @curr_origin );
910a0a6d 1233 }
1234 # Transfer our rankings from the topological graph to the real one.
1235 foreach my $r ( $self->readings ) {
3a2ebbf4 1236 if( defined $node_ranks->{$rel_containers{$r->id}} ) {
1237 $r->rank( $node_ranks->{$rel_containers{$r->id}} );
67da8d6c 1238 } else {
63778331 1239 # Die. Find the last rank we calculated.
1240 my @all_defined = sort { $node_ranks->{$rel_containers{$a->id}}
1241 <=> $node_ranks->{$rel_containers{$b->id}} }
1242 $self->readings;
1243 my $last = pop @all_defined;
1244 throw( "Ranks not calculated after $last - do you have a cycle in the graph?" );
67da8d6c 1245 }
de51424a 1246 }
8e1394aa 1247}
3a1f2523 1248
910a0a6d 1249sub _assign_rank {
c9bf3dbf 1250 my( $graph, $node_ranks, @current_nodes ) = @_;
910a0a6d 1251 # Look at each of the children of @current_nodes. If all the child's
1252 # parents have a rank, assign it the highest rank + 1 and add it to
c9bf3dbf 1253 # @next_nodes. Otherwise skip it; we will return when the highest-ranked
1254 # parent gets a rank.
910a0a6d 1255 my @next_nodes;
1256 foreach my $c ( @current_nodes ) {
c9bf3dbf 1257 warn "Current reading $c has no rank!"
1258 unless exists $node_ranks->{$c};
1259 # print STDERR "Looking at child of node $c, rank "
1260 # . $node_ranks->{$c} . "\n";
1261 foreach my $child ( $graph->successors( $c ) ) {
1262 next if exists $node_ranks->{$child};
910a0a6d 1263 my $highest_rank = -1;
1264 my $skip = 0;
c9bf3dbf 1265 foreach my $parent ( $graph->predecessors( $child ) ) {
1266 if( exists $node_ranks->{$parent} ) {
1267 $highest_rank = $node_ranks->{$parent}
1268 if $highest_rank <= $node_ranks->{$parent};
910a0a6d 1269 } else {
1270 $skip = 1;
1271 last;
1272 }
1273 }
1274 next if $skip;
c9bf3dbf 1275 my $c_rank = $highest_rank + 1;
1276 # print STDERR "Assigning rank $c_rank to node $child \n";
1277 $node_ranks->{$child} = $c_rank;
910a0a6d 1278 push( @next_nodes, $child );
1279 }
1280 }
1281 return @next_nodes;
4cdd82f1 1282}
910a0a6d 1283
4e5a7b2c 1284=head2 flatten_ranks
1285
1286A convenience method for parsing collation data. Searches the graph for readings
1287with the same text at the same rank, and merges any that are found.
1288
1289=cut
1290
0e476982 1291sub flatten_ranks {
1292 my $self = shift;
1293 my %unique_rank_rdg;
1294 foreach my $rdg ( $self->readings ) {
1295 next unless $rdg->has_rank;
1296 my $key = $rdg->rank . "||" . $rdg->text;
1297 if( exists $unique_rank_rdg{$key} ) {
1298 # Combine!
56eefa04 1299 # print STDERR "Combining readings at same rank: $key\n";
0e476982 1300 $self->merge_readings( $unique_rank_rdg{$key}, $rdg );
0ecb975c 1301 # TODO see if this now makes a common point.
0e476982 1302 } else {
1303 $unique_rank_rdg{$key} = $rdg;
1304 }
1305 }
1306}
1307
d4b75f44 1308=head2 calculate_common_readings
1309
1310Goes through the graph identifying the readings that appear in every witness
1311(apart from those with lacunae at that spot.) Marks them as common and returns
1312the list.
1313
1314=begin testing
1315
1316use Text::Tradition;
1317
1318my $cxfile = 't/data/Collatex-16.xml';
1319my $t = Text::Tradition->new(
1320 'name' => 'inline',
1321 'input' => 'CollateX',
1322 'file' => $cxfile,
1323 );
1324my $c = $t->collation;
1325
1326my @common = $c->calculate_common_readings();
1327is( scalar @common, 8, "Found correct number of common readings" );
1328my @marked = sort $c->common_readings();
1329is( scalar @common, 8, "All common readings got marked as such" );
1330my @expected = qw/ n1 n12 n16 n19 n20 n5 n6 n7 /;
1331is_deeply( \@marked, \@expected, "Found correct list of common readings" );
1332
1333=end testing
1334
1335=cut
1336
1337sub calculate_common_readings {
1338 my $self = shift;
1339 my @common;
1340 my $table = $self->make_alignment_table( 1 );
1341 foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
1342 my @row = map { $_->{'tokens'}->[$idx]->{'t'} } @{$table->{'alignment'}};
1343 my %hash;
1344 foreach my $r ( @row ) {
1345 if( $r ) {
1346 $hash{$r->id} = $r unless $r->is_meta;
1347 } else {
1348 $hash{'UNDEF'} = $r;
1349 }
1350 }
1351 if( keys %hash == 1 && !exists $hash{'UNDEF'} ) {
1352 my( $r ) = values %hash;
1353 $r->is_common( 1 );
1354 push( @common, $r );
1355 }
1356 }
1357 return @common;
1358}
1359
861c3e27 1360=head2 text_from_paths
1361
1362Calculate the text array for all witnesses from the path, for later consistency
1363checking. Only to be used if there is no non-graph-based way to know the
1364original texts.
1365
1366=cut
1367
1368sub text_from_paths {
1369 my $self = shift;
1370 foreach my $wit ( $self->tradition->witnesses ) {
1371 my @text = split( /\s+/,
1372 $self->reading_sequence( $self->start, $self->end, $wit->sigil ) );
1373 $wit->text( \@text );
1374 if( $wit->is_layered ) {
1375 my @uctext = split( /\s+/,
1376 $self->reading_sequence( $self->start, $self->end,
1377 $wit->sigil.$self->ac_label ) );
1378 $wit->text( \@uctext );
1379 }
1380 }
1381}
0e476982 1382
4e5a7b2c 1383=head1 UTILITY FUNCTIONS
1384
1385=head2 common_predecessor( $reading_a, $reading_b )
8e1394aa 1386
4e5a7b2c 1387Find the last reading that occurs in sequence before both the given readings.
1388
1389=head2 common_successor( $reading_a, $reading_b )
1390
1391Find the first reading that occurs in sequence after both the given readings.
1392
22222af9 1393=begin testing
1394
1395use Text::Tradition;
1396
1397my $cxfile = 't/data/Collatex-16.xml';
1398my $t = Text::Tradition->new(
1399 'name' => 'inline',
1400 'input' => 'CollateX',
1401 'file' => $cxfile,
1402 );
1403my $c = $t->collation;
1404
4e5a7b2c 1405is( $c->common_predecessor( 'n9', 'n23' )->id,
22222af9 1406 'n20', "Found correct common predecessor" );
4e5a7b2c 1407is( $c->common_successor( 'n9', 'n23' )->id,
22222af9 1408 '#END#', "Found correct common successor" );
1409
4e5a7b2c 1410is( $c->common_predecessor( 'n19', 'n17' )->id,
22222af9 1411 'n16', "Found correct common predecessor for readings on same path" );
4e5a7b2c 1412is( $c->common_successor( 'n21', 'n26' )->id,
22222af9 1413 '#END#', "Found correct common successor for readings on same path" );
1414
1415=end testing
1416
1417=cut
1418
1419## Return the closest reading that is a predecessor of both the given readings.
1420sub common_predecessor {
1421 my $self = shift;
4e5a7b2c 1422 my( $r1, $r2 ) = $self->_objectify_args( @_ );
1423 return $self->common_in_path( $r1, $r2, 'predecessors' );
22222af9 1424}
1425
1426sub common_successor {
1427 my $self = shift;
4e5a7b2c 1428 my( $r1, $r2 ) = $self->_objectify_args( @_ );
1429 return $self->common_in_path( $r1, $r2, 'successors' );
22222af9 1430}
1431
1432sub common_in_path {
1433 my( $self, $r1, $r2, $dir ) = @_;
1434 my $iter = $r1->rank > $r2->rank ? $r1->rank : $r2->rank;
1435 $iter = $self->end->rank - $iter if $dir eq 'successors';
1436 my @candidates;
1437 my @last_checked = ( $r1, $r2 );
1438 my %all_seen;
1439 while( !@candidates ) {
1440 my @new_lc;
1441 foreach my $lc ( @last_checked ) {
1442 foreach my $p ( $lc->$dir ) {
1443 if( $all_seen{$p->id} ) {
1444 push( @candidates, $p );
1445 } else {
1446 $all_seen{$p->id} = 1;
1447 push( @new_lc, $p );
1448 }
1449 }
1450 }
1451 @last_checked = @new_lc;
1452 }
1453 my @answer = sort { $a->rank <=> $b->rank } @candidates;
1454 return $dir eq 'predecessors' ? pop( @answer ) : shift ( @answer );
1455}
1456
63778331 1457sub throw {
1458 Text::Tradition::Error->throw(
1459 'ident' => 'Collation error',
1460 'message' => $_[0],
1461 );
1462}
1463
dd3b58b0 1464no Moose;
1465__PACKAGE__->meta->make_immutable;
e867486f 1466
1467=head1 BUGS / TODO
1468
1469=over
1470
4e5a7b2c 1471=item * Get rid of $backup in reading_sequence
e867486f 1472
1473=back