detect and mark common readings
[scpubgit/stemmatology.git] / lib / Text / Tradition / Collation.pm
CommitLineData
dd3b58b0 1package Text::Tradition::Collation;
d047cd52 2
910a0a6d 3use Encode qw( decode_utf8 );
4use File::Temp;
c9bf3dbf 5use Graph;
8e1394aa 6use IPC::Run qw( run binary );
910a0a6d 7use Text::CSV_XS;
b15511bf 8use Text::Tradition::Collation::Reading;
22222af9 9use Text::Tradition::Collation::RelationshipStore;
63778331 10use Text::Tradition::Error;
df6d9812 11use XML::LibXML;
dd3b58b0 12use Moose;
13
3a2ebbf4 14has 'sequence' => (
d047cd52 15 is => 'ro',
3a2ebbf4 16 isa => 'Graph',
17 default => sub { Graph->new() },
d047cd52 18 handles => {
3a2ebbf4 19 paths => 'edges',
d047cd52 20 },
d047cd52 21 );
3a2ebbf4 22
23has 'relations' => (
24 is => 'ro',
22222af9 25 isa => 'Text::Tradition::Collation::RelationshipStore',
26 handles => {
27 relationships => 'relationships',
28 related_readings => 'related_readings',
29 },
30 writer => '_set_relations',
3a2ebbf4 31 );
dd3b58b0 32
3a2ebbf4 33has 'tradition' => (
34 is => 'ro',
d047cd52 35 isa => 'Text::Tradition',
8d9a1cd8 36 weak_ref => 1,
d047cd52 37 );
dd3b58b0 38
3a2ebbf4 39has 'readings' => (
40 isa => 'HashRef[Text::Tradition::Collation::Reading]',
41 traits => ['Hash'],
42 handles => {
43 reading => 'get',
44 _add_reading => 'set',
45 del_reading => 'delete',
46 has_reading => 'exists',
47 readings => 'values',
48 },
49 default => sub { {} },
50 );
910a0a6d 51
4a8828f0 52has 'wit_list_separator' => (
7854e12e 53 is => 'rw',
54 isa => 'Str',
55 default => ', ',
56 );
57
58has 'baselabel' => (
59 is => 'rw',
60 isa => 'Str',
61 default => 'base text',
62 );
4a8828f0 63
15d2d3df 64has 'linear' => (
65 is => 'rw',
66 isa => 'Bool',
67 default => 1,
68 );
c84275ff 69
ef9d481f 70has 'ac_label' => (
71 is => 'rw',
72 isa => 'Str',
73 default => ' (a.c.)',
74 );
3a2ebbf4 75
76has 'start' => (
77 is => 'ro',
78 isa => 'Text::Tradition::Collation::Reading',
79 writer => '_set_start',
80 weak_ref => 1,
81 );
82
83has 'end' => (
84 is => 'ro',
85 isa => 'Text::Tradition::Collation::Reading',
86 writer => '_set_end',
87 weak_ref => 1,
88 );
1f563ac3 89
4e5a7b2c 90=head1 NAME
91
92Text::Tradition::Collation - a software model for a text collation
93
94=head1 SYNOPSIS
95
96 use Text::Tradition;
97 my $t = Text::Tradition->new(
98 'name' => 'this is a text',
99 'input' => 'TEI',
100 'file' => '/path/to/tei_parallel_seg_file.xml' );
101
102 my $c = $t->collation;
103 my @readings = $c->readings;
104 my @paths = $c->paths;
105 my @relationships = $c->relationships;
106
107 my $svg_variant_graph = $t->collation->as_svg();
108
109=head1 DESCRIPTION
110
111Text::Tradition is a library for representation and analysis of collated
112texts, particularly medieval ones. The Collation is the central feature of
113a Tradition, where the text, its sequence of readings, and its relationships
114between readings are actually kept.
115
116=head1 CONSTRUCTOR
117
118=head2 new
119
120The constructor. Takes a hash or hashref of the following arguments:
121
122=over
123
124=item * tradition - The Text::Tradition object to which the collation
125belongs. Required.
126
127=item * linear - Whether the collation should be linear; that is, whether
128transposed readings should be treated as two linked readings rather than one,
129and therefore whether the collation graph is acyclic. Defaults to true.
130
4e5a7b2c 131=item * baselabel - The default label for the path taken by a base text
132(if any). Defaults to 'base text'.
133
134=item * wit_list_separator - The string to join a list of witnesses for
135purposes of making labels in display graphs. Defaults to ', '.
136
137=item * ac_label - The extra label to tack onto a witness sigil when
138representing another layer of path for the given witness - that is, when
139a text has more than one possible reading due to scribal corrections or
140the like. Defaults to ' (a.c.)'.
141
142=back
143
144=head1 ACCESSORS
145
146=head2 tradition
147
148=head2 linear
149
4e5a7b2c 150=head2 wit_list_separator
151
152=head2 baselabel
153
154=head2 ac_label
155
156Simple accessors for collation attributes.
157
158=head2 start
159
160The meta-reading at the start of every witness path.
161
162=head2 end
163
164The meta-reading at the end of every witness path.
165
166=head2 readings
167
168Returns all Reading objects in the graph.
169
170=head2 reading( $id )
171
172Returns the Reading object corresponding to the given ID.
173
174=head2 add_reading( $reading_args )
175
176Adds a new reading object to the collation.
177See L<Text::Tradition::Collation::Reading> for the available arguments.
178
179=head2 del_reading( $object_or_id )
180
181Removes the given reading from the collation, implicitly removing its
182paths and relationships.
183
184=head2 merge_readings( $main, $second )
185
186Merges the $second reading into the $main one.
187The arguments may be either readings or reading IDs.
188
189=head2 has_reading( $id )
190
191Predicate to see whether a given reading ID is in the graph.
192
193=head2 reading_witnesses( $object_or_id )
194
195Returns a list of sigils whose witnesses contain the reading.
196
197=head2 paths
198
199Returns all reading paths within the document - that is, all edges in the
200collation graph. Each path is an arrayref of [ $source, $target ] reading IDs.
201
202=head2 add_path( $source, $target, $sigil )
203
204Links the given readings in the collation in sequence, under the given witness
205sigil. The readings may be specified by object or ID.
206
207=head2 del_path( $source, $target, $sigil )
208
209Links the given readings in the collation in sequence, under the given witness
210sigil. The readings may be specified by object or ID.
211
212=head2 has_path( $source, $target );
213
214Returns true if the two readings are linked in sequence in any witness.
215The readings may be specified by object or ID.
216
217=head2 relationships
218
219Returns all Relationship objects in the collation.
220
221=head2 add_relationship( $reading, $other_reading, $options )
222
223Adds a new relationship of the type given in $options between the two readings,
224which may be specified by object or ID. Returns a value of ( $status, @vectors)
225where $status is true on success, and @vectors is a list of relationship edges
226that were ultimately added.
227See L<Text::Tradition::Collation::Relationship> for the available options.
228
229=cut
dd3b58b0 230
d047cd52 231sub BUILD {
3a2ebbf4 232 my $self = shift;
22222af9 233 $self->_set_relations( Text::Tradition::Collation::RelationshipStore->new( 'collation' => $self ) );
3a2ebbf4 234 $self->_set_start( $self->add_reading( { 'collation' => $self, 'is_start' => 1 } ) );
235 $self->_set_end( $self->add_reading( { 'collation' => $self, 'is_end' => 1 } ) );
d047cd52 236}
784877d9 237
3a2ebbf4 238### Reading construct/destruct functions
239
240sub add_reading {
241 my( $self, $reading ) = @_;
242 unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
243 my %args = %$reading;
244 $reading = Text::Tradition::Collation::Reading->new(
245 'collation' => $self,
246 %args );
247 }
248 # First check to see if a reading with this ID exists.
249 if( $self->reading( $reading->id ) ) {
63778331 250 throw( "Collation already has a reading with id " . $reading->id );
3a2ebbf4 251 }
252 $self->_add_reading( $reading->id => $reading );
253 # Once the reading has been added, put it in both graphs.
254 $self->sequence->add_vertex( $reading->id );
22222af9 255 $self->relations->add_reading( $reading->id );
3a2ebbf4 256 return $reading;
eca16057 257};
258
3a2ebbf4 259around del_reading => sub {
260 my $orig = shift;
261 my $self = shift;
262 my $arg = shift;
263
264 if( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
265 $arg = $arg->id;
266 }
3a2ebbf4 267 # Remove the reading from the graphs.
268 $self->sequence->delete_vertex( $arg );
22222af9 269 $self->relations->delete_reading( $arg );
3a2ebbf4 270
271 # Carry on.
272 $self->$orig( $arg );
273};
7854e12e 274
3a2ebbf4 275# merge_readings( $main, $to_be_deleted );
7854e12e 276
3a2ebbf4 277sub merge_readings {
278 my $self = shift;
279
280 # We only need the IDs for adding paths to the graph, not the reading
281 # objects themselves.
49d4f2ac 282 my( $kept, $deleted, $combine_char ) = $self->_stringify_args( @_ );
3a2ebbf4 283
284 # The kept reading should inherit the paths and the relationships
285 # of the deleted reading.
286 foreach my $path ( $self->sequence->edges_at( $deleted ) ) {
287 my @vector = ( $kept );
288 push( @vector, $path->[1] ) if $path->[0] eq $deleted;
289 unshift( @vector, $path->[0] ) if $path->[1] eq $deleted;
49d4f2ac 290 next if $vector[0] eq $vector[1]; # Don't add a self loop
3a2ebbf4 291 my %wits = %{$self->sequence->get_edge_attributes( @$path )};
292 $self->sequence->add_edge( @vector );
293 my $fwits = $self->sequence->get_edge_attributes( @vector );
294 @wits{keys %$fwits} = values %$fwits;
295 $self->sequence->set_edge_attributes( @vector, \%wits );
296 }
22222af9 297 $self->relations->merge_readings( $kept, $deleted, $combine_char );
3a2ebbf4 298
299 # Do the deletion deed.
49d4f2ac 300 if( $combine_char ) {
301 my $kept_obj = $self->reading( $kept );
302 my $new_text = join( $combine_char, $kept_obj->text,
303 $self->reading( $deleted )->text );
304 $kept_obj->alter_text( $new_text );
305 }
3a2ebbf4 306 $self->del_reading( $deleted );
307}
7854e12e 308
3265b0ce 309
3a2ebbf4 310# Helper function for manipulating the graph.
311sub _stringify_args {
312 my( $self, $first, $second, $arg ) = @_;
313 $first = $first->id
314 if ref( $first ) eq 'Text::Tradition::Collation::Reading';
315 $second = $second->id
316 if ref( $second ) eq 'Text::Tradition::Collation::Reading';
317 return( $first, $second, $arg );
318}
df6d9812 319
4e5a7b2c 320# Helper function for manipulating the graph.
321sub _objectify_args {
322 my( $self, $first, $second, $arg ) = @_;
323 $first = $self->reading( $first )
324 unless ref( $first ) eq 'Text::Tradition::Collation::Reading';
325 $second = $self->reading( $second )
326 unless ref( $second ) eq 'Text::Tradition::Collation::Reading';
327 return( $first, $second, $arg );
328}
3a2ebbf4 329### Path logic
330
331sub add_path {
332 my $self = shift;
333
334 # We only need the IDs for adding paths to the graph, not the reading
335 # objects themselves.
336 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
337
338 # Connect the readings
339 $self->sequence->add_edge( $source, $target );
340 # Note the witness in question
341 $self->sequence->set_edge_attribute( $source, $target, $wit, 1 );
b15511bf 342};
343
3a2ebbf4 344sub del_path {
345 my $self = shift;
49d4f2ac 346 my @args;
347 if( ref( $_[0] ) eq 'ARRAY' ) {
348 my $e = shift @_;
349 @args = ( @$e, @_ );
350 } else {
351 @args = @_;
352 }
3a2ebbf4 353
354 # We only need the IDs for adding paths to the graph, not the reading
355 # objects themselves.
49d4f2ac 356 my( $source, $target, $wit ) = $self->_stringify_args( @args );
3a2ebbf4 357
358 if( $self->sequence->has_edge_attribute( $source, $target, $wit ) ) {
49d4f2ac 359 $self->sequence->delete_edge_attribute( $source, $target, $wit );
3a2ebbf4 360 }
361 unless( keys %{$self->sequence->get_edge_attributes( $source, $target )} ) {
362 $self->sequence->delete_edge( $source, $target );
363 }
784877d9 364}
365
3a2ebbf4 366
15d2d3df 367# Extra graph-alike utility
368sub has_path {
3a2ebbf4 369 my $self = shift;
370 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
371 return undef unless $self->sequence->has_edge( $source, $target );
372 return $self->sequence->has_edge_attribute( $source, $target, $wit );
b15511bf 373}
374
4e5a7b2c 375=head2 clear_witness( @sigil_list )
3a2ebbf4 376
4e5a7b2c 377Clear the given witnesses out of the collation entirely, removing references
378to them in paths, and removing readings that belong only to them. Should only
379be called via $tradition->del_witness.
3a2ebbf4 380
381=cut
382
4e5a7b2c 383sub clear_witness {
384 my( $self, @sigils ) = @_;
385
386 # Clear the witness(es) out of the paths
387 foreach my $e ( $self->paths ) {
388 foreach my $sig ( @sigils ) {
389 $self->del_path( $e, $sig );
390 }
391 }
392
393 # Clear out the newly unused readings
394 foreach my $r ( $self->readings ) {
395 unless( $self->reading_witnesses( $r ) ) {
396 $self->del_reading( $r );
397 }
398 }
399}
3a2ebbf4 400
401sub add_relationship {
402 my $self = shift;
22222af9 403 my( $source, $target, $opts ) = $self->_stringify_args( @_ );
63778331 404 my( @vectors ) = $self->relations->add_relationship( $source,
64ae6270 405 $self->reading( $source ), $target, $self->reading( $target ), $opts );
406 # Force a full rank recalculation every time. Yuck.
63778331 407 $self->calculate_ranks() if $self->end->has_rank;
408 return @vectors;
22222af9 409}
ef9d481f 410
22222af9 411=head2 reading_witnesses( $reading )
910a0a6d 412
22222af9 413Return a list of sigils corresponding to the witnesses in which the reading appears.
3265b0ce 414
22222af9 415=cut
1d310495 416
1d310495 417sub reading_witnesses {
418 my( $self, $reading ) = @_;
419 # We need only check either the incoming or the outgoing edges; I have
96dc90ec 420 # arbitrarily chosen "incoming". Thus, special-case the start node.
421 if( $reading eq $self->start ) {
422 return map { $_->sigil } $self->tradition->witnesses;
423 }
1d310495 424 my %all_witnesses;
425 foreach my $e ( $self->sequence->edges_to( $reading ) ) {
426 my $wits = $self->sequence->get_edge_attributes( @$e );
427 @all_witnesses{ keys %$wits } = 1;
428 }
429 return keys %all_witnesses;
910a0a6d 430}
431
4e5a7b2c 432=head1 OUTPUT METHODS
8e1394aa 433
4e5a7b2c 434=head2 as_svg
8e1394aa 435
0068967c 436Returns an SVG string that represents the graph, via as_dot and graphviz.
8e1394aa 437
438=cut
439
440sub as_svg {
3a2ebbf4 441 my( $self ) = @_;
442
8e1394aa 443 my @cmd = qw/dot -Tsvg/;
444 my( $svg, $err );
910a0a6d 445 my $dotfile = File::Temp->new();
d9e873d0 446 ## TODO REMOVE
eca16057 447 # $dotfile->unlink_on_destroy(0);
910a0a6d 448 binmode $dotfile, ':utf8';
449 print $dotfile $self->as_dot();
450 push( @cmd, $dotfile->filename );
451 run( \@cmd, ">", binary(), \$svg );
452 $svg = decode_utf8( $svg );
8e1394aa 453 return $svg;
454}
455
4e5a7b2c 456=head2 svg_subgraph( $from, $to )
b22576c6 457
458Returns an SVG string that represents the portion of the graph given by the
459specified range. The $from and $to variables refer to ranks within the graph.
460
461=cut
462
463sub svg_subgraph {
464 my( $self, $from, $to ) = @_;
465
466 my $dot = $self->as_dot( $from, $to );
467 unless( $dot ) {
63778331 468 throw( "Could not output a graph with range $from - $to" );
b22576c6 469 }
470
471 my @cmd = qw/dot -Tsvg/;
472 my( $svg, $err );
473 my $dotfile = File::Temp->new();
474 ## TODO REMOVE
475 # $dotfile->unlink_on_destroy(0);
476 binmode $dotfile, ':utf8';
477 print $dotfile $dot;
478 push( @cmd, $dotfile->filename );
479 run( \@cmd, ">", binary(), \$svg );
480 $svg = decode_utf8( $svg );
481 return $svg;
482}
483
484
4e5a7b2c 485=head2 as_dot( $from, $to )
df6d9812 486
487Returns a string that is the collation graph expressed in dot
4e5a7b2c 488(i.e. GraphViz) format. If $from or $to is passed, as_dot creates
489a subgraph rather than the entire graph.
df6d9812 490
491=cut
492
493sub as_dot {
b22576c6 494 my( $self, $startrank, $endrank ) = @_;
495
496 # Check the arguments
497 if( $startrank ) {
498 return if $endrank && $startrank > $endrank;
499 return if $startrank > $self->end->rank;
500 }
501 if( defined $endrank ) {
502 return if $endrank < 0;
f1b3b33a 503 $endrank = undef if $endrank == $self->end->rank;
b22576c6 504 }
505
df6d9812 506 # TODO consider making some of these things configurable
67da8d6c 507 my $graph_name = $self->tradition->name;
508 $graph_name =~ s/[^\w\s]//g;
509 $graph_name = join( '_', split( /\s+/, $graph_name ) );
f13b5582 510
511 my %graph_attrs = (
512 'rankdir' => 'LR',
513 'bgcolor' => 'none',
514 );
515 my %node_attrs = (
516 'fontsize' => 11,
517 'fillcolor' => 'white',
518 'style' => 'filled',
519 'shape' => 'ellipse'
520 );
521 my %edge_attrs = (
522 'arrowhead' => 'open',
523 'color' => '#000000',
524 'fontcolor' => '#000000',
525 );
526
67da8d6c 527 my $dot = sprintf( "digraph %s {\n", $graph_name );
f13b5582 528 $dot .= "\tgraph " . _dot_attr_string( \%graph_attrs ) . ";\n";
529 $dot .= "\tnode " . _dot_attr_string( \%node_attrs ) . ";\n";
df6d9812 530
b22576c6 531 # Output substitute start/end readings if necessary
532 if( $startrank ) {
533 $dot .= "\t\"#SUBSTART#\" [ label=\"...\" ];\n";
534 }
535 if( $endrank ) {
536 $dot .= "\t\"#SUBEND#\" [ label=\"...\" ];\n";
537 }
538 my %used; # Keep track of the readings that actually appear in the graph
df6d9812 539 foreach my $reading ( $self->readings ) {
b22576c6 540 # Only output readings within our rank range.
541 next if $startrank && $reading->rank < $startrank;
542 next if $endrank && $reading->rank > $endrank;
543 $used{$reading->id} = 1;
910a0a6d 544 # Need not output nodes without separate labels
3a2ebbf4 545 next if $reading->id eq $reading->text;
d4b75f44 546 my $rattrs;
30f0df34 547 my $label = $reading->text;
8f9cab7b 548 $label =~ s/\"/\\\"/g;
d4b75f44 549 $rattrs->{'label'} = $label;
550 # TODO make this an option?
551 # $rattrs->{'fillcolor'} = 'green' if $reading->is_common;
552 $dot .= sprintf( "\t\"%s\" %s;\n", $reading->id, _dot_attr_string( $rattrs ) );
df6d9812 553 }
3a2ebbf4 554
b22576c6 555 # Add the real edges
556 my @edges = $self->paths;
3bdec618 557 my( %substart, %subend );
b22576c6 558 foreach my $edge ( @edges ) {
559 # Do we need to output this edge?
508fd430 560 if( $used{$edge->[0]} && $used{$edge->[1]} ) {
f13b5582 561 my $label = $self->path_display_label( $self->path_witnesses( $edge ) );
562 my $variables = { %edge_attrs, 'label' => $label };
b22576c6 563 # Account for the rank gap if necessary
b0b4421a 564 if( $self->reading( $edge->[1] )->has_rank
565 && $self->reading( $edge->[0] )->has_rank
566 && $self->reading( $edge->[1] )->rank
567 - $self->reading( $edge->[0] )->rank > 1 ) {
568 $variables->{'minlen'} = $self->reading( $edge->[1] )->rank
b22576c6 569 - $self->reading( $edge->[0] )->rank;
b0b4421a 570 }
508fd430 571 # EXPERIMENTAL: make edge width reflect no. of witnesses
572 my $extrawidth = scalar( $self->path_witnesses( $edge ) ) * 0.2;
573 $variables->{'penwidth'} = $extrawidth + 0.8; # gives 1 for a single wit
574
f13b5582 575 my $varopts = _dot_attr_string( $variables );
576 $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
577 $edge->[0], $edge->[1], $varopts );
3bdec618 578 } elsif( $used{$edge->[0]} ) {
579 $subend{$edge->[0]} = 1;
580 } elsif( $used{$edge->[1]} ) {
581 $substart{$edge->[1]} = 1;
b22576c6 582 }
df6d9812 583 }
3bdec618 584 # Add substitute start and end edges if necessary
585 foreach my $node ( keys %substart ) {
586 my $witstr = $self->path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) );
f13b5582 587 my $variables = { %edge_attrs, 'label' => $witstr };
588 my $varopts = _dot_attr_string( $variables );
589 $dot .= "\t\"#SUBSTART#\" -> \"$node\" $varopts;";
3bdec618 590 }
591 foreach my $node ( keys %subend ) {
592 my $witstr = $self->path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) );
f13b5582 593 my $variables = { %edge_attrs, 'label' => $witstr };
594 my $varopts = _dot_attr_string( $variables );
595 $dot .= "\t\"$node\" -> \"#SUBEND#\" $varopts;";
3bdec618 596 }
597
df6d9812 598 $dot .= "}\n";
599 return $dot;
600}
601
f13b5582 602sub _dot_attr_string {
603 my( $hash ) = @_;
604 my @attrs;
605 foreach my $k ( sort keys %$hash ) {
606 my $v = $hash->{$k};
607 push( @attrs, $k.'="'.$v.'"' );
608 }
609 return( '[ ' . join( ', ', @attrs ) . ' ]' );
610}
611
3a2ebbf4 612sub path_witnesses {
613 my( $self, @edge ) = @_;
614 # If edge is an arrayref, cope.
615 if( @edge == 1 && ref( $edge[0] ) eq 'ARRAY' ) {
616 my $e = shift @edge;
617 @edge = @$e;
618 }
619 my @wits = keys %{$self->sequence->get_edge_attributes( @edge )};
508fd430 620 return @wits;
3a2ebbf4 621}
622
8f9cab7b 623sub path_display_label {
508fd430 624 my $self = shift;
625 my @wits = sort @_;
8f9cab7b 626 my $maj = scalar( $self->tradition->witnesses ) * 0.6;
627 if( scalar @wits > $maj ) {
f13b5582 628 # TODO break out a.c. wits
8f9cab7b 629 return 'majority';
630 } else {
631 return join( ', ', @wits );
632 }
633}
634
635
4e5a7b2c 636=head2 as_graphml
8e1394aa 637
4e5a7b2c 638Returns a GraphML representation of the collation. The GraphML will contain
639two graphs. The first expresses the attributes of the readings and the witness
640paths that link them; the second expresses the relationships that link the
641readings. This is the native transfer format for a tradition.
8e1394aa 642
56eefa04 643=begin testing
644
645use Text::Tradition;
646
647my $READINGS = 311;
648my $PATHS = 361;
649
650my $datafile = 't/data/florilegium_tei_ps.xml';
651my $tradition = Text::Tradition->new( 'input' => 'TEI',
652 'name' => 'test0',
653 'file' => $datafile,
654 'linear' => 1 );
655
656ok( $tradition, "Got a tradition object" );
657is( scalar $tradition->witnesses, 13, "Found all witnesses" );
658ok( $tradition->collation, "Tradition has a collation" );
659
660my $c = $tradition->collation;
661is( scalar $c->readings, $READINGS, "Collation has all readings" );
662is( scalar $c->paths, $PATHS, "Collation has all paths" );
663is( scalar $c->relationships, 0, "Collation has all relationships" );
664
665# Add a few relationships
666$c->add_relationship( 'w123', 'w125', { 'type' => 'collated' } );
667$c->add_relationship( 'w193', 'w196', { 'type' => 'collated' } );
668$c->add_relationship( 'w257', 'w262', { 'type' => 'transposition' } );
669
670# Now write it to GraphML and parse it again.
671
672my $graphml = $c->as_graphml;
673my $st = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml );
674is( scalar $st->collation->readings, $READINGS, "Reparsed collation has all readings" );
675is( scalar $st->collation->paths, $PATHS, "Reparsed collation has all paths" );
676is( scalar $st->collation->relationships, 3, "Reparsed collation has new relationships" );
677
678=end testing
679
8e1394aa 680=cut
681
682sub as_graphml {
3a2ebbf4 683 my( $self ) = @_;
8e1394aa 684
685 # Some namespaces
686 my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns';
687 my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance';
688 my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' .
910a0a6d 689 'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
8e1394aa 690
691 # Create the document and root node
692 my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
693 my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
694 $graphml->setDocumentElement( $root );
695 $root->setNamespace( $xsi_ns, 'xsi', 0 );
696 $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
697
e309421a 698 # Add the data keys for the graph
699 my %graph_data_keys;
700 my $gdi = 0;
1d310495 701 my @graph_attributes = qw/ version wit_list_separator baselabel linear ac_label /;
e309421a 702 foreach my $datum ( @graph_attributes ) {
703 $graph_data_keys{$datum} = 'dg'.$gdi++;
704 my $key = $root->addNewChild( $graphml_ns, 'key' );
705 $key->setAttribute( 'attr.name', $datum );
706 $key->setAttribute( 'attr.type', $key eq 'linear' ? 'boolean' : 'string' );
707 $key->setAttribute( 'for', 'graph' );
708 $key->setAttribute( 'id', $graph_data_keys{$datum} );
709 }
f6066bac 710
8e1394aa 711 # Add the data keys for nodes
ef9d481f 712 my %node_data_keys;
713 my $ndi = 0;
3a2ebbf4 714 my %node_data = (
715 id => 'string',
255875b8 716 text => 'string',
3a2ebbf4 717 rank => 'string',
718 is_start => 'boolean',
719 is_end => 'boolean',
720 is_lacuna => 'boolean',
721 );
722 foreach my $datum ( keys %node_data ) {
910a0a6d 723 $node_data_keys{$datum} = 'dn'.$ndi++;
724 my $key = $root->addNewChild( $graphml_ns, 'key' );
725 $key->setAttribute( 'attr.name', $datum );
3a2ebbf4 726 $key->setAttribute( 'attr.type', $node_data{$datum} );
910a0a6d 727 $key->setAttribute( 'for', 'node' );
728 $key->setAttribute( 'id', $node_data_keys{$datum} );
8e1394aa 729 }
730
df6d9812 731 # Add the data keys for edges, i.e. witnesses
ef9d481f 732 my $edi = 0;
733 my %edge_data_keys;
3a2ebbf4 734 my %edge_data = (
f523c7a8 735 class => 'string', # Class, deprecated soon
3a2ebbf4 736 witness => 'string', # ID/label for a path
737 relationship => 'string', # ID/label for a relationship
738 extra => 'boolean', # Path key
c84275ff 739 scope => 'string', # Relationship key
3a2ebbf4 740 non_correctable => 'boolean', # Relationship key
741 non_independent => 'boolean', # Relationship key
742 );
743 foreach my $datum ( keys %edge_data ) {
744 $edge_data_keys{$datum} = 'de'.$edi++;
910a0a6d 745 my $key = $root->addNewChild( $graphml_ns, 'key' );
3a2ebbf4 746 $key->setAttribute( 'attr.name', $datum );
747 $key->setAttribute( 'attr.type', $edge_data{$datum} );
910a0a6d 748 $key->setAttribute( 'for', 'edge' );
3a2ebbf4 749 $key->setAttribute( 'id', $edge_data_keys{$datum} );
8e1394aa 750 }
3a2ebbf4 751
22222af9 752 # Add the collation graph itself
2c669bca 753 my $sgraph = $root->addNewChild( $graphml_ns, 'graph' );
754 $sgraph->setAttribute( 'edgedefault', 'directed' );
755 $sgraph->setAttribute( 'id', $self->tradition->name );
756 $sgraph->setAttribute( 'parse.edgeids', 'canonical' );
757 $sgraph->setAttribute( 'parse.edges', scalar($self->paths) );
758 $sgraph->setAttribute( 'parse.nodeids', 'canonical' );
759 $sgraph->setAttribute( 'parse.nodes', scalar($self->readings) );
760 $sgraph->setAttribute( 'parse.order', 'nodesfirst' );
22222af9 761
e309421a 762 # Collation attribute data
763 foreach my $datum ( @graph_attributes ) {
2c669bca 764 my $value = $datum eq 'version' ? '3.0' : $self->$datum;
765 _add_graphml_data( $sgraph, $graph_data_keys{$datum}, $value );
e309421a 766 }
8e1394aa 767
768 my $node_ctr = 0;
769 my %node_hash;
22222af9 770 # Add our readings to the graph
3a2ebbf4 771 foreach my $n ( sort { $a->id cmp $b->id } $self->readings ) {
2c669bca 772 # Add to the main graph
773 my $node_el = $sgraph->addNewChild( $graphml_ns, 'node' );
910a0a6d 774 my $node_xmlid = 'n' . $node_ctr++;
3a2ebbf4 775 $node_hash{ $n->id } = $node_xmlid;
910a0a6d 776 $node_el->setAttribute( 'id', $node_xmlid );
255875b8 777 foreach my $d ( keys %node_data ) {
778 my $nval = $n->$d;
779 _add_graphml_data( $node_el, $node_data_keys{$d}, $nval )
780 if defined $nval;
781 }
b15511bf 782 }
783
2c669bca 784 # Add the path edges to the sequence graph
df6d9812 785 my $edge_ctr = 0;
3a2ebbf4 786 foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->sequence->edges() ) {
787 # We add an edge in the graphml for every witness in $e.
508fd430 788 foreach my $wit ( sort $self->path_witnesses( $e ) ) {
3a2ebbf4 789 my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
790 $node_hash{ $e->[0] },
791 $node_hash{ $e->[1] } );
2c669bca 792 my $edge_el = $sgraph->addNewChild( $graphml_ns, 'edge' );
3a2ebbf4 793 $edge_el->setAttribute( 'source', $from );
794 $edge_el->setAttribute( 'target', $to );
795 $edge_el->setAttribute( 'id', $id );
3a2ebbf4 796
797 # It's a witness path, so add the witness
798 my $base = $wit;
799 my $key = $edge_data_keys{'witness'};
800 # Is this an ante-corr witness?
801 my $aclabel = $self->ac_label;
802 if( $wit =~ /^(.*)\Q$aclabel\E$/ ) {
803 # Keep the base witness
804 $base = $1;
805 # ...and record that this is an 'extra' reading path
806 _add_graphml_data( $edge_el, $edge_data_keys{'extra'}, $aclabel );
807 }
808 _add_graphml_data( $edge_el, $edge_data_keys{'witness'}, $base );
f523c7a8 809 _add_graphml_data( $edge_el, $edge_data_keys{'class'}, 'path' );
3a2ebbf4 810 }
811 }
812
22222af9 813 # Add the relationship graph to the XML
2626f709 814 $self->relations->as_graphml( $graphml_ns, $root, \%node_hash,
815 $node_data_keys{'id'}, \%edge_data_keys );
8e1394aa 816
94c00c71 817 # Save and return the thing
818 my $result = decode_utf8( $graphml->toString(1) );
94c00c71 819 return $result;
df6d9812 820}
821
b15511bf 822sub _add_graphml_data {
823 my( $el, $key, $value ) = @_;
b15511bf 824 return unless defined $value;
c9bf3dbf 825 my $data_el = $el->addNewChild( $el->namespaceURI, 'data' );
b15511bf 826 $data_el->setAttribute( 'key', $key );
827 $data_el->appendText( $value );
8e1394aa 828}
829
4e5a7b2c 830=head2 as_csv
910a0a6d 831
832Returns a CSV alignment table representation of the collation graph, one
2c669bca 833row per witness (or witness uncorrected.)
910a0a6d 834
835=cut
836
837sub as_csv {
3a2ebbf4 838 my( $self ) = @_;
910a0a6d 839 my $table = $self->make_alignment_table;
840 my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } );
841 my @result;
2c669bca 842 # Make the header row
843 $csv->combine( map { $_->{'witness'} } @{$table->{'alignment'}} );
844 push( @result, decode_utf8( $csv->string ) );
845 # Make the rest of the rows
846 foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
566f4595 847 my @rowobjs = map { $_->{'tokens'}->[$idx] } @{$table->{'alignment'}};
2c669bca 848 my @row = map { $_ ? $_->{'t'} : $_ } @rowobjs;
849 $csv->combine( @row );
910a0a6d 850 push( @result, decode_utf8( $csv->string ) );
851 }
3a2ebbf4 852 return join( "\n", @result );
910a0a6d 853}
854
4e5a7b2c 855=head2 make_alignment_table( $use_refs, $include_witnesses )
2c669bca 856
566f4595 857Return a reference to an alignment table, in a slightly enhanced CollateX
858format which looks like this:
859
860 $table = { alignment => [ { witness => "SIGIL",
4e5a7b2c 861 tokens => [ { t => "TEXT" }, ... ] },
566f4595 862 { witness => "SIG2",
4e5a7b2c 863 tokens => [ { t => "TEXT" }, ... ] },
566f4595 864 ... ],
865 length => TEXTLEN };
866
867If $use_refs is set to 1, the reading object is returned in the table
868instead of READINGTEXT; if not, the text of the reading is returned.
4e5a7b2c 869
870If $include_witnesses is set to a hashref, only the witnesses whose sigil
566f4595 871keys have a true hash value will be included.
2c669bca 872
873=cut
9f3ba6f7 874
910a0a6d 875sub make_alignment_table {
08e0fb85 876 my( $self, $noderefs, $include ) = @_;
910a0a6d 877 unless( $self->linear ) {
63778331 878 throw( "Need a linear graph in order to make an alignment table" );
910a0a6d 879 }
2c669bca 880 my $table = { 'alignment' => [], 'length' => $self->end->rank - 1 };
3a2ebbf4 881 my @all_pos = ( 1 .. $self->end->rank - 1 );
68454b71 882 foreach my $wit ( sort { $a->sigil cmp $b->sigil } $self->tradition->witnesses ) {
2c669bca 883 if( $include ) {
566f4595 884 next unless $include->{$wit->sigil};
2c669bca 885 }
eca16057 886 # print STDERR "Making witness row(s) for " . $wit->sigil . "\n";
1f7aa795 887 my @wit_path = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
888 my @row = _make_witness_row( \@wit_path, \@all_pos, $noderefs );
2c669bca 889 push( @{$table->{'alignment'}},
890 { 'witness' => $wit->sigil, 'tokens' => \@row } );
1f7aa795 891 if( $wit->is_layered ) {
892 my @wit_ac_path = $self->reading_sequence( $self->start, $self->end,
861c3e27 893 $wit->sigil.$self->ac_label );
1f7aa795 894 my @ac_row = _make_witness_row( \@wit_ac_path, \@all_pos, $noderefs );
2c669bca 895 push( @{$table->{'alignment'}},
896 { 'witness' => $wit->sigil.$self->ac_label, 'tokens' => \@ac_row } );
910a0a6d 897 }
898 }
2c669bca 899 return $table;
910a0a6d 900}
901
902sub _make_witness_row {
0e476982 903 my( $path, $positions, $noderefs ) = @_;
910a0a6d 904 my %char_hash;
905 map { $char_hash{$_} = undef } @$positions;
2c669bca 906 my $debug = 0;
910a0a6d 907 foreach my $rdg ( @$path ) {
eca16057 908 my $rtext = $rdg->text;
909 $rtext = '#LACUNA#' if $rdg->is_lacuna;
2c669bca 910 print STDERR "rank " . $rdg->rank . "\n" if $debug;
3a2ebbf4 911 # print STDERR "No rank for " . $rdg->id . "\n" unless defined $rdg->rank;
2c669bca 912 $char_hash{$rdg->rank} = $noderefs ? { 't' => $rdg }
913 : { 't' => $rtext };
910a0a6d 914 }
915 my @row = map { $char_hash{$_} } @$positions;
eca16057 916 # Fill in lacuna markers for undef spots in the row
917 my $last_el = shift @row;
918 my @filled_row = ( $last_el );
919 foreach my $el ( @row ) {
0e476982 920 # If we are using node reference, make the lacuna node appear many times
921 # in the table. If not, use the lacuna tag.
922 if( $last_el && _el_is_lacuna( $last_el ) && !defined $el ) {
566f4595 923 $el = $noderefs ? $last_el : { 't' => '#LACUNA#' };
eca16057 924 }
925 push( @filled_row, $el );
926 $last_el = $el;
927 }
928 return @filled_row;
910a0a6d 929}
930
0e476982 931# Tiny utility function to say if a table element is a lacuna
932sub _el_is_lacuna {
933 my $el = shift;
2c669bca 934 return 1 if $el->{'t'} eq '#LACUNA#';
935 return 1 if ref( $el->{'t'} ) eq 'Text::Tradition::Collation::Reading'
936 && $el->{'t'}->is_lacuna;
0e476982 937 return 0;
938}
939
910a0a6d 940# Helper to turn the witnesses along columns rather than rows. Assumes
941# equal-sized rows.
942sub _turn_table {
943 my( $table ) = @_;
944 my $result = [];
945 return $result unless scalar @$table;
946 my $nrows = scalar @{$table->[0]};
947 foreach my $idx ( 0 .. $nrows - 1 ) {
948 foreach my $wit ( 0 .. $#{$table} ) {
949 $result->[$idx]->[$wit] = $table->[$wit]->[$idx];
950 }
951 }
952 return $result;
953}
954
4e5a7b2c 955=head1 NAVIGATION METHODS
910a0a6d 956
4e5a7b2c 957=head2 reading_sequence( $first, $last, $sigil, $backup )
e2902068 958
959Returns the ordered list of readings, starting with $first and ending
4e5a7b2c 960with $last, for the witness given in $sigil. If a $backup sigil is
961specified (e.g. when walking a layered witness), it will be used wherever
962no $sigil path exists. If there is a base text reading, that will be
963used wherever no path exists for $sigil or $backup.
e2902068 964
965=cut
966
910a0a6d 967# TODO Think about returning some lazy-eval iterator.
b0b4421a 968# TODO Get rid of backup; we should know from what witness is whether we need it.
910a0a6d 969
e2902068 970sub reading_sequence {
861c3e27 971 my( $self, $start, $end, $witness ) = @_;
e2902068 972
930ff666 973 $witness = $self->baselabel unless $witness;
e2902068 974 my @readings = ( $start );
975 my %seen;
976 my $n = $start;
3a2ebbf4 977 while( $n && $n->id ne $end->id ) {
978 if( exists( $seen{$n->id} ) ) {
63778331 979 throw( "Detected loop for $witness at " . $n->id );
910a0a6d 980 }
3a2ebbf4 981 $seen{$n->id} = 1;
910a0a6d 982
861c3e27 983 my $next = $self->next_reading( $n, $witness );
44771cf2 984 unless( $next ) {
63778331 985 throw( "Did not find any path for $witness from reading " . $n->id );
44771cf2 986 }
910a0a6d 987 push( @readings, $next );
988 $n = $next;
e2902068 989 }
990 # Check that the last reading is our end reading.
991 my $last = $readings[$#readings];
63778331 992 throw( "Last reading found from " . $start->text .
993 " for witness $witness is not the end!" ) # TODO do we get this far?
3a2ebbf4 994 unless $last->id eq $end->id;
e2902068 995
996 return @readings;
997}
998
4e5a7b2c 999=head2 next_reading( $reading, $sigil );
8e1394aa 1000
4a8828f0 1001Returns the reading that follows the given reading along the given witness
930ff666 1002path.
8e1394aa 1003
1004=cut
1005
4a8828f0 1006sub next_reading {
e2902068 1007 # Return the successor via the corresponding path.
8e1394aa 1008 my $self = shift;
3a2ebbf4 1009 my $answer = $self->_find_linked_reading( 'next', @_ );
2c669bca 1010 return undef unless $answer;
3a2ebbf4 1011 return $self->reading( $answer );
8e1394aa 1012}
1013
4e5a7b2c 1014=head2 prior_reading( $reading, $sigil )
8e1394aa 1015
4a8828f0 1016Returns the reading that precedes the given reading along the given witness
930ff666 1017path.
8e1394aa 1018
1019=cut
1020
4a8828f0 1021sub prior_reading {
e2902068 1022 # Return the predecessor via the corresponding path.
8e1394aa 1023 my $self = shift;
3a2ebbf4 1024 my $answer = $self->_find_linked_reading( 'prior', @_ );
1025 return $self->reading( $answer );
8e1394aa 1026}
1027
4a8828f0 1028sub _find_linked_reading {
861c3e27 1029 my( $self, $direction, $node, $path ) = @_;
1030
1031 # Get a backup if we are dealing with a layered witness
1032 my $alt_path;
1033 my $aclabel = $self->ac_label;
1034 if( $path && $path =~ /^(.*)\Q$aclabel\E$/ ) {
1035 $alt_path = $1;
1036 }
1037
e2902068 1038 my @linked_paths = $direction eq 'next'
3a2ebbf4 1039 ? $self->sequence->edges_from( $node )
1040 : $self->sequence->edges_to( $node );
e2902068 1041 return undef unless scalar( @linked_paths );
8e1394aa 1042
e2902068 1043 # We have to find the linked path that contains all of the
1044 # witnesses supplied in $path.
1045 my( @path_wits, @alt_path_wits );
4e5a7b2c 1046 @path_wits = sort( $self->_witnesses_of_label( $path ) ) if $path;
1047 @alt_path_wits = sort( $self->_witnesses_of_label( $alt_path ) ) if $alt_path;
e2902068 1048 my $base_le;
1049 my $alt_le;
1050 foreach my $le ( @linked_paths ) {
3a2ebbf4 1051 if( $self->sequence->has_edge_attribute( @$le, $self->baselabel ) ) {
910a0a6d 1052 $base_le = $le;
910a0a6d 1053 }
508fd430 1054 my @le_wits = sort $self->path_witnesses( $le );
3a2ebbf4 1055 if( _is_within( \@path_wits, \@le_wits ) ) {
1056 # This is the right path.
1057 return $direction eq 'next' ? $le->[1] : $le->[0];
1058 } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) {
1059 $alt_le = $le;
1060 }
8e1394aa 1061 }
e2902068 1062 # Got this far? Return the alternate path if it exists.
3a2ebbf4 1063 return $direction eq 'next' ? $alt_le->[1] : $alt_le->[0]
910a0a6d 1064 if $alt_le;
e2902068 1065
1066 # Got this far? Return the base path if it exists.
3a2ebbf4 1067 return $direction eq 'next' ? $base_le->[1] : $base_le->[0]
910a0a6d 1068 if $base_le;
e2902068 1069
1070 # Got this far? We have no appropriate path.
2c669bca 1071 warn "Could not find $direction node from " . $node->id
910a0a6d 1072 . " along path $path";
8e1394aa 1073 return undef;
1074}
1075
4a8828f0 1076# Some set logic.
1077sub _is_within {
1078 my( $set1, $set2 ) = @_;
7854e12e 1079 my $ret = @$set1; # will be 0, i.e. false, if set1 is empty
4a8828f0 1080 foreach my $el ( @$set1 ) {
910a0a6d 1081 $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
4a8828f0 1082 }
1083 return $ret;
1084}
1085
4e5a7b2c 1086# Return the string that joins together a list of witnesses for
1087# display on a single path.
1088sub _witnesses_of_label {
1089 my( $self, $label ) = @_;
1090 my $regex = $self->wit_list_separator;
1091 my @answer = split( /\Q$regex\E/, $label );
1092 return @answer;
b0b4421a 1093}
1094
d4b75f44 1095=head2 common_readings
1096
1097Returns the list of common readings in the graph (i.e. those readings that are
1098shared by all non-lacunose witnesses.)
1099
1100=cut
1101
1102sub common_readings {
1103 my $self = shift;
1104 my @common = grep { $_->is_common } $self->readings;
1105 return @common;
1106}
1107
b0b4421a 1108=head2 path_text( $sigil, $mainsigil [, $start, $end ] )
1109
1110Returns the text of a witness (plus its backup, if we are using a layer)
1111as stored in the collation. The text is returned as a string, where the
1112individual readings are joined with spaces and the meta-readings (e.g.
1113lacunae) are omitted. Optional specification of $start and $end allows
1114the generation of a subset of the witness text.
4e5a7b2c 1115
b0b4421a 1116=cut
1117
1118sub path_text {
861c3e27 1119 my( $self, $wit, $start, $end ) = @_;
b0b4421a 1120 $start = $self->start unless $start;
1121 $end = $self->end unless $end;
861c3e27 1122 my @path = grep { !$_->is_meta } $self->reading_sequence( $start, $end, $wit );
b0b4421a 1123 return join( ' ', map { $_->text } @path );
1124}
4e5a7b2c 1125
1126=head1 INITIALIZATION METHODS
1127
1128These are mostly for use by parsers.
1129
1130=head2 make_witness_path( $witness )
1131
1132Link the array of readings contained in $witness->path (and in
1133$witness->uncorrected_path if it exists) into collation paths.
1134Clear out the arrays when finished.
de51424a 1135
4e5a7b2c 1136=head2 make_witness_paths
1137
1138Call make_witness_path for all witnesses in the tradition.
1139
1140=cut
930ff666 1141
7e450e44 1142# For use when a collation is constructed from a base text and an apparatus.
1143# We have the sequences of readings and just need to add path edges.
1f7aa795 1144# When we are done, clear out the witness path attributes, as they are no
1145# longer needed.
1146# TODO Find a way to replace the witness path attributes with encapsulated functions?
e2902068 1147
6a222840 1148sub make_witness_paths {
1149 my( $self ) = @_;
910a0a6d 1150 foreach my $wit ( $self->tradition->witnesses ) {
0068967c 1151 # print STDERR "Making path for " . $wit->sigil . "\n";
910a0a6d 1152 $self->make_witness_path( $wit );
7854e12e 1153 }
7854e12e 1154}
1155
6a222840 1156sub make_witness_path {
7854e12e 1157 my( $self, $wit ) = @_;
1158 my @chain = @{$wit->path};
15d2d3df 1159 my $sig = $wit->sigil;
7854e12e 1160 foreach my $idx ( 0 .. $#chain-1 ) {
910a0a6d 1161 $self->add_path( $chain[$idx], $chain[$idx+1], $sig );
7854e12e 1162 }
1f7aa795 1163 if( $wit->is_layered ) {
d9e873d0 1164 @chain = @{$wit->uncorrected_path};
1165 foreach my $idx( 0 .. $#chain-1 ) {
1166 my $source = $chain[$idx];
1167 my $target = $chain[$idx+1];
1168 $self->add_path( $source, $target, $sig.$self->ac_label )
1169 unless $self->has_path( $source, $target, $sig );
1170 }
15d2d3df 1171 }
1f7aa795 1172 $wit->clear_path;
1173 $wit->clear_uncorrected_path;
e2902068 1174}
1175
4e5a7b2c 1176=head2 calculate_ranks
1177
1178Calculate the reading ranks (that is, their aligned positions relative
1179to each other) for the graph. This can only be called on linear collations.
1180
1181=cut
1182
910a0a6d 1183sub calculate_ranks {
1184 my $self = shift;
1185 # Walk a version of the graph where every node linked by a relationship
1186 # edge is fundamentally the same node, and do a topological ranking on
1187 # the nodes in this graph.
c9bf3dbf 1188 my $topo_graph = Graph->new();
910a0a6d 1189 my %rel_containers;
1190 my $rel_ctr = 0;
1191 # Add the nodes
1192 foreach my $r ( $self->readings ) {
3a2ebbf4 1193 next if exists $rel_containers{$r->id};
910a0a6d 1194 my @rels = $r->related_readings( 'colocated' );
1195 if( @rels ) {
1196 # Make a relationship container.
1197 push( @rels, $r );
c9bf3dbf 1198 my $rn = 'rel_container_' . $rel_ctr++;
1199 $topo_graph->add_vertex( $rn );
910a0a6d 1200 foreach( @rels ) {
3a2ebbf4 1201 $rel_containers{$_->id} = $rn;
910a0a6d 1202 }
1203 } else {
1204 # Add a new node to mirror the old node.
3a2ebbf4 1205 $rel_containers{$r->id} = $r->id;
1206 $topo_graph->add_vertex( $r->id );
910a0a6d 1207 }
4a8828f0 1208 }
3a1f2523 1209
3a2ebbf4 1210 # Add the edges.
910a0a6d 1211 foreach my $r ( $self->readings ) {
3a2ebbf4 1212 foreach my $n ( $self->sequence->successors( $r->id ) ) {
1213 my( $tfrom, $tto ) = ( $rel_containers{$r->id},
1214 $rel_containers{$n} );
4e5a7b2c 1215 # $DB::single = 1 unless $tfrom && $tto;
3a2ebbf4 1216 $topo_graph->add_edge( $tfrom, $tto );
910a0a6d 1217 }
1218 }
1219
1220 # Now do the rankings, starting with the start node.
3a2ebbf4 1221 my $topo_start = $rel_containers{$self->start->id};
c9bf3dbf 1222 my $node_ranks = { $topo_start => 0 };
910a0a6d 1223 my @curr_origin = ( $topo_start );
1224 # A little iterative function.
1225 while( @curr_origin ) {
c9bf3dbf 1226 @curr_origin = _assign_rank( $topo_graph, $node_ranks, @curr_origin );
910a0a6d 1227 }
1228 # Transfer our rankings from the topological graph to the real one.
1229 foreach my $r ( $self->readings ) {
3a2ebbf4 1230 if( defined $node_ranks->{$rel_containers{$r->id}} ) {
1231 $r->rank( $node_ranks->{$rel_containers{$r->id}} );
67da8d6c 1232 } else {
63778331 1233 # Die. Find the last rank we calculated.
1234 my @all_defined = sort { $node_ranks->{$rel_containers{$a->id}}
1235 <=> $node_ranks->{$rel_containers{$b->id}} }
1236 $self->readings;
1237 my $last = pop @all_defined;
1238 throw( "Ranks not calculated after $last - do you have a cycle in the graph?" );
67da8d6c 1239 }
de51424a 1240 }
8e1394aa 1241}
3a1f2523 1242
910a0a6d 1243sub _assign_rank {
c9bf3dbf 1244 my( $graph, $node_ranks, @current_nodes ) = @_;
910a0a6d 1245 # Look at each of the children of @current_nodes. If all the child's
1246 # parents have a rank, assign it the highest rank + 1 and add it to
c9bf3dbf 1247 # @next_nodes. Otherwise skip it; we will return when the highest-ranked
1248 # parent gets a rank.
910a0a6d 1249 my @next_nodes;
1250 foreach my $c ( @current_nodes ) {
c9bf3dbf 1251 warn "Current reading $c has no rank!"
1252 unless exists $node_ranks->{$c};
1253 # print STDERR "Looking at child of node $c, rank "
1254 # . $node_ranks->{$c} . "\n";
1255 foreach my $child ( $graph->successors( $c ) ) {
1256 next if exists $node_ranks->{$child};
910a0a6d 1257 my $highest_rank = -1;
1258 my $skip = 0;
c9bf3dbf 1259 foreach my $parent ( $graph->predecessors( $child ) ) {
1260 if( exists $node_ranks->{$parent} ) {
1261 $highest_rank = $node_ranks->{$parent}
1262 if $highest_rank <= $node_ranks->{$parent};
910a0a6d 1263 } else {
1264 $skip = 1;
1265 last;
1266 }
1267 }
1268 next if $skip;
c9bf3dbf 1269 my $c_rank = $highest_rank + 1;
1270 # print STDERR "Assigning rank $c_rank to node $child \n";
1271 $node_ranks->{$child} = $c_rank;
910a0a6d 1272 push( @next_nodes, $child );
1273 }
1274 }
1275 return @next_nodes;
4cdd82f1 1276}
910a0a6d 1277
4e5a7b2c 1278=head2 flatten_ranks
1279
1280A convenience method for parsing collation data. Searches the graph for readings
1281with the same text at the same rank, and merges any that are found.
1282
1283=cut
1284
0e476982 1285sub flatten_ranks {
1286 my $self = shift;
1287 my %unique_rank_rdg;
1288 foreach my $rdg ( $self->readings ) {
1289 next unless $rdg->has_rank;
1290 my $key = $rdg->rank . "||" . $rdg->text;
1291 if( exists $unique_rank_rdg{$key} ) {
1292 # Combine!
56eefa04 1293 # print STDERR "Combining readings at same rank: $key\n";
0e476982 1294 $self->merge_readings( $unique_rank_rdg{$key}, $rdg );
1295 } else {
1296 $unique_rank_rdg{$key} = $rdg;
1297 }
1298 }
1299}
1300
d4b75f44 1301=head2 calculate_common_readings
1302
1303Goes through the graph identifying the readings that appear in every witness
1304(apart from those with lacunae at that spot.) Marks them as common and returns
1305the list.
1306
1307=begin testing
1308
1309use Text::Tradition;
1310
1311my $cxfile = 't/data/Collatex-16.xml';
1312my $t = Text::Tradition->new(
1313 'name' => 'inline',
1314 'input' => 'CollateX',
1315 'file' => $cxfile,
1316 );
1317my $c = $t->collation;
1318
1319my @common = $c->calculate_common_readings();
1320is( scalar @common, 8, "Found correct number of common readings" );
1321my @marked = sort $c->common_readings();
1322is( scalar @common, 8, "All common readings got marked as such" );
1323my @expected = qw/ n1 n12 n16 n19 n20 n5 n6 n7 /;
1324is_deeply( \@marked, \@expected, "Found correct list of common readings" );
1325
1326=end testing
1327
1328=cut
1329
1330sub calculate_common_readings {
1331 my $self = shift;
1332 my @common;
1333 my $table = $self->make_alignment_table( 1 );
1334 foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
1335 my @row = map { $_->{'tokens'}->[$idx]->{'t'} } @{$table->{'alignment'}};
1336 my %hash;
1337 foreach my $r ( @row ) {
1338 if( $r ) {
1339 $hash{$r->id} = $r unless $r->is_meta;
1340 } else {
1341 $hash{'UNDEF'} = $r;
1342 }
1343 }
1344 if( keys %hash == 1 && !exists $hash{'UNDEF'} ) {
1345 my( $r ) = values %hash;
1346 $r->is_common( 1 );
1347 push( @common, $r );
1348 }
1349 }
1350 return @common;
1351}
1352
861c3e27 1353=head2 text_from_paths
1354
1355Calculate the text array for all witnesses from the path, for later consistency
1356checking. Only to be used if there is no non-graph-based way to know the
1357original texts.
1358
1359=cut
1360
1361sub text_from_paths {
1362 my $self = shift;
1363 foreach my $wit ( $self->tradition->witnesses ) {
1364 my @text = split( /\s+/,
1365 $self->reading_sequence( $self->start, $self->end, $wit->sigil ) );
1366 $wit->text( \@text );
1367 if( $wit->is_layered ) {
1368 my @uctext = split( /\s+/,
1369 $self->reading_sequence( $self->start, $self->end,
1370 $wit->sigil.$self->ac_label ) );
1371 $wit->text( \@uctext );
1372 }
1373 }
1374}
0e476982 1375
4e5a7b2c 1376=head1 UTILITY FUNCTIONS
1377
1378=head2 common_predecessor( $reading_a, $reading_b )
8e1394aa 1379
4e5a7b2c 1380Find the last reading that occurs in sequence before both the given readings.
1381
1382=head2 common_successor( $reading_a, $reading_b )
1383
1384Find the first reading that occurs in sequence after both the given readings.
1385
22222af9 1386=begin testing
1387
1388use Text::Tradition;
1389
1390my $cxfile = 't/data/Collatex-16.xml';
1391my $t = Text::Tradition->new(
1392 'name' => 'inline',
1393 'input' => 'CollateX',
1394 'file' => $cxfile,
1395 );
1396my $c = $t->collation;
1397
4e5a7b2c 1398is( $c->common_predecessor( 'n9', 'n23' )->id,
22222af9 1399 'n20', "Found correct common predecessor" );
4e5a7b2c 1400is( $c->common_successor( 'n9', 'n23' )->id,
22222af9 1401 '#END#', "Found correct common successor" );
1402
4e5a7b2c 1403is( $c->common_predecessor( 'n19', 'n17' )->id,
22222af9 1404 'n16', "Found correct common predecessor for readings on same path" );
4e5a7b2c 1405is( $c->common_successor( 'n21', 'n26' )->id,
22222af9 1406 '#END#', "Found correct common successor for readings on same path" );
1407
1408=end testing
1409
1410=cut
1411
1412## Return the closest reading that is a predecessor of both the given readings.
1413sub common_predecessor {
1414 my $self = shift;
4e5a7b2c 1415 my( $r1, $r2 ) = $self->_objectify_args( @_ );
1416 return $self->common_in_path( $r1, $r2, 'predecessors' );
22222af9 1417}
1418
1419sub common_successor {
1420 my $self = shift;
4e5a7b2c 1421 my( $r1, $r2 ) = $self->_objectify_args( @_ );
1422 return $self->common_in_path( $r1, $r2, 'successors' );
22222af9 1423}
1424
1425sub common_in_path {
1426 my( $self, $r1, $r2, $dir ) = @_;
1427 my $iter = $r1->rank > $r2->rank ? $r1->rank : $r2->rank;
1428 $iter = $self->end->rank - $iter if $dir eq 'successors';
1429 my @candidates;
1430 my @last_checked = ( $r1, $r2 );
1431 my %all_seen;
1432 while( !@candidates ) {
1433 my @new_lc;
1434 foreach my $lc ( @last_checked ) {
1435 foreach my $p ( $lc->$dir ) {
1436 if( $all_seen{$p->id} ) {
1437 push( @candidates, $p );
1438 } else {
1439 $all_seen{$p->id} = 1;
1440 push( @new_lc, $p );
1441 }
1442 }
1443 }
1444 @last_checked = @new_lc;
1445 }
1446 my @answer = sort { $a->rank <=> $b->rank } @candidates;
1447 return $dir eq 'predecessors' ? pop( @answer ) : shift ( @answer );
1448}
1449
63778331 1450sub throw {
1451 Text::Tradition::Error->throw(
1452 'ident' => 'Collation error',
1453 'message' => $_[0],
1454 );
1455}
1456
dd3b58b0 1457no Moose;
1458__PACKAGE__->meta->make_immutable;
e867486f 1459
1460=head1 BUGS / TODO
1461
1462=over
1463
4e5a7b2c 1464=item * Get rid of $backup in reading_sequence
e867486f 1465
1466=back