add JSON serialization method
[scpubgit/stemmatology.git] / lib / Text / Tradition / Collation.pm
CommitLineData
dd3b58b0 1package Text::Tradition::Collation;
d047cd52 2
910a0a6d 3use Encode qw( decode_utf8 );
4use File::Temp;
c9bf3dbf 5use Graph;
8e1394aa 6use IPC::Run qw( run binary );
910a0a6d 7use Text::CSV_XS;
b15511bf 8use Text::Tradition::Collation::Reading;
22222af9 9use Text::Tradition::Collation::RelationshipStore;
63778331 10use Text::Tradition::Error;
df6d9812 11use XML::LibXML;
a344ffcf 12use XML::LibXML::XPathContext;
dd3b58b0 13use Moose;
14
3a2ebbf4 15has 'sequence' => (
d047cd52 16 is => 'ro',
3a2ebbf4 17 isa => 'Graph',
18 default => sub { Graph->new() },
d047cd52 19 handles => {
3a2ebbf4 20 paths => 'edges',
d047cd52 21 },
d047cd52 22 );
3a2ebbf4 23
24has 'relations' => (
25 is => 'ro',
22222af9 26 isa => 'Text::Tradition::Collation::RelationshipStore',
27 handles => {
28 relationships => 'relationships',
29 related_readings => 'related_readings',
ee801e17 30 del_relationship => 'del_relationship',
22222af9 31 },
32 writer => '_set_relations',
3a2ebbf4 33 );
dd3b58b0 34
3a2ebbf4 35has 'tradition' => (
36 is => 'ro',
d047cd52 37 isa => 'Text::Tradition',
8d9a1cd8 38 weak_ref => 1,
d047cd52 39 );
dd3b58b0 40
3a2ebbf4 41has 'readings' => (
42 isa => 'HashRef[Text::Tradition::Collation::Reading]',
43 traits => ['Hash'],
44 handles => {
45 reading => 'get',
46 _add_reading => 'set',
47 del_reading => 'delete',
48 has_reading => 'exists',
49 readings => 'values',
50 },
51 default => sub { {} },
52 );
910a0a6d 53
4a8828f0 54has 'wit_list_separator' => (
7854e12e 55 is => 'rw',
56 isa => 'Str',
57 default => ', ',
58 );
59
60has 'baselabel' => (
61 is => 'rw',
62 isa => 'Str',
63 default => 'base text',
64 );
4a8828f0 65
15d2d3df 66has 'linear' => (
67 is => 'rw',
68 isa => 'Bool',
69 default => 1,
70 );
c84275ff 71
ef9d481f 72has 'ac_label' => (
73 is => 'rw',
74 isa => 'Str',
75 default => ' (a.c.)',
76 );
3a2ebbf4 77
78has 'start' => (
79 is => 'ro',
80 isa => 'Text::Tradition::Collation::Reading',
81 writer => '_set_start',
82 weak_ref => 1,
83 );
84
85has 'end' => (
86 is => 'ro',
87 isa => 'Text::Tradition::Collation::Reading',
88 writer => '_set_end',
89 weak_ref => 1,
90 );
b365fbae 91
92has 'cached_svg' => (
93 is => 'rw',
94 isa => 'Str',
95 predicate => 'has_cached_svg',
96 clearer => 'wipe_svg',
97 );
1dd07bda 98
99has 'cached_table' => (
100 is => 'rw',
101 isa => 'HashRef',
102 predicate => 'has_cached_table',
103 clearer => 'wipe_table',
104 );
1f563ac3 105
4e5a7b2c 106=head1 NAME
107
108Text::Tradition::Collation - a software model for a text collation
109
110=head1 SYNOPSIS
111
112 use Text::Tradition;
113 my $t = Text::Tradition->new(
114 'name' => 'this is a text',
115 'input' => 'TEI',
116 'file' => '/path/to/tei_parallel_seg_file.xml' );
117
118 my $c = $t->collation;
119 my @readings = $c->readings;
120 my @paths = $c->paths;
121 my @relationships = $c->relationships;
122
123 my $svg_variant_graph = $t->collation->as_svg();
124
125=head1 DESCRIPTION
126
127Text::Tradition is a library for representation and analysis of collated
128texts, particularly medieval ones. The Collation is the central feature of
129a Tradition, where the text, its sequence of readings, and its relationships
130between readings are actually kept.
131
132=head1 CONSTRUCTOR
133
134=head2 new
135
136The constructor. Takes a hash or hashref of the following arguments:
137
138=over
139
140=item * tradition - The Text::Tradition object to which the collation
141belongs. Required.
142
143=item * linear - Whether the collation should be linear; that is, whether
144transposed readings should be treated as two linked readings rather than one,
145and therefore whether the collation graph is acyclic. Defaults to true.
146
4e5a7b2c 147=item * baselabel - The default label for the path taken by a base text
148(if any). Defaults to 'base text'.
149
150=item * wit_list_separator - The string to join a list of witnesses for
151purposes of making labels in display graphs. Defaults to ', '.
152
153=item * ac_label - The extra label to tack onto a witness sigil when
154representing another layer of path for the given witness - that is, when
155a text has more than one possible reading due to scribal corrections or
156the like. Defaults to ' (a.c.)'.
157
158=back
159
160=head1 ACCESSORS
161
162=head2 tradition
163
164=head2 linear
165
4e5a7b2c 166=head2 wit_list_separator
167
168=head2 baselabel
169
170=head2 ac_label
171
172Simple accessors for collation attributes.
173
174=head2 start
175
176The meta-reading at the start of every witness path.
177
178=head2 end
179
180The meta-reading at the end of every witness path.
181
182=head2 readings
183
184Returns all Reading objects in the graph.
185
186=head2 reading( $id )
187
188Returns the Reading object corresponding to the given ID.
189
190=head2 add_reading( $reading_args )
191
192Adds a new reading object to the collation.
193See L<Text::Tradition::Collation::Reading> for the available arguments.
194
195=head2 del_reading( $object_or_id )
196
197Removes the given reading from the collation, implicitly removing its
198paths and relationships.
199
200=head2 merge_readings( $main, $second )
201
202Merges the $second reading into the $main one.
203The arguments may be either readings or reading IDs.
204
205=head2 has_reading( $id )
206
207Predicate to see whether a given reading ID is in the graph.
208
209=head2 reading_witnesses( $object_or_id )
210
211Returns a list of sigils whose witnesses contain the reading.
212
213=head2 paths
214
215Returns all reading paths within the document - that is, all edges in the
216collation graph. Each path is an arrayref of [ $source, $target ] reading IDs.
217
218=head2 add_path( $source, $target, $sigil )
219
220Links the given readings in the collation in sequence, under the given witness
221sigil. The readings may be specified by object or ID.
222
223=head2 del_path( $source, $target, $sigil )
224
225Links the given readings in the collation in sequence, under the given witness
226sigil. The readings may be specified by object or ID.
227
228=head2 has_path( $source, $target );
229
230Returns true if the two readings are linked in sequence in any witness.
231The readings may be specified by object or ID.
232
233=head2 relationships
234
235Returns all Relationship objects in the collation.
236
237=head2 add_relationship( $reading, $other_reading, $options )
238
239Adds a new relationship of the type given in $options between the two readings,
240which may be specified by object or ID. Returns a value of ( $status, @vectors)
241where $status is true on success, and @vectors is a list of relationship edges
242that were ultimately added.
243See L<Text::Tradition::Collation::Relationship> for the available options.
244
245=cut
dd3b58b0 246
d047cd52 247sub BUILD {
3a2ebbf4 248 my $self = shift;
22222af9 249 $self->_set_relations( Text::Tradition::Collation::RelationshipStore->new( 'collation' => $self ) );
3a2ebbf4 250 $self->_set_start( $self->add_reading( { 'collation' => $self, 'is_start' => 1 } ) );
251 $self->_set_end( $self->add_reading( { 'collation' => $self, 'is_end' => 1 } ) );
d047cd52 252}
784877d9 253
3a2ebbf4 254### Reading construct/destruct functions
255
1dd07bda 256sub _clear_cache {
257 my $self = shift;
258 $self->wipe_svg if $self->has_cached_svg;
259 $self->wipe_table if $self->has_cached_table;
260}
261
3a2ebbf4 262sub add_reading {
263 my( $self, $reading ) = @_;
264 unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
265 my %args = %$reading;
266 $reading = Text::Tradition::Collation::Reading->new(
267 'collation' => $self,
268 %args );
269 }
270 # First check to see if a reading with this ID exists.
271 if( $self->reading( $reading->id ) ) {
63778331 272 throw( "Collation already has a reading with id " . $reading->id );
3a2ebbf4 273 }
1dd07bda 274 $self->_clear_cache;
3a2ebbf4 275 $self->_add_reading( $reading->id => $reading );
276 # Once the reading has been added, put it in both graphs.
277 $self->sequence->add_vertex( $reading->id );
22222af9 278 $self->relations->add_reading( $reading->id );
3a2ebbf4 279 return $reading;
eca16057 280};
281
3a2ebbf4 282around del_reading => sub {
283 my $orig = shift;
284 my $self = shift;
285 my $arg = shift;
286
287 if( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
288 $arg = $arg->id;
289 }
3a2ebbf4 290 # Remove the reading from the graphs.
1dd07bda 291 $self->_clear_cache;
3a2ebbf4 292 $self->sequence->delete_vertex( $arg );
22222af9 293 $self->relations->delete_reading( $arg );
3a2ebbf4 294
295 # Carry on.
296 $self->$orig( $arg );
297};
7854e12e 298
3a2ebbf4 299# merge_readings( $main, $to_be_deleted );
7854e12e 300
3a2ebbf4 301sub merge_readings {
302 my $self = shift;
303
304 # We only need the IDs for adding paths to the graph, not the reading
305 # objects themselves.
49d4f2ac 306 my( $kept, $deleted, $combine_char ) = $self->_stringify_args( @_ );
1dd07bda 307 $self->_clear_cache;
3a2ebbf4 308
309 # The kept reading should inherit the paths and the relationships
310 # of the deleted reading.
311 foreach my $path ( $self->sequence->edges_at( $deleted ) ) {
312 my @vector = ( $kept );
313 push( @vector, $path->[1] ) if $path->[0] eq $deleted;
314 unshift( @vector, $path->[0] ) if $path->[1] eq $deleted;
49d4f2ac 315 next if $vector[0] eq $vector[1]; # Don't add a self loop
3a2ebbf4 316 my %wits = %{$self->sequence->get_edge_attributes( @$path )};
317 $self->sequence->add_edge( @vector );
318 my $fwits = $self->sequence->get_edge_attributes( @vector );
319 @wits{keys %$fwits} = values %$fwits;
320 $self->sequence->set_edge_attributes( @vector, \%wits );
321 }
22222af9 322 $self->relations->merge_readings( $kept, $deleted, $combine_char );
3a2ebbf4 323
324 # Do the deletion deed.
49d4f2ac 325 if( $combine_char ) {
326 my $kept_obj = $self->reading( $kept );
327 my $new_text = join( $combine_char, $kept_obj->text,
328 $self->reading( $deleted )->text );
329 $kept_obj->alter_text( $new_text );
330 }
3a2ebbf4 331 $self->del_reading( $deleted );
332}
7854e12e 333
3265b0ce 334
3a2ebbf4 335# Helper function for manipulating the graph.
336sub _stringify_args {
337 my( $self, $first, $second, $arg ) = @_;
338 $first = $first->id
339 if ref( $first ) eq 'Text::Tradition::Collation::Reading';
340 $second = $second->id
341 if ref( $second ) eq 'Text::Tradition::Collation::Reading';
342 return( $first, $second, $arg );
343}
df6d9812 344
4e5a7b2c 345# Helper function for manipulating the graph.
346sub _objectify_args {
347 my( $self, $first, $second, $arg ) = @_;
348 $first = $self->reading( $first )
349 unless ref( $first ) eq 'Text::Tradition::Collation::Reading';
350 $second = $self->reading( $second )
351 unless ref( $second ) eq 'Text::Tradition::Collation::Reading';
352 return( $first, $second, $arg );
353}
3a2ebbf4 354### Path logic
355
356sub add_path {
357 my $self = shift;
358
359 # We only need the IDs for adding paths to the graph, not the reading
360 # objects themselves.
361 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
362
1dd07bda 363 $self->_clear_cache;
3a2ebbf4 364 # Connect the readings
365 $self->sequence->add_edge( $source, $target );
366 # Note the witness in question
367 $self->sequence->set_edge_attribute( $source, $target, $wit, 1 );
b15511bf 368};
369
3a2ebbf4 370sub del_path {
371 my $self = shift;
49d4f2ac 372 my @args;
373 if( ref( $_[0] ) eq 'ARRAY' ) {
374 my $e = shift @_;
375 @args = ( @$e, @_ );
376 } else {
377 @args = @_;
378 }
3a2ebbf4 379
380 # We only need the IDs for adding paths to the graph, not the reading
381 # objects themselves.
49d4f2ac 382 my( $source, $target, $wit ) = $self->_stringify_args( @args );
3a2ebbf4 383
1dd07bda 384 $self->_clear_cache;
3a2ebbf4 385 if( $self->sequence->has_edge_attribute( $source, $target, $wit ) ) {
49d4f2ac 386 $self->sequence->delete_edge_attribute( $source, $target, $wit );
3a2ebbf4 387 }
388 unless( keys %{$self->sequence->get_edge_attributes( $source, $target )} ) {
389 $self->sequence->delete_edge( $source, $target );
390 }
784877d9 391}
392
3a2ebbf4 393
15d2d3df 394# Extra graph-alike utility
395sub has_path {
3a2ebbf4 396 my $self = shift;
397 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
398 return undef unless $self->sequence->has_edge( $source, $target );
399 return $self->sequence->has_edge_attribute( $source, $target, $wit );
b15511bf 400}
401
4e5a7b2c 402=head2 clear_witness( @sigil_list )
3a2ebbf4 403
4e5a7b2c 404Clear the given witnesses out of the collation entirely, removing references
405to them in paths, and removing readings that belong only to them. Should only
406be called via $tradition->del_witness.
3a2ebbf4 407
408=cut
409
4e5a7b2c 410sub clear_witness {
411 my( $self, @sigils ) = @_;
412
1dd07bda 413 $self->_clear_cache;
4e5a7b2c 414 # Clear the witness(es) out of the paths
415 foreach my $e ( $self->paths ) {
416 foreach my $sig ( @sigils ) {
417 $self->del_path( $e, $sig );
418 }
419 }
420
421 # Clear out the newly unused readings
422 foreach my $r ( $self->readings ) {
423 unless( $self->reading_witnesses( $r ) ) {
424 $self->del_reading( $r );
425 }
426 }
427}
3a2ebbf4 428
429sub add_relationship {
430 my $self = shift;
22222af9 431 my( $source, $target, $opts ) = $self->_stringify_args( @_ );
63778331 432 my( @vectors ) = $self->relations->add_relationship( $source,
64ae6270 433 $self->reading( $source ), $target, $self->reading( $target ), $opts );
434 # Force a full rank recalculation every time. Yuck.
63778331 435 $self->calculate_ranks() if $self->end->has_rank;
1dd07bda 436 $self->_clear_cache;
63778331 437 return @vectors;
22222af9 438}
ef9d481f 439
22222af9 440=head2 reading_witnesses( $reading )
910a0a6d 441
22222af9 442Return a list of sigils corresponding to the witnesses in which the reading appears.
3265b0ce 443
22222af9 444=cut
1d310495 445
1d310495 446sub reading_witnesses {
447 my( $self, $reading ) = @_;
448 # We need only check either the incoming or the outgoing edges; I have
96dc90ec 449 # arbitrarily chosen "incoming". Thus, special-case the start node.
450 if( $reading eq $self->start ) {
451 return map { $_->sigil } $self->tradition->witnesses;
452 }
1d310495 453 my %all_witnesses;
454 foreach my $e ( $self->sequence->edges_to( $reading ) ) {
455 my $wits = $self->sequence->get_edge_attributes( @$e );
456 @all_witnesses{ keys %$wits } = 1;
457 }
458 return keys %all_witnesses;
910a0a6d 459}
460
4e5a7b2c 461=head1 OUTPUT METHODS
8e1394aa 462
0ecb975c 463=head2 as_svg( \%options )
8e1394aa 464
0068967c 465Returns an SVG string that represents the graph, via as_dot and graphviz.
0ecb975c 466See as_dot for a list of options.
8e1394aa 467
468=cut
469
470sub as_svg {
0ecb975c 471 my( $self, $opts ) = @_;
e247aad1 472 my $want_subgraph = exists $opts->{'from'} || exists $opts->{'to'};
473 if( !$self->has_cached_svg || $opts->{'recalc'} || $want_subgraph ) {
b365fbae 474 my @cmd = qw/dot -Tsvg/;
475 my( $svg, $err );
476 my $dotfile = File::Temp->new();
477 ## USE FOR DEBUGGING
478 # $dotfile->unlink_on_destroy(0);
479 binmode $dotfile, ':utf8';
480 print $dotfile $self->as_dot( $opts );
481 push( @cmd, $dotfile->filename );
482 run( \@cmd, ">", binary(), \$svg );
e247aad1 483 $svg = decode_utf8( $svg );
484 $self->cached_svg( $svg ) unless $want_subgraph;
485 return $svg;
486 } else {
487 return $self->cached_svg;
b365fbae 488 }
8e1394aa 489}
490
b22576c6 491
0ecb975c 492=head2 as_dot( \%options )
b22576c6 493
0ecb975c 494Returns a string that is the collation graph expressed in dot
495(i.e. GraphViz) format. Options include:
b22576c6 496
0ecb975c 497=over 4
b22576c6 498
0ecb975c 499=item * from
b22576c6 500
0ecb975c 501=item * to
df6d9812 502
0ecb975c 503=item * color_common
504
505=back
df6d9812 506
507=cut
508
509sub as_dot {
0ecb975c 510 my( $self, $opts ) = @_;
511 my $startrank = $opts->{'from'} if $opts;
512 my $endrank = $opts->{'to'} if $opts;
513 my $color_common = $opts->{'color_common'} if $opts;
b365fbae 514 my $STRAIGHTENHACK = !$startrank && !$endrank && $self->end->rank
515 && $self->end->rank > 100;
516
b22576c6 517 # Check the arguments
518 if( $startrank ) {
519 return if $endrank && $startrank > $endrank;
520 return if $startrank > $self->end->rank;
521 }
522 if( defined $endrank ) {
523 return if $endrank < 0;
f1b3b33a 524 $endrank = undef if $endrank == $self->end->rank;
b22576c6 525 }
526
67da8d6c 527 my $graph_name = $self->tradition->name;
528 $graph_name =~ s/[^\w\s]//g;
529 $graph_name = join( '_', split( /\s+/, $graph_name ) );
f13b5582 530
531 my %graph_attrs = (
532 'rankdir' => 'LR',
533 'bgcolor' => 'none',
534 );
535 my %node_attrs = (
b8990398 536 'fontsize' => 14,
f13b5582 537 'fillcolor' => 'white',
538 'style' => 'filled',
539 'shape' => 'ellipse'
540 );
541 my %edge_attrs = (
542 'arrowhead' => 'open',
543 'color' => '#000000',
544 'fontcolor' => '#000000',
545 );
546
67da8d6c 547 my $dot = sprintf( "digraph %s {\n", $graph_name );
f13b5582 548 $dot .= "\tgraph " . _dot_attr_string( \%graph_attrs ) . ";\n";
549 $dot .= "\tnode " . _dot_attr_string( \%node_attrs ) . ";\n";
df6d9812 550
b22576c6 551 # Output substitute start/end readings if necessary
552 if( $startrank ) {
553 $dot .= "\t\"#SUBSTART#\" [ label=\"...\" ];\n";
554 }
555 if( $endrank ) {
556 $dot .= "\t\"#SUBEND#\" [ label=\"...\" ];\n";
557 }
b365fbae 558 if( $STRAIGHTENHACK ) {
559 ## HACK part 1
560 $dot .= "\tsubgraph { rank=same \"#START#\" \"#SILENT#\" }\n";
561 $dot .= "\t\"#SILENT#\" [ shape=diamond,color=white,penwidth=0,label=\"\" ];"
562 }
b22576c6 563 my %used; # Keep track of the readings that actually appear in the graph
30ddc24c 564 # Sort the readings by rank if we have ranks; this speeds layout.
565 my @all_readings = $self->end->has_rank
566 ? sort { $a->rank <=> $b->rank } $self->readings
567 : $self->readings;
4633f9e4 568 # TODO Refrain from outputting lacuna nodes - just grey out the edges.
30ddc24c 569 foreach my $reading ( @all_readings ) {
b22576c6 570 # Only output readings within our rank range.
571 next if $startrank && $reading->rank < $startrank;
572 next if $endrank && $reading->rank > $endrank;
573 $used{$reading->id} = 1;
910a0a6d 574 # Need not output nodes without separate labels
3a2ebbf4 575 next if $reading->id eq $reading->text;
d4b75f44 576 my $rattrs;
30f0df34 577 my $label = $reading->text;
8f9cab7b 578 $label =~ s/\"/\\\"/g;
d4b75f44 579 $rattrs->{'label'} = $label;
0ecb975c 580 $rattrs->{'fillcolor'} = '#b3f36d' if $reading->is_common && $color_common;
d4b75f44 581 $dot .= sprintf( "\t\"%s\" %s;\n", $reading->id, _dot_attr_string( $rattrs ) );
df6d9812 582 }
3a2ebbf4 583
30ddc24c 584 # Add the real edges. Need to weight one edge per rank jump, in a
585 # continuous line.
b365fbae 586 # my $weighted = $self->_add_edge_weights;
b22576c6 587 my @edges = $self->paths;
3bdec618 588 my( %substart, %subend );
b22576c6 589 foreach my $edge ( @edges ) {
590 # Do we need to output this edge?
508fd430 591 if( $used{$edge->[0]} && $used{$edge->[1]} ) {
027d819c 592 my $label = $self->_path_display_label( $self->path_witnesses( $edge ) );
f13b5582 593 my $variables = { %edge_attrs, 'label' => $label };
30ddc24c 594
b22576c6 595 # Account for the rank gap if necessary
30ddc24c 596 my $rank0 = $self->reading( $edge->[0] )->rank
597 if $self->reading( $edge->[0] )->has_rank;
598 my $rank1 = $self->reading( $edge->[1] )->rank
599 if $self->reading( $edge->[1] )->has_rank;
600 if( defined $rank0 && defined $rank1 && $rank1 - $rank0 > 1 ) {
601 $variables->{'minlen'} = $rank1 - $rank0;
602 }
603
604 # Add the calculated edge weights
b365fbae 605 # if( exists $weighted->{$edge->[0]}
e247aad1 606 # && $weighted->{$edge->[0]} eq $edge->[1] ) {
607 # # $variables->{'color'} = 'red';
608 # $variables->{'weight'} = 3.0;
609 # }
30ddc24c 610
508fd430 611 # EXPERIMENTAL: make edge width reflect no. of witnesses
612 my $extrawidth = scalar( $self->path_witnesses( $edge ) ) * 0.2;
613 $variables->{'penwidth'} = $extrawidth + 0.8; # gives 1 for a single wit
614
f13b5582 615 my $varopts = _dot_attr_string( $variables );
616 $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
617 $edge->[0], $edge->[1], $varopts );
3bdec618 618 } elsif( $used{$edge->[0]} ) {
619 $subend{$edge->[0]} = 1;
620 } elsif( $used{$edge->[1]} ) {
621 $substart{$edge->[1]} = 1;
b22576c6 622 }
df6d9812 623 }
3bdec618 624 # Add substitute start and end edges if necessary
625 foreach my $node ( keys %substart ) {
027d819c 626 my $witstr = $self->_path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) );
f13b5582 627 my $variables = { %edge_attrs, 'label' => $witstr };
628 my $varopts = _dot_attr_string( $variables );
629 $dot .= "\t\"#SUBSTART#\" -> \"$node\" $varopts;";
3bdec618 630 }
631 foreach my $node ( keys %subend ) {
027d819c 632 my $witstr = $self->_path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) );
f13b5582 633 my $variables = { %edge_attrs, 'label' => $witstr };
634 my $varopts = _dot_attr_string( $variables );
635 $dot .= "\t\"$node\" -> \"#SUBEND#\" $varopts;";
3bdec618 636 }
b365fbae 637 # HACK part 2
638 if( $STRAIGHTENHACK ) {
639 $dot .= "\t\"#END#\" -> \"#SILENT#\" [ color=white,penwidth=0 ];\n";
640 }
30ddc24c 641
df6d9812 642 $dot .= "}\n";
643 return $dot;
644}
645
f13b5582 646sub _dot_attr_string {
647 my( $hash ) = @_;
648 my @attrs;
649 foreach my $k ( sort keys %$hash ) {
650 my $v = $hash->{$k};
651 push( @attrs, $k.'="'.$v.'"' );
652 }
653 return( '[ ' . join( ', ', @attrs ) . ' ]' );
654}
655
30ddc24c 656sub _add_edge_weights {
657 my $self = shift;
658 # Walk the graph from START to END, choosing the successor node with
659 # the largest number of witness paths each time.
660 my $weighted = {};
661 my $curr = $self->start->id;
008fc8a6 662 my $ranked = $self->end->has_rank;
30ddc24c 663 while( $curr ne $self->end->id ) {
008fc8a6 664 my $rank = $ranked ? $self->reading( $curr )->rank : 0;
30ddc24c 665 my @succ = sort { $self->path_witnesses( $curr, $a )
666 <=> $self->path_witnesses( $curr, $b ) }
667 $self->sequence->successors( $curr );
668 my $next = pop @succ;
008fc8a6 669 my $nextrank = $ranked ? $self->reading( $next )->rank : 0;
30ddc24c 670 # Try to avoid lacunae in the weighted path.
008fc8a6 671 while( @succ &&
672 ( $self->reading( $next )->is_lacuna ||
673 $nextrank - $rank > 1 ) ){
30ddc24c 674 $next = pop @succ;
675 }
676 $weighted->{$curr} = $next;
677 $curr = $next;
678 }
679 return $weighted;
680}
681
027d819c 682=head2 path_witnesses( $edge )
683
684Returns the list of sigils whose witnesses are associated with the given edge.
685The edge can be passed as either an array or an arrayref of ( $source, $target ).
686
687=cut
688
3a2ebbf4 689sub path_witnesses {
690 my( $self, @edge ) = @_;
691 # If edge is an arrayref, cope.
692 if( @edge == 1 && ref( $edge[0] ) eq 'ARRAY' ) {
693 my $e = shift @edge;
694 @edge = @$e;
695 }
696 my @wits = keys %{$self->sequence->get_edge_attributes( @edge )};
508fd430 697 return @wits;
3a2ebbf4 698}
699
027d819c 700sub _path_display_label {
508fd430 701 my $self = shift;
702 my @wits = sort @_;
8f9cab7b 703 my $maj = scalar( $self->tradition->witnesses ) * 0.6;
704 if( scalar @wits > $maj ) {
f13b5582 705 # TODO break out a.c. wits
8f9cab7b 706 return 'majority';
707 } else {
708 return join( ', ', @wits );
709 }
710}
1dd07bda 711
712=head2 witnesses_at_rank
713
714Returns a list of witnesses that are not lacunose, for a given rank.
715
716=cut
717
718sub witnesses_at_rank {
719 my( $self, $rank ) = @_;
720}
8f9cab7b 721
4e5a7b2c 722=head2 as_graphml
8e1394aa 723
4e5a7b2c 724Returns a GraphML representation of the collation. The GraphML will contain
725two graphs. The first expresses the attributes of the readings and the witness
726paths that link them; the second expresses the relationships that link the
727readings. This is the native transfer format for a tradition.
8e1394aa 728
56eefa04 729=begin testing
730
731use Text::Tradition;
732
733my $READINGS = 311;
734my $PATHS = 361;
735
736my $datafile = 't/data/florilegium_tei_ps.xml';
737my $tradition = Text::Tradition->new( 'input' => 'TEI',
738 'name' => 'test0',
739 'file' => $datafile,
740 'linear' => 1 );
741
742ok( $tradition, "Got a tradition object" );
743is( scalar $tradition->witnesses, 13, "Found all witnesses" );
744ok( $tradition->collation, "Tradition has a collation" );
745
746my $c = $tradition->collation;
747is( scalar $c->readings, $READINGS, "Collation has all readings" );
748is( scalar $c->paths, $PATHS, "Collation has all paths" );
749is( scalar $c->relationships, 0, "Collation has all relationships" );
750
751# Add a few relationships
752$c->add_relationship( 'w123', 'w125', { 'type' => 'collated' } );
753$c->add_relationship( 'w193', 'w196', { 'type' => 'collated' } );
754$c->add_relationship( 'w257', 'w262', { 'type' => 'transposition' } );
755
756# Now write it to GraphML and parse it again.
757
758my $graphml = $c->as_graphml;
759my $st = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml );
760is( scalar $st->collation->readings, $READINGS, "Reparsed collation has all readings" );
761is( scalar $st->collation->paths, $PATHS, "Reparsed collation has all paths" );
762is( scalar $st->collation->relationships, 3, "Reparsed collation has new relationships" );
763
764=end testing
765
8e1394aa 766=cut
767
768sub as_graphml {
3a2ebbf4 769 my( $self ) = @_;
8e1394aa 770
771 # Some namespaces
772 my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns';
773 my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance';
774 my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' .
910a0a6d 775 'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
8e1394aa 776
777 # Create the document and root node
778 my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
779 my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
780 $graphml->setDocumentElement( $root );
781 $root->setNamespace( $xsi_ns, 'xsi', 0 );
782 $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
783
e309421a 784 # Add the data keys for the graph
785 my %graph_data_keys;
786 my $gdi = 0;
1d310495 787 my @graph_attributes = qw/ version wit_list_separator baselabel linear ac_label /;
e309421a 788 foreach my $datum ( @graph_attributes ) {
789 $graph_data_keys{$datum} = 'dg'.$gdi++;
790 my $key = $root->addNewChild( $graphml_ns, 'key' );
791 $key->setAttribute( 'attr.name', $datum );
792 $key->setAttribute( 'attr.type', $key eq 'linear' ? 'boolean' : 'string' );
793 $key->setAttribute( 'for', 'graph' );
794 $key->setAttribute( 'id', $graph_data_keys{$datum} );
795 }
f6066bac 796
8e1394aa 797 # Add the data keys for nodes
ef9d481f 798 my %node_data_keys;
799 my $ndi = 0;
3a2ebbf4 800 my %node_data = (
801 id => 'string',
255875b8 802 text => 'string',
3a2ebbf4 803 rank => 'string',
804 is_start => 'boolean',
805 is_end => 'boolean',
806 is_lacuna => 'boolean',
807 );
808 foreach my $datum ( keys %node_data ) {
910a0a6d 809 $node_data_keys{$datum} = 'dn'.$ndi++;
810 my $key = $root->addNewChild( $graphml_ns, 'key' );
811 $key->setAttribute( 'attr.name', $datum );
3a2ebbf4 812 $key->setAttribute( 'attr.type', $node_data{$datum} );
910a0a6d 813 $key->setAttribute( 'for', 'node' );
814 $key->setAttribute( 'id', $node_data_keys{$datum} );
8e1394aa 815 }
816
df6d9812 817 # Add the data keys for edges, i.e. witnesses
ef9d481f 818 my $edi = 0;
819 my %edge_data_keys;
3a2ebbf4 820 my %edge_data = (
f523c7a8 821 class => 'string', # Class, deprecated soon
3a2ebbf4 822 witness => 'string', # ID/label for a path
823 relationship => 'string', # ID/label for a relationship
824 extra => 'boolean', # Path key
c84275ff 825 scope => 'string', # Relationship key
3a2ebbf4 826 non_correctable => 'boolean', # Relationship key
827 non_independent => 'boolean', # Relationship key
828 );
829 foreach my $datum ( keys %edge_data ) {
830 $edge_data_keys{$datum} = 'de'.$edi++;
910a0a6d 831 my $key = $root->addNewChild( $graphml_ns, 'key' );
3a2ebbf4 832 $key->setAttribute( 'attr.name', $datum );
833 $key->setAttribute( 'attr.type', $edge_data{$datum} );
910a0a6d 834 $key->setAttribute( 'for', 'edge' );
3a2ebbf4 835 $key->setAttribute( 'id', $edge_data_keys{$datum} );
8e1394aa 836 }
3a2ebbf4 837
22222af9 838 # Add the collation graph itself
2c669bca 839 my $sgraph = $root->addNewChild( $graphml_ns, 'graph' );
840 $sgraph->setAttribute( 'edgedefault', 'directed' );
841 $sgraph->setAttribute( 'id', $self->tradition->name );
842 $sgraph->setAttribute( 'parse.edgeids', 'canonical' );
843 $sgraph->setAttribute( 'parse.edges', scalar($self->paths) );
844 $sgraph->setAttribute( 'parse.nodeids', 'canonical' );
845 $sgraph->setAttribute( 'parse.nodes', scalar($self->readings) );
846 $sgraph->setAttribute( 'parse.order', 'nodesfirst' );
22222af9 847
e309421a 848 # Collation attribute data
849 foreach my $datum ( @graph_attributes ) {
2c669bca 850 my $value = $datum eq 'version' ? '3.0' : $self->$datum;
851 _add_graphml_data( $sgraph, $graph_data_keys{$datum}, $value );
e309421a 852 }
8e1394aa 853
854 my $node_ctr = 0;
855 my %node_hash;
22222af9 856 # Add our readings to the graph
3a2ebbf4 857 foreach my $n ( sort { $a->id cmp $b->id } $self->readings ) {
2c669bca 858 # Add to the main graph
859 my $node_el = $sgraph->addNewChild( $graphml_ns, 'node' );
910a0a6d 860 my $node_xmlid = 'n' . $node_ctr++;
3a2ebbf4 861 $node_hash{ $n->id } = $node_xmlid;
910a0a6d 862 $node_el->setAttribute( 'id', $node_xmlid );
255875b8 863 foreach my $d ( keys %node_data ) {
864 my $nval = $n->$d;
865 _add_graphml_data( $node_el, $node_data_keys{$d}, $nval )
866 if defined $nval;
867 }
b15511bf 868 }
869
2c669bca 870 # Add the path edges to the sequence graph
df6d9812 871 my $edge_ctr = 0;
3a2ebbf4 872 foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->sequence->edges() ) {
873 # We add an edge in the graphml for every witness in $e.
508fd430 874 foreach my $wit ( sort $self->path_witnesses( $e ) ) {
3a2ebbf4 875 my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
876 $node_hash{ $e->[0] },
877 $node_hash{ $e->[1] } );
2c669bca 878 my $edge_el = $sgraph->addNewChild( $graphml_ns, 'edge' );
3a2ebbf4 879 $edge_el->setAttribute( 'source', $from );
880 $edge_el->setAttribute( 'target', $to );
881 $edge_el->setAttribute( 'id', $id );
3a2ebbf4 882
883 # It's a witness path, so add the witness
884 my $base = $wit;
885 my $key = $edge_data_keys{'witness'};
886 # Is this an ante-corr witness?
887 my $aclabel = $self->ac_label;
888 if( $wit =~ /^(.*)\Q$aclabel\E$/ ) {
889 # Keep the base witness
890 $base = $1;
891 # ...and record that this is an 'extra' reading path
892 _add_graphml_data( $edge_el, $edge_data_keys{'extra'}, $aclabel );
893 }
894 _add_graphml_data( $edge_el, $edge_data_keys{'witness'}, $base );
f523c7a8 895 _add_graphml_data( $edge_el, $edge_data_keys{'class'}, 'path' );
3a2ebbf4 896 }
897 }
898
22222af9 899 # Add the relationship graph to the XML
027d819c 900 $self->relations->_as_graphml( $graphml_ns, $root, \%node_hash,
2626f709 901 $node_data_keys{'id'}, \%edge_data_keys );
8e1394aa 902
94c00c71 903 # Save and return the thing
904 my $result = decode_utf8( $graphml->toString(1) );
94c00c71 905 return $result;
df6d9812 906}
907
b15511bf 908sub _add_graphml_data {
909 my( $el, $key, $value ) = @_;
b15511bf 910 return unless defined $value;
c9bf3dbf 911 my $data_el = $el->addNewChild( $el->namespaceURI, 'data' );
b15511bf 912 $data_el->setAttribute( 'key', $key );
913 $data_el->appendText( $value );
8e1394aa 914}
915
4e5a7b2c 916=head2 as_csv
910a0a6d 917
918Returns a CSV alignment table representation of the collation graph, one
2c669bca 919row per witness (or witness uncorrected.)
910a0a6d 920
921=cut
922
923sub as_csv {
3a2ebbf4 924 my( $self ) = @_;
1dd07bda 925 my $table = $self->alignment_table;
910a0a6d 926 my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } );
927 my @result;
2c669bca 928 # Make the header row
929 $csv->combine( map { $_->{'witness'} } @{$table->{'alignment'}} );
930 push( @result, decode_utf8( $csv->string ) );
931 # Make the rest of the rows
932 foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
566f4595 933 my @rowobjs = map { $_->{'tokens'}->[$idx] } @{$table->{'alignment'}};
1dd07bda 934 my @row = map { $_ ? $_->{'t'}->text : $_ } @rowobjs;
2c669bca 935 $csv->combine( @row );
910a0a6d 936 push( @result, decode_utf8( $csv->string ) );
937 }
3a2ebbf4 938 return join( "\n", @result );
910a0a6d 939}
940
1dd07bda 941=head2 alignment_table( $use_refs, $include_witnesses )
2c669bca 942
566f4595 943Return a reference to an alignment table, in a slightly enhanced CollateX
944format which looks like this:
945
946 $table = { alignment => [ { witness => "SIGIL",
4e5a7b2c 947 tokens => [ { t => "TEXT" }, ... ] },
566f4595 948 { witness => "SIG2",
4e5a7b2c 949 tokens => [ { t => "TEXT" }, ... ] },
566f4595 950 ... ],
951 length => TEXTLEN };
952
953If $use_refs is set to 1, the reading object is returned in the table
954instead of READINGTEXT; if not, the text of the reading is returned.
4e5a7b2c 955
956If $include_witnesses is set to a hashref, only the witnesses whose sigil
566f4595 957keys have a true hash value will be included.
2c669bca 958
959=cut
9f3ba6f7 960
1dd07bda 961sub alignment_table {
962 my( $self ) = @_;
963 my $include; # see if we can ditch this
964 return $self->cached_table if $self->has_cached_table;
965
0ecb975c 966 # Make sure we can do this
967 throw( "Need a linear graph in order to make an alignment table" )
968 unless $self->linear;
969 $self->calculate_ranks unless $self->end->has_rank;
970
2c669bca 971 my $table = { 'alignment' => [], 'length' => $self->end->rank - 1 };
3a2ebbf4 972 my @all_pos = ( 1 .. $self->end->rank - 1 );
68454b71 973 foreach my $wit ( sort { $a->sigil cmp $b->sigil } $self->tradition->witnesses ) {
2c669bca 974 if( $include ) {
566f4595 975 next unless $include->{$wit->sigil};
2c669bca 976 }
eca16057 977 # print STDERR "Making witness row(s) for " . $wit->sigil . "\n";
1f7aa795 978 my @wit_path = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
1dd07bda 979 my @row = _make_witness_row( \@wit_path, \@all_pos );
2c669bca 980 push( @{$table->{'alignment'}},
981 { 'witness' => $wit->sigil, 'tokens' => \@row } );
1f7aa795 982 if( $wit->is_layered ) {
983 my @wit_ac_path = $self->reading_sequence( $self->start, $self->end,
861c3e27 984 $wit->sigil.$self->ac_label );
1dd07bda 985 my @ac_row = _make_witness_row( \@wit_ac_path, \@all_pos );
2c669bca 986 push( @{$table->{'alignment'}},
987 { 'witness' => $wit->sigil.$self->ac_label, 'tokens' => \@ac_row } );
910a0a6d 988 }
989 }
1dd07bda 990 $self->cached_table( $table );
991 return $table;
910a0a6d 992}
993
994sub _make_witness_row {
1dd07bda 995 my( $path, $positions ) = @_;
910a0a6d 996 my %char_hash;
997 map { $char_hash{$_} = undef } @$positions;
2c669bca 998 my $debug = 0;
910a0a6d 999 foreach my $rdg ( @$path ) {
eca16057 1000 my $rtext = $rdg->text;
1001 $rtext = '#LACUNA#' if $rdg->is_lacuna;
2c669bca 1002 print STDERR "rank " . $rdg->rank . "\n" if $debug;
3a2ebbf4 1003 # print STDERR "No rank for " . $rdg->id . "\n" unless defined $rdg->rank;
1dd07bda 1004 $char_hash{$rdg->rank} = { 't' => $rdg };
910a0a6d 1005 }
1006 my @row = map { $char_hash{$_} } @$positions;
eca16057 1007 # Fill in lacuna markers for undef spots in the row
1008 my $last_el = shift @row;
1009 my @filled_row = ( $last_el );
1010 foreach my $el ( @row ) {
0e476982 1011 # If we are using node reference, make the lacuna node appear many times
1012 # in the table. If not, use the lacuna tag.
1dd07bda 1013 if( $last_el && $last_el->{'t'}->is_lacuna && !defined $el ) {
1014 $el = $last_el;
eca16057 1015 }
1016 push( @filled_row, $el );
1017 $last_el = $el;
1018 }
1019 return @filled_row;
910a0a6d 1020}
1021
4e5a7b2c 1022=head1 NAVIGATION METHODS
910a0a6d 1023
4e5a7b2c 1024=head2 reading_sequence( $first, $last, $sigil, $backup )
e2902068 1025
1026Returns the ordered list of readings, starting with $first and ending
4e5a7b2c 1027with $last, for the witness given in $sigil. If a $backup sigil is
1028specified (e.g. when walking a layered witness), it will be used wherever
1029no $sigil path exists. If there is a base text reading, that will be
1030used wherever no path exists for $sigil or $backup.
e2902068 1031
1032=cut
1033
910a0a6d 1034# TODO Think about returning some lazy-eval iterator.
b0b4421a 1035# TODO Get rid of backup; we should know from what witness is whether we need it.
910a0a6d 1036
e2902068 1037sub reading_sequence {
861c3e27 1038 my( $self, $start, $end, $witness ) = @_;
e2902068 1039
930ff666 1040 $witness = $self->baselabel unless $witness;
e2902068 1041 my @readings = ( $start );
1042 my %seen;
1043 my $n = $start;
3a2ebbf4 1044 while( $n && $n->id ne $end->id ) {
1045 if( exists( $seen{$n->id} ) ) {
63778331 1046 throw( "Detected loop for $witness at " . $n->id );
910a0a6d 1047 }
3a2ebbf4 1048 $seen{$n->id} = 1;
910a0a6d 1049
861c3e27 1050 my $next = $self->next_reading( $n, $witness );
44771cf2 1051 unless( $next ) {
63778331 1052 throw( "Did not find any path for $witness from reading " . $n->id );
44771cf2 1053 }
910a0a6d 1054 push( @readings, $next );
1055 $n = $next;
e2902068 1056 }
1057 # Check that the last reading is our end reading.
1058 my $last = $readings[$#readings];
63778331 1059 throw( "Last reading found from " . $start->text .
1060 " for witness $witness is not the end!" ) # TODO do we get this far?
3a2ebbf4 1061 unless $last->id eq $end->id;
e2902068 1062
1063 return @readings;
1064}
1065
4e5a7b2c 1066=head2 next_reading( $reading, $sigil );
8e1394aa 1067
4a8828f0 1068Returns the reading that follows the given reading along the given witness
930ff666 1069path.
8e1394aa 1070
1071=cut
1072
4a8828f0 1073sub next_reading {
e2902068 1074 # Return the successor via the corresponding path.
8e1394aa 1075 my $self = shift;
3a2ebbf4 1076 my $answer = $self->_find_linked_reading( 'next', @_ );
2c669bca 1077 return undef unless $answer;
3a2ebbf4 1078 return $self->reading( $answer );
8e1394aa 1079}
1080
4e5a7b2c 1081=head2 prior_reading( $reading, $sigil )
8e1394aa 1082
4a8828f0 1083Returns the reading that precedes the given reading along the given witness
930ff666 1084path.
8e1394aa 1085
1086=cut
1087
4a8828f0 1088sub prior_reading {
e2902068 1089 # Return the predecessor via the corresponding path.
8e1394aa 1090 my $self = shift;
3a2ebbf4 1091 my $answer = $self->_find_linked_reading( 'prior', @_ );
1092 return $self->reading( $answer );
8e1394aa 1093}
1094
4a8828f0 1095sub _find_linked_reading {
861c3e27 1096 my( $self, $direction, $node, $path ) = @_;
1097
1098 # Get a backup if we are dealing with a layered witness
1099 my $alt_path;
1100 my $aclabel = $self->ac_label;
1101 if( $path && $path =~ /^(.*)\Q$aclabel\E$/ ) {
1102 $alt_path = $1;
1103 }
1104
e2902068 1105 my @linked_paths = $direction eq 'next'
3a2ebbf4 1106 ? $self->sequence->edges_from( $node )
1107 : $self->sequence->edges_to( $node );
e2902068 1108 return undef unless scalar( @linked_paths );
8e1394aa 1109
e2902068 1110 # We have to find the linked path that contains all of the
1111 # witnesses supplied in $path.
1112 my( @path_wits, @alt_path_wits );
4e5a7b2c 1113 @path_wits = sort( $self->_witnesses_of_label( $path ) ) if $path;
1114 @alt_path_wits = sort( $self->_witnesses_of_label( $alt_path ) ) if $alt_path;
e2902068 1115 my $base_le;
1116 my $alt_le;
1117 foreach my $le ( @linked_paths ) {
3a2ebbf4 1118 if( $self->sequence->has_edge_attribute( @$le, $self->baselabel ) ) {
910a0a6d 1119 $base_le = $le;
910a0a6d 1120 }
508fd430 1121 my @le_wits = sort $self->path_witnesses( $le );
3a2ebbf4 1122 if( _is_within( \@path_wits, \@le_wits ) ) {
1123 # This is the right path.
1124 return $direction eq 'next' ? $le->[1] : $le->[0];
1125 } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) {
1126 $alt_le = $le;
1127 }
8e1394aa 1128 }
e2902068 1129 # Got this far? Return the alternate path if it exists.
3a2ebbf4 1130 return $direction eq 'next' ? $alt_le->[1] : $alt_le->[0]
910a0a6d 1131 if $alt_le;
e2902068 1132
1133 # Got this far? Return the base path if it exists.
3a2ebbf4 1134 return $direction eq 'next' ? $base_le->[1] : $base_le->[0]
910a0a6d 1135 if $base_le;
e2902068 1136
1137 # Got this far? We have no appropriate path.
2c669bca 1138 warn "Could not find $direction node from " . $node->id
910a0a6d 1139 . " along path $path";
8e1394aa 1140 return undef;
1141}
1142
4a8828f0 1143# Some set logic.
1144sub _is_within {
1145 my( $set1, $set2 ) = @_;
7854e12e 1146 my $ret = @$set1; # will be 0, i.e. false, if set1 is empty
4a8828f0 1147 foreach my $el ( @$set1 ) {
910a0a6d 1148 $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
4a8828f0 1149 }
1150 return $ret;
1151}
1152
4e5a7b2c 1153# Return the string that joins together a list of witnesses for
1154# display on a single path.
1155sub _witnesses_of_label {
1156 my( $self, $label ) = @_;
1157 my $regex = $self->wit_list_separator;
1158 my @answer = split( /\Q$regex\E/, $label );
1159 return @answer;
b0b4421a 1160}
1161
d4b75f44 1162=head2 common_readings
1163
1164Returns the list of common readings in the graph (i.e. those readings that are
1165shared by all non-lacunose witnesses.)
1166
1167=cut
1168
1169sub common_readings {
1170 my $self = shift;
1171 my @common = grep { $_->is_common } $self->readings;
1172 return @common;
1173}
1174
b0b4421a 1175=head2 path_text( $sigil, $mainsigil [, $start, $end ] )
1176
1177Returns the text of a witness (plus its backup, if we are using a layer)
1178as stored in the collation. The text is returned as a string, where the
1179individual readings are joined with spaces and the meta-readings (e.g.
1180lacunae) are omitted. Optional specification of $start and $end allows
1181the generation of a subset of the witness text.
4e5a7b2c 1182
b0b4421a 1183=cut
1184
1185sub path_text {
861c3e27 1186 my( $self, $wit, $start, $end ) = @_;
b0b4421a 1187 $start = $self->start unless $start;
1188 $end = $self->end unless $end;
861c3e27 1189 my @path = grep { !$_->is_meta } $self->reading_sequence( $start, $end, $wit );
b0b4421a 1190 return join( ' ', map { $_->text } @path );
1191}
4e5a7b2c 1192
1193=head1 INITIALIZATION METHODS
1194
1195These are mostly for use by parsers.
1196
1197=head2 make_witness_path( $witness )
1198
1199Link the array of readings contained in $witness->path (and in
1200$witness->uncorrected_path if it exists) into collation paths.
1201Clear out the arrays when finished.
de51424a 1202
4e5a7b2c 1203=head2 make_witness_paths
1204
1205Call make_witness_path for all witnesses in the tradition.
1206
1207=cut
930ff666 1208
7e450e44 1209# For use when a collation is constructed from a base text and an apparatus.
1210# We have the sequences of readings and just need to add path edges.
1f7aa795 1211# When we are done, clear out the witness path attributes, as they are no
1212# longer needed.
1213# TODO Find a way to replace the witness path attributes with encapsulated functions?
e2902068 1214
6a222840 1215sub make_witness_paths {
1216 my( $self ) = @_;
910a0a6d 1217 foreach my $wit ( $self->tradition->witnesses ) {
0068967c 1218 # print STDERR "Making path for " . $wit->sigil . "\n";
910a0a6d 1219 $self->make_witness_path( $wit );
7854e12e 1220 }
7854e12e 1221}
1222
6a222840 1223sub make_witness_path {
7854e12e 1224 my( $self, $wit ) = @_;
1225 my @chain = @{$wit->path};
15d2d3df 1226 my $sig = $wit->sigil;
7854e12e 1227 foreach my $idx ( 0 .. $#chain-1 ) {
910a0a6d 1228 $self->add_path( $chain[$idx], $chain[$idx+1], $sig );
7854e12e 1229 }
1f7aa795 1230 if( $wit->is_layered ) {
d9e873d0 1231 @chain = @{$wit->uncorrected_path};
1232 foreach my $idx( 0 .. $#chain-1 ) {
1233 my $source = $chain[$idx];
1234 my $target = $chain[$idx+1];
1235 $self->add_path( $source, $target, $sig.$self->ac_label )
1236 unless $self->has_path( $source, $target, $sig );
1237 }
15d2d3df 1238 }
1f7aa795 1239 $wit->clear_path;
1240 $wit->clear_uncorrected_path;
e2902068 1241}
1242
4e5a7b2c 1243=head2 calculate_ranks
1244
1245Calculate the reading ranks (that is, their aligned positions relative
1246to each other) for the graph. This can only be called on linear collations.
1247
b365fbae 1248=begin testing
1249
1250use Text::Tradition;
1251
1252my $cxfile = 't/data/Collatex-16.xml';
1253my $t = Text::Tradition->new(
1254 'name' => 'inline',
1255 'input' => 'CollateX',
1256 'file' => $cxfile,
1257 );
1258my $c = $t->collation;
1259
1260# Make an svg
1261my $svg = $c->as_svg;
1262is( substr( $svg, 0, 5 ), '<?xml', "Got XML doc for svg" );
1263ok( $c->has_cached_svg, "SVG was cached" );
1264is( $c->as_svg, $svg, "Cached SVG returned upon second call" );
1265$c->calculate_ranks;
1266is( $c->as_svg, $svg, "Cached SVG retained with no rank change" );
1267$c->add_relationship( 'n9', 'n23', { 'type' => 'spelling' } );
1268isnt( $c->as_svg, $svg, "SVG changed after relationship add" );
1269
1270=end testing
1271
4e5a7b2c 1272=cut
1273
910a0a6d 1274sub calculate_ranks {
1275 my $self = shift;
b365fbae 1276 # Save the existing ranks, in case we need to invalidate the cached SVG.
1277 my %existing_ranks;
910a0a6d 1278 # Walk a version of the graph where every node linked by a relationship
1279 # edge is fundamentally the same node, and do a topological ranking on
1280 # the nodes in this graph.
c9bf3dbf 1281 my $topo_graph = Graph->new();
910a0a6d 1282 my %rel_containers;
1283 my $rel_ctr = 0;
1284 # Add the nodes
1285 foreach my $r ( $self->readings ) {
3a2ebbf4 1286 next if exists $rel_containers{$r->id};
910a0a6d 1287 my @rels = $r->related_readings( 'colocated' );
1288 if( @rels ) {
1289 # Make a relationship container.
1290 push( @rels, $r );
c9bf3dbf 1291 my $rn = 'rel_container_' . $rel_ctr++;
1292 $topo_graph->add_vertex( $rn );
910a0a6d 1293 foreach( @rels ) {
3a2ebbf4 1294 $rel_containers{$_->id} = $rn;
910a0a6d 1295 }
1296 } else {
1297 # Add a new node to mirror the old node.
3a2ebbf4 1298 $rel_containers{$r->id} = $r->id;
1299 $topo_graph->add_vertex( $r->id );
910a0a6d 1300 }
4a8828f0 1301 }
3a1f2523 1302
3a2ebbf4 1303 # Add the edges.
910a0a6d 1304 foreach my $r ( $self->readings ) {
b365fbae 1305 $existing_ranks{$r} = $r->rank;
3a2ebbf4 1306 foreach my $n ( $self->sequence->successors( $r->id ) ) {
1307 my( $tfrom, $tto ) = ( $rel_containers{$r->id},
1308 $rel_containers{$n} );
4e5a7b2c 1309 # $DB::single = 1 unless $tfrom && $tto;
3a2ebbf4 1310 $topo_graph->add_edge( $tfrom, $tto );
910a0a6d 1311 }
1312 }
1313
1314 # Now do the rankings, starting with the start node.
3a2ebbf4 1315 my $topo_start = $rel_containers{$self->start->id};
c9bf3dbf 1316 my $node_ranks = { $topo_start => 0 };
910a0a6d 1317 my @curr_origin = ( $topo_start );
1318 # A little iterative function.
1319 while( @curr_origin ) {
c9bf3dbf 1320 @curr_origin = _assign_rank( $topo_graph, $node_ranks, @curr_origin );
910a0a6d 1321 }
1322 # Transfer our rankings from the topological graph to the real one.
1323 foreach my $r ( $self->readings ) {
3a2ebbf4 1324 if( defined $node_ranks->{$rel_containers{$r->id}} ) {
1325 $r->rank( $node_ranks->{$rel_containers{$r->id}} );
67da8d6c 1326 } else {
63778331 1327 # Die. Find the last rank we calculated.
1328 my @all_defined = sort { $node_ranks->{$rel_containers{$a->id}}
1329 <=> $node_ranks->{$rel_containers{$b->id}} }
1330 $self->readings;
1331 my $last = pop @all_defined;
1332 throw( "Ranks not calculated after $last - do you have a cycle in the graph?" );
67da8d6c 1333 }
de51424a 1334 }
b365fbae 1335 # Do we need to invalidate the cached SVG?
1336 if( $self->has_cached_svg ) {
1337 foreach my $r ( $self->readings ) {
1338 next if $existing_ranks{$r} == $r->rank;
1339 $self->wipe_svg;
1340 last;
1341 }
1342 }
8e1394aa 1343}
3a1f2523 1344
910a0a6d 1345sub _assign_rank {
c9bf3dbf 1346 my( $graph, $node_ranks, @current_nodes ) = @_;
910a0a6d 1347 # Look at each of the children of @current_nodes. If all the child's
1348 # parents have a rank, assign it the highest rank + 1 and add it to
c9bf3dbf 1349 # @next_nodes. Otherwise skip it; we will return when the highest-ranked
1350 # parent gets a rank.
910a0a6d 1351 my @next_nodes;
1352 foreach my $c ( @current_nodes ) {
c9bf3dbf 1353 warn "Current reading $c has no rank!"
1354 unless exists $node_ranks->{$c};
1355 # print STDERR "Looking at child of node $c, rank "
1356 # . $node_ranks->{$c} . "\n";
1357 foreach my $child ( $graph->successors( $c ) ) {
1358 next if exists $node_ranks->{$child};
910a0a6d 1359 my $highest_rank = -1;
1360 my $skip = 0;
c9bf3dbf 1361 foreach my $parent ( $graph->predecessors( $child ) ) {
1362 if( exists $node_ranks->{$parent} ) {
1363 $highest_rank = $node_ranks->{$parent}
1364 if $highest_rank <= $node_ranks->{$parent};
910a0a6d 1365 } else {
1366 $skip = 1;
1367 last;
1368 }
1369 }
1370 next if $skip;
c9bf3dbf 1371 my $c_rank = $highest_rank + 1;
1372 # print STDERR "Assigning rank $c_rank to node $child \n";
1373 $node_ranks->{$child} = $c_rank;
910a0a6d 1374 push( @next_nodes, $child );
1375 }
1376 }
1377 return @next_nodes;
4cdd82f1 1378}
910a0a6d 1379
4e5a7b2c 1380=head2 flatten_ranks
1381
1382A convenience method for parsing collation data. Searches the graph for readings
1383with the same text at the same rank, and merges any that are found.
1384
1385=cut
1386
0e476982 1387sub flatten_ranks {
1388 my $self = shift;
1389 my %unique_rank_rdg;
1390 foreach my $rdg ( $self->readings ) {
1391 next unless $rdg->has_rank;
1392 my $key = $rdg->rank . "||" . $rdg->text;
1393 if( exists $unique_rank_rdg{$key} ) {
1394 # Combine!
56eefa04 1395 # print STDERR "Combining readings at same rank: $key\n";
0e476982 1396 $self->merge_readings( $unique_rank_rdg{$key}, $rdg );
0ecb975c 1397 # TODO see if this now makes a common point.
0e476982 1398 } else {
1399 $unique_rank_rdg{$key} = $rdg;
1400 }
1401 }
1402}
1403
4633f9e4 1404=head2 remove_collations
1405
1406Another convenience method for parsing. Removes all 'collation' relationships
1407that were defined in order to get the reading ranks to be correct.
1408
1409=begin testing
1410
1411use Text::Tradition;
1412
1413my $cxfile = 't/data/Collatex-16.xml';
1414my $t = Text::Tradition->new(
1415 'name' => 'inline',
1416 'input' => 'CollateX',
1417 'file' => $cxfile,
1418 );
1419my $c = $t->collation;
1420
1421isnt( $c->reading('n23')->rank, $c->reading('n9')->rank, "Rank skew exists" );
1422$c->add_relationship( 'n23', 'n9', { 'type' => 'collated', 'scope' => 'local' } );
1423is( scalar $c->relationships, 4, "Found all expected relationships" );
1424$c->remove_collations;
1425is( scalar $c->relationships, 3, "Collated relationships now gone" );
1426is( $c->reading('n23')->rank, $c->reading('n9')->rank, "Aligned ranks were preserved" );
1427
1428=end testing
1429
1430=cut
1431
1432sub remove_collations {
1433 my $self = shift;
1434 foreach my $reledge ( $self->relationships ) {
1435 my $relobj = $self->relations->get_relationship( $reledge );
1436 if( $relobj && $relobj->type eq 'collated' ) {
1437 $self->relations->delete_relationship( $reledge );
1438 }
1439 }
1440}
1441
1442
d4b75f44 1443=head2 calculate_common_readings
1444
1445Goes through the graph identifying the readings that appear in every witness
1446(apart from those with lacunae at that spot.) Marks them as common and returns
1447the list.
1448
1449=begin testing
1450
1451use Text::Tradition;
1452
1453my $cxfile = 't/data/Collatex-16.xml';
1454my $t = Text::Tradition->new(
1455 'name' => 'inline',
1456 'input' => 'CollateX',
1457 'file' => $cxfile,
1458 );
1459my $c = $t->collation;
1460
1461my @common = $c->calculate_common_readings();
1462is( scalar @common, 8, "Found correct number of common readings" );
1463my @marked = sort $c->common_readings();
1464is( scalar @common, 8, "All common readings got marked as such" );
1465my @expected = qw/ n1 n12 n16 n19 n20 n5 n6 n7 /;
1466is_deeply( \@marked, \@expected, "Found correct list of common readings" );
1467
1468=end testing
1469
1470=cut
1471
1472sub calculate_common_readings {
1473 my $self = shift;
1474 my @common;
1dd07bda 1475 my $table = $self->alignment_table;
d4b75f44 1476 foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
7f52eac8 1477 my @row = map { $_->{'tokens'}->[$idx]
1478 ? $_->{'tokens'}->[$idx]->{'t'} : '' }
1479 @{$table->{'alignment'}};
d4b75f44 1480 my %hash;
1481 foreach my $r ( @row ) {
1482 if( $r ) {
1483 $hash{$r->id} = $r unless $r->is_meta;
1484 } else {
1485 $hash{'UNDEF'} = $r;
1486 }
1487 }
1488 if( keys %hash == 1 && !exists $hash{'UNDEF'} ) {
1489 my( $r ) = values %hash;
1490 $r->is_common( 1 );
1491 push( @common, $r );
1492 }
1493 }
1494 return @common;
1495}
1496
861c3e27 1497=head2 text_from_paths
1498
1499Calculate the text array for all witnesses from the path, for later consistency
1500checking. Only to be used if there is no non-graph-based way to know the
1501original texts.
1502
1503=cut
1504
1505sub text_from_paths {
1506 my $self = shift;
1507 foreach my $wit ( $self->tradition->witnesses ) {
1508 my @text = split( /\s+/,
1509 $self->reading_sequence( $self->start, $self->end, $wit->sigil ) );
1510 $wit->text( \@text );
1511 if( $wit->is_layered ) {
1512 my @uctext = split( /\s+/,
1513 $self->reading_sequence( $self->start, $self->end,
1514 $wit->sigil.$self->ac_label ) );
1515 $wit->text( \@uctext );
1516 }
1517 }
1518}
0e476982 1519
4e5a7b2c 1520=head1 UTILITY FUNCTIONS
1521
1522=head2 common_predecessor( $reading_a, $reading_b )
8e1394aa 1523
4e5a7b2c 1524Find the last reading that occurs in sequence before both the given readings.
1525
1526=head2 common_successor( $reading_a, $reading_b )
1527
1528Find the first reading that occurs in sequence after both the given readings.
1529
22222af9 1530=begin testing
1531
1532use Text::Tradition;
1533
1534my $cxfile = 't/data/Collatex-16.xml';
1535my $t = Text::Tradition->new(
1536 'name' => 'inline',
1537 'input' => 'CollateX',
1538 'file' => $cxfile,
1539 );
1540my $c = $t->collation;
1541
4e5a7b2c 1542is( $c->common_predecessor( 'n9', 'n23' )->id,
22222af9 1543 'n20', "Found correct common predecessor" );
4e5a7b2c 1544is( $c->common_successor( 'n9', 'n23' )->id,
22222af9 1545 '#END#', "Found correct common successor" );
1546
4e5a7b2c 1547is( $c->common_predecessor( 'n19', 'n17' )->id,
22222af9 1548 'n16', "Found correct common predecessor for readings on same path" );
4e5a7b2c 1549is( $c->common_successor( 'n21', 'n26' )->id,
22222af9 1550 '#END#', "Found correct common successor for readings on same path" );
1551
1552=end testing
1553
1554=cut
1555
1556## Return the closest reading that is a predecessor of both the given readings.
1557sub common_predecessor {
1558 my $self = shift;
4e5a7b2c 1559 my( $r1, $r2 ) = $self->_objectify_args( @_ );
027d819c 1560 return $self->_common_in_path( $r1, $r2, 'predecessors' );
22222af9 1561}
1562
1563sub common_successor {
1564 my $self = shift;
4e5a7b2c 1565 my( $r1, $r2 ) = $self->_objectify_args( @_ );
027d819c 1566 return $self->_common_in_path( $r1, $r2, 'successors' );
22222af9 1567}
1568
027d819c 1569sub _common_in_path {
22222af9 1570 my( $self, $r1, $r2, $dir ) = @_;
1571 my $iter = $r1->rank > $r2->rank ? $r1->rank : $r2->rank;
1572 $iter = $self->end->rank - $iter if $dir eq 'successors';
1573 my @candidates;
1574 my @last_checked = ( $r1, $r2 );
1575 my %all_seen;
1576 while( !@candidates ) {
1577 my @new_lc;
1578 foreach my $lc ( @last_checked ) {
1579 foreach my $p ( $lc->$dir ) {
1580 if( $all_seen{$p->id} ) {
1581 push( @candidates, $p );
1582 } else {
1583 $all_seen{$p->id} = 1;
1584 push( @new_lc, $p );
1585 }
1586 }
1587 }
1588 @last_checked = @new_lc;
1589 }
1590 my @answer = sort { $a->rank <=> $b->rank } @candidates;
1591 return $dir eq 'predecessors' ? pop( @answer ) : shift ( @answer );
1592}
1593
63778331 1594sub throw {
1595 Text::Tradition::Error->throw(
1596 'ident' => 'Collation error',
1597 'message' => $_[0],
1598 );
1599}
1600
dd3b58b0 1601no Moose;
1602__PACKAGE__->meta->make_immutable;
e867486f 1603
027d819c 1604=head1 LICENSE
e867486f 1605
027d819c 1606This package is free software and is provided "as is" without express
1607or implied warranty. You can redistribute it and/or modify it under
1608the same terms as Perl itself.
e867486f 1609
027d819c 1610=head1 AUTHOR
e867486f 1611
027d819c 1612Tara L Andrews E<lt>aurum@cpan.orgE<gt>