get rid of debugging print
[scpubgit/stemmatology.git] / lib / Text / Tradition / Collation.pm
CommitLineData
dd3b58b0 1package Text::Tradition::Collation;
d047cd52 2
910a0a6d 3use Encode qw( decode_utf8 );
4use File::Temp;
c9bf3dbf 5use Graph;
8e1394aa 6use IPC::Run qw( run binary );
910a0a6d 7use Text::CSV_XS;
b15511bf 8use Text::Tradition::Collation::Reading;
22222af9 9use Text::Tradition::Collation::RelationshipStore;
63778331 10use Text::Tradition::Error;
df6d9812 11use XML::LibXML;
a344ffcf 12use XML::LibXML::XPathContext;
dd3b58b0 13use Moose;
14
3a2ebbf4 15has 'sequence' => (
d047cd52 16 is => 'ro',
3a2ebbf4 17 isa => 'Graph',
18 default => sub { Graph->new() },
d047cd52 19 handles => {
3a2ebbf4 20 paths => 'edges',
d047cd52 21 },
d047cd52 22 );
3a2ebbf4 23
24has 'relations' => (
25 is => 'ro',
22222af9 26 isa => 'Text::Tradition::Collation::RelationshipStore',
27 handles => {
28 relationships => 'relationships',
29 related_readings => 'related_readings',
30 },
31 writer => '_set_relations',
3a2ebbf4 32 );
dd3b58b0 33
3a2ebbf4 34has 'tradition' => (
35 is => 'ro',
d047cd52 36 isa => 'Text::Tradition',
8d9a1cd8 37 weak_ref => 1,
d047cd52 38 );
dd3b58b0 39
3a2ebbf4 40has 'readings' => (
41 isa => 'HashRef[Text::Tradition::Collation::Reading]',
42 traits => ['Hash'],
43 handles => {
44 reading => 'get',
45 _add_reading => 'set',
46 del_reading => 'delete',
47 has_reading => 'exists',
48 readings => 'values',
49 },
50 default => sub { {} },
51 );
910a0a6d 52
4a8828f0 53has 'wit_list_separator' => (
7854e12e 54 is => 'rw',
55 isa => 'Str',
56 default => ', ',
57 );
58
59has 'baselabel' => (
60 is => 'rw',
61 isa => 'Str',
62 default => 'base text',
63 );
4a8828f0 64
15d2d3df 65has 'linear' => (
66 is => 'rw',
67 isa => 'Bool',
68 default => 1,
69 );
c84275ff 70
ef9d481f 71has 'ac_label' => (
72 is => 'rw',
73 isa => 'Str',
74 default => ' (a.c.)',
75 );
3a2ebbf4 76
77has 'start' => (
78 is => 'ro',
79 isa => 'Text::Tradition::Collation::Reading',
80 writer => '_set_start',
81 weak_ref => 1,
82 );
83
84has 'end' => (
85 is => 'ro',
86 isa => 'Text::Tradition::Collation::Reading',
87 writer => '_set_end',
88 weak_ref => 1,
89 );
b365fbae 90
91has 'cached_svg' => (
92 is => 'rw',
93 isa => 'Str',
94 predicate => 'has_cached_svg',
95 clearer => 'wipe_svg',
96 );
1f563ac3 97
4e5a7b2c 98=head1 NAME
99
100Text::Tradition::Collation - a software model for a text collation
101
102=head1 SYNOPSIS
103
104 use Text::Tradition;
105 my $t = Text::Tradition->new(
106 'name' => 'this is a text',
107 'input' => 'TEI',
108 'file' => '/path/to/tei_parallel_seg_file.xml' );
109
110 my $c = $t->collation;
111 my @readings = $c->readings;
112 my @paths = $c->paths;
113 my @relationships = $c->relationships;
114
115 my $svg_variant_graph = $t->collation->as_svg();
116
117=head1 DESCRIPTION
118
119Text::Tradition is a library for representation and analysis of collated
120texts, particularly medieval ones. The Collation is the central feature of
121a Tradition, where the text, its sequence of readings, and its relationships
122between readings are actually kept.
123
124=head1 CONSTRUCTOR
125
126=head2 new
127
128The constructor. Takes a hash or hashref of the following arguments:
129
130=over
131
132=item * tradition - The Text::Tradition object to which the collation
133belongs. Required.
134
135=item * linear - Whether the collation should be linear; that is, whether
136transposed readings should be treated as two linked readings rather than one,
137and therefore whether the collation graph is acyclic. Defaults to true.
138
4e5a7b2c 139=item * baselabel - The default label for the path taken by a base text
140(if any). Defaults to 'base text'.
141
142=item * wit_list_separator - The string to join a list of witnesses for
143purposes of making labels in display graphs. Defaults to ', '.
144
145=item * ac_label - The extra label to tack onto a witness sigil when
146representing another layer of path for the given witness - that is, when
147a text has more than one possible reading due to scribal corrections or
148the like. Defaults to ' (a.c.)'.
149
150=back
151
152=head1 ACCESSORS
153
154=head2 tradition
155
156=head2 linear
157
4e5a7b2c 158=head2 wit_list_separator
159
160=head2 baselabel
161
162=head2 ac_label
163
164Simple accessors for collation attributes.
165
166=head2 start
167
168The meta-reading at the start of every witness path.
169
170=head2 end
171
172The meta-reading at the end of every witness path.
173
174=head2 readings
175
176Returns all Reading objects in the graph.
177
178=head2 reading( $id )
179
180Returns the Reading object corresponding to the given ID.
181
182=head2 add_reading( $reading_args )
183
184Adds a new reading object to the collation.
185See L<Text::Tradition::Collation::Reading> for the available arguments.
186
187=head2 del_reading( $object_or_id )
188
189Removes the given reading from the collation, implicitly removing its
190paths and relationships.
191
192=head2 merge_readings( $main, $second )
193
194Merges the $second reading into the $main one.
195The arguments may be either readings or reading IDs.
196
197=head2 has_reading( $id )
198
199Predicate to see whether a given reading ID is in the graph.
200
201=head2 reading_witnesses( $object_or_id )
202
203Returns a list of sigils whose witnesses contain the reading.
204
205=head2 paths
206
207Returns all reading paths within the document - that is, all edges in the
208collation graph. Each path is an arrayref of [ $source, $target ] reading IDs.
209
210=head2 add_path( $source, $target, $sigil )
211
212Links the given readings in the collation in sequence, under the given witness
213sigil. The readings may be specified by object or ID.
214
215=head2 del_path( $source, $target, $sigil )
216
217Links the given readings in the collation in sequence, under the given witness
218sigil. The readings may be specified by object or ID.
219
220=head2 has_path( $source, $target );
221
222Returns true if the two readings are linked in sequence in any witness.
223The readings may be specified by object or ID.
224
225=head2 relationships
226
227Returns all Relationship objects in the collation.
228
229=head2 add_relationship( $reading, $other_reading, $options )
230
231Adds a new relationship of the type given in $options between the two readings,
232which may be specified by object or ID. Returns a value of ( $status, @vectors)
233where $status is true on success, and @vectors is a list of relationship edges
234that were ultimately added.
235See L<Text::Tradition::Collation::Relationship> for the available options.
236
237=cut
dd3b58b0 238
d047cd52 239sub BUILD {
3a2ebbf4 240 my $self = shift;
22222af9 241 $self->_set_relations( Text::Tradition::Collation::RelationshipStore->new( 'collation' => $self ) );
3a2ebbf4 242 $self->_set_start( $self->add_reading( { 'collation' => $self, 'is_start' => 1 } ) );
243 $self->_set_end( $self->add_reading( { 'collation' => $self, 'is_end' => 1 } ) );
d047cd52 244}
784877d9 245
3a2ebbf4 246### Reading construct/destruct functions
247
248sub add_reading {
249 my( $self, $reading ) = @_;
250 unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
251 my %args = %$reading;
252 $reading = Text::Tradition::Collation::Reading->new(
253 'collation' => $self,
254 %args );
255 }
256 # First check to see if a reading with this ID exists.
257 if( $self->reading( $reading->id ) ) {
63778331 258 throw( "Collation already has a reading with id " . $reading->id );
3a2ebbf4 259 }
260 $self->_add_reading( $reading->id => $reading );
261 # Once the reading has been added, put it in both graphs.
262 $self->sequence->add_vertex( $reading->id );
22222af9 263 $self->relations->add_reading( $reading->id );
3a2ebbf4 264 return $reading;
eca16057 265};
266
3a2ebbf4 267around del_reading => sub {
268 my $orig = shift;
269 my $self = shift;
270 my $arg = shift;
271
272 if( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
273 $arg = $arg->id;
274 }
3a2ebbf4 275 # Remove the reading from the graphs.
276 $self->sequence->delete_vertex( $arg );
22222af9 277 $self->relations->delete_reading( $arg );
3a2ebbf4 278
279 # Carry on.
280 $self->$orig( $arg );
281};
7854e12e 282
3a2ebbf4 283# merge_readings( $main, $to_be_deleted );
7854e12e 284
3a2ebbf4 285sub merge_readings {
286 my $self = shift;
287
288 # We only need the IDs for adding paths to the graph, not the reading
289 # objects themselves.
49d4f2ac 290 my( $kept, $deleted, $combine_char ) = $self->_stringify_args( @_ );
3a2ebbf4 291
292 # The kept reading should inherit the paths and the relationships
293 # of the deleted reading.
294 foreach my $path ( $self->sequence->edges_at( $deleted ) ) {
295 my @vector = ( $kept );
296 push( @vector, $path->[1] ) if $path->[0] eq $deleted;
297 unshift( @vector, $path->[0] ) if $path->[1] eq $deleted;
49d4f2ac 298 next if $vector[0] eq $vector[1]; # Don't add a self loop
3a2ebbf4 299 my %wits = %{$self->sequence->get_edge_attributes( @$path )};
300 $self->sequence->add_edge( @vector );
301 my $fwits = $self->sequence->get_edge_attributes( @vector );
302 @wits{keys %$fwits} = values %$fwits;
303 $self->sequence->set_edge_attributes( @vector, \%wits );
304 }
22222af9 305 $self->relations->merge_readings( $kept, $deleted, $combine_char );
3a2ebbf4 306
307 # Do the deletion deed.
49d4f2ac 308 if( $combine_char ) {
309 my $kept_obj = $self->reading( $kept );
310 my $new_text = join( $combine_char, $kept_obj->text,
311 $self->reading( $deleted )->text );
312 $kept_obj->alter_text( $new_text );
313 }
3a2ebbf4 314 $self->del_reading( $deleted );
315}
7854e12e 316
3265b0ce 317
3a2ebbf4 318# Helper function for manipulating the graph.
319sub _stringify_args {
320 my( $self, $first, $second, $arg ) = @_;
321 $first = $first->id
322 if ref( $first ) eq 'Text::Tradition::Collation::Reading';
323 $second = $second->id
324 if ref( $second ) eq 'Text::Tradition::Collation::Reading';
325 return( $first, $second, $arg );
326}
df6d9812 327
4e5a7b2c 328# Helper function for manipulating the graph.
329sub _objectify_args {
330 my( $self, $first, $second, $arg ) = @_;
331 $first = $self->reading( $first )
332 unless ref( $first ) eq 'Text::Tradition::Collation::Reading';
333 $second = $self->reading( $second )
334 unless ref( $second ) eq 'Text::Tradition::Collation::Reading';
335 return( $first, $second, $arg );
336}
3a2ebbf4 337### Path logic
338
339sub add_path {
340 my $self = shift;
341
342 # We only need the IDs for adding paths to the graph, not the reading
343 # objects themselves.
344 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
345
346 # Connect the readings
347 $self->sequence->add_edge( $source, $target );
348 # Note the witness in question
349 $self->sequence->set_edge_attribute( $source, $target, $wit, 1 );
b15511bf 350};
351
3a2ebbf4 352sub del_path {
353 my $self = shift;
49d4f2ac 354 my @args;
355 if( ref( $_[0] ) eq 'ARRAY' ) {
356 my $e = shift @_;
357 @args = ( @$e, @_ );
358 } else {
359 @args = @_;
360 }
3a2ebbf4 361
362 # We only need the IDs for adding paths to the graph, not the reading
363 # objects themselves.
49d4f2ac 364 my( $source, $target, $wit ) = $self->_stringify_args( @args );
3a2ebbf4 365
366 if( $self->sequence->has_edge_attribute( $source, $target, $wit ) ) {
49d4f2ac 367 $self->sequence->delete_edge_attribute( $source, $target, $wit );
3a2ebbf4 368 }
369 unless( keys %{$self->sequence->get_edge_attributes( $source, $target )} ) {
370 $self->sequence->delete_edge( $source, $target );
371 }
784877d9 372}
373
3a2ebbf4 374
15d2d3df 375# Extra graph-alike utility
376sub has_path {
3a2ebbf4 377 my $self = shift;
378 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
379 return undef unless $self->sequence->has_edge( $source, $target );
380 return $self->sequence->has_edge_attribute( $source, $target, $wit );
b15511bf 381}
382
4e5a7b2c 383=head2 clear_witness( @sigil_list )
3a2ebbf4 384
4e5a7b2c 385Clear the given witnesses out of the collation entirely, removing references
386to them in paths, and removing readings that belong only to them. Should only
387be called via $tradition->del_witness.
3a2ebbf4 388
389=cut
390
4e5a7b2c 391sub clear_witness {
392 my( $self, @sigils ) = @_;
393
394 # Clear the witness(es) out of the paths
395 foreach my $e ( $self->paths ) {
396 foreach my $sig ( @sigils ) {
397 $self->del_path( $e, $sig );
398 }
399 }
400
401 # Clear out the newly unused readings
402 foreach my $r ( $self->readings ) {
403 unless( $self->reading_witnesses( $r ) ) {
404 $self->del_reading( $r );
405 }
406 }
407}
3a2ebbf4 408
409sub add_relationship {
410 my $self = shift;
22222af9 411 my( $source, $target, $opts ) = $self->_stringify_args( @_ );
63778331 412 my( @vectors ) = $self->relations->add_relationship( $source,
64ae6270 413 $self->reading( $source ), $target, $self->reading( $target ), $opts );
414 # Force a full rank recalculation every time. Yuck.
63778331 415 $self->calculate_ranks() if $self->end->has_rank;
416 return @vectors;
22222af9 417}
ef9d481f 418
22222af9 419=head2 reading_witnesses( $reading )
910a0a6d 420
22222af9 421Return a list of sigils corresponding to the witnesses in which the reading appears.
3265b0ce 422
22222af9 423=cut
1d310495 424
1d310495 425sub reading_witnesses {
426 my( $self, $reading ) = @_;
427 # We need only check either the incoming or the outgoing edges; I have
96dc90ec 428 # arbitrarily chosen "incoming". Thus, special-case the start node.
429 if( $reading eq $self->start ) {
430 return map { $_->sigil } $self->tradition->witnesses;
431 }
1d310495 432 my %all_witnesses;
433 foreach my $e ( $self->sequence->edges_to( $reading ) ) {
434 my $wits = $self->sequence->get_edge_attributes( @$e );
435 @all_witnesses{ keys %$wits } = 1;
436 }
437 return keys %all_witnesses;
910a0a6d 438}
439
4e5a7b2c 440=head1 OUTPUT METHODS
8e1394aa 441
0ecb975c 442=head2 as_svg( \%options )
8e1394aa 443
0068967c 444Returns an SVG string that represents the graph, via as_dot and graphviz.
0ecb975c 445See as_dot for a list of options.
8e1394aa 446
447=cut
448
449sub as_svg {
0ecb975c 450 my( $self, $opts ) = @_;
e247aad1 451 my $want_subgraph = exists $opts->{'from'} || exists $opts->{'to'};
452 if( !$self->has_cached_svg || $opts->{'recalc'} || $want_subgraph ) {
b365fbae 453 my @cmd = qw/dot -Tsvg/;
454 my( $svg, $err );
455 my $dotfile = File::Temp->new();
456 ## USE FOR DEBUGGING
457 # $dotfile->unlink_on_destroy(0);
458 binmode $dotfile, ':utf8';
459 print $dotfile $self->as_dot( $opts );
460 push( @cmd, $dotfile->filename );
461 run( \@cmd, ">", binary(), \$svg );
e247aad1 462 $svg = decode_utf8( $svg );
463 $self->cached_svg( $svg ) unless $want_subgraph;
464 return $svg;
465 } else {
466 return $self->cached_svg;
b365fbae 467 }
8e1394aa 468}
469
b22576c6 470
0ecb975c 471=head2 as_dot( \%options )
b22576c6 472
0ecb975c 473Returns a string that is the collation graph expressed in dot
474(i.e. GraphViz) format. Options include:
b22576c6 475
0ecb975c 476=over 4
b22576c6 477
0ecb975c 478=item * from
b22576c6 479
0ecb975c 480=item * to
df6d9812 481
0ecb975c 482=item * color_common
483
484=back
df6d9812 485
486=cut
487
488sub as_dot {
0ecb975c 489 my( $self, $opts ) = @_;
490 my $startrank = $opts->{'from'} if $opts;
491 my $endrank = $opts->{'to'} if $opts;
492 my $color_common = $opts->{'color_common'} if $opts;
b365fbae 493 my $STRAIGHTENHACK = !$startrank && !$endrank && $self->end->rank
494 && $self->end->rank > 100;
495
b22576c6 496 # Check the arguments
497 if( $startrank ) {
498 return if $endrank && $startrank > $endrank;
499 return if $startrank > $self->end->rank;
500 }
501 if( defined $endrank ) {
502 return if $endrank < 0;
f1b3b33a 503 $endrank = undef if $endrank == $self->end->rank;
b22576c6 504 }
505
67da8d6c 506 my $graph_name = $self->tradition->name;
507 $graph_name =~ s/[^\w\s]//g;
508 $graph_name = join( '_', split( /\s+/, $graph_name ) );
f13b5582 509
510 my %graph_attrs = (
511 'rankdir' => 'LR',
512 'bgcolor' => 'none',
513 );
514 my %node_attrs = (
b8990398 515 'fontsize' => 14,
f13b5582 516 'fillcolor' => 'white',
517 'style' => 'filled',
518 'shape' => 'ellipse'
519 );
520 my %edge_attrs = (
521 'arrowhead' => 'open',
522 'color' => '#000000',
523 'fontcolor' => '#000000',
524 );
525
67da8d6c 526 my $dot = sprintf( "digraph %s {\n", $graph_name );
f13b5582 527 $dot .= "\tgraph " . _dot_attr_string( \%graph_attrs ) . ";\n";
528 $dot .= "\tnode " . _dot_attr_string( \%node_attrs ) . ";\n";
df6d9812 529
b22576c6 530 # Output substitute start/end readings if necessary
531 if( $startrank ) {
532 $dot .= "\t\"#SUBSTART#\" [ label=\"...\" ];\n";
533 }
534 if( $endrank ) {
535 $dot .= "\t\"#SUBEND#\" [ label=\"...\" ];\n";
536 }
b365fbae 537 if( $STRAIGHTENHACK ) {
538 ## HACK part 1
539 $dot .= "\tsubgraph { rank=same \"#START#\" \"#SILENT#\" }\n";
540 $dot .= "\t\"#SILENT#\" [ shape=diamond,color=white,penwidth=0,label=\"\" ];"
541 }
b22576c6 542 my %used; # Keep track of the readings that actually appear in the graph
30ddc24c 543 # Sort the readings by rank if we have ranks; this speeds layout.
544 my @all_readings = $self->end->has_rank
545 ? sort { $a->rank <=> $b->rank } $self->readings
546 : $self->readings;
4633f9e4 547 # TODO Refrain from outputting lacuna nodes - just grey out the edges.
30ddc24c 548 foreach my $reading ( @all_readings ) {
b22576c6 549 # Only output readings within our rank range.
550 next if $startrank && $reading->rank < $startrank;
551 next if $endrank && $reading->rank > $endrank;
552 $used{$reading->id} = 1;
910a0a6d 553 # Need not output nodes without separate labels
3a2ebbf4 554 next if $reading->id eq $reading->text;
d4b75f44 555 my $rattrs;
30f0df34 556 my $label = $reading->text;
8f9cab7b 557 $label =~ s/\"/\\\"/g;
d4b75f44 558 $rattrs->{'label'} = $label;
0ecb975c 559 $rattrs->{'fillcolor'} = '#b3f36d' if $reading->is_common && $color_common;
d4b75f44 560 $dot .= sprintf( "\t\"%s\" %s;\n", $reading->id, _dot_attr_string( $rattrs ) );
df6d9812 561 }
3a2ebbf4 562
30ddc24c 563 # Add the real edges. Need to weight one edge per rank jump, in a
564 # continuous line.
b365fbae 565 # my $weighted = $self->_add_edge_weights;
b22576c6 566 my @edges = $self->paths;
3bdec618 567 my( %substart, %subend );
b22576c6 568 foreach my $edge ( @edges ) {
569 # Do we need to output this edge?
508fd430 570 if( $used{$edge->[0]} && $used{$edge->[1]} ) {
027d819c 571 my $label = $self->_path_display_label( $self->path_witnesses( $edge ) );
f13b5582 572 my $variables = { %edge_attrs, 'label' => $label };
30ddc24c 573
b22576c6 574 # Account for the rank gap if necessary
30ddc24c 575 my $rank0 = $self->reading( $edge->[0] )->rank
576 if $self->reading( $edge->[0] )->has_rank;
577 my $rank1 = $self->reading( $edge->[1] )->rank
578 if $self->reading( $edge->[1] )->has_rank;
579 if( defined $rank0 && defined $rank1 && $rank1 - $rank0 > 1 ) {
580 $variables->{'minlen'} = $rank1 - $rank0;
581 }
582
583 # Add the calculated edge weights
b365fbae 584 # if( exists $weighted->{$edge->[0]}
e247aad1 585 # && $weighted->{$edge->[0]} eq $edge->[1] ) {
586 # # $variables->{'color'} = 'red';
587 # $variables->{'weight'} = 3.0;
588 # }
30ddc24c 589
508fd430 590 # EXPERIMENTAL: make edge width reflect no. of witnesses
591 my $extrawidth = scalar( $self->path_witnesses( $edge ) ) * 0.2;
592 $variables->{'penwidth'} = $extrawidth + 0.8; # gives 1 for a single wit
593
f13b5582 594 my $varopts = _dot_attr_string( $variables );
595 $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
596 $edge->[0], $edge->[1], $varopts );
3bdec618 597 } elsif( $used{$edge->[0]} ) {
598 $subend{$edge->[0]} = 1;
599 } elsif( $used{$edge->[1]} ) {
600 $substart{$edge->[1]} = 1;
b22576c6 601 }
df6d9812 602 }
3bdec618 603 # Add substitute start and end edges if necessary
604 foreach my $node ( keys %substart ) {
027d819c 605 my $witstr = $self->_path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) );
f13b5582 606 my $variables = { %edge_attrs, 'label' => $witstr };
607 my $varopts = _dot_attr_string( $variables );
608 $dot .= "\t\"#SUBSTART#\" -> \"$node\" $varopts;";
3bdec618 609 }
610 foreach my $node ( keys %subend ) {
027d819c 611 my $witstr = $self->_path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) );
f13b5582 612 my $variables = { %edge_attrs, 'label' => $witstr };
613 my $varopts = _dot_attr_string( $variables );
614 $dot .= "\t\"$node\" -> \"#SUBEND#\" $varopts;";
3bdec618 615 }
b365fbae 616 # HACK part 2
617 if( $STRAIGHTENHACK ) {
618 $dot .= "\t\"#END#\" -> \"#SILENT#\" [ color=white,penwidth=0 ];\n";
619 }
30ddc24c 620
df6d9812 621 $dot .= "}\n";
622 return $dot;
623}
624
f13b5582 625sub _dot_attr_string {
626 my( $hash ) = @_;
627 my @attrs;
628 foreach my $k ( sort keys %$hash ) {
629 my $v = $hash->{$k};
630 push( @attrs, $k.'="'.$v.'"' );
631 }
632 return( '[ ' . join( ', ', @attrs ) . ' ]' );
633}
634
30ddc24c 635sub _add_edge_weights {
636 my $self = shift;
637 # Walk the graph from START to END, choosing the successor node with
638 # the largest number of witness paths each time.
639 my $weighted = {};
640 my $curr = $self->start->id;
008fc8a6 641 my $ranked = $self->end->has_rank;
30ddc24c 642 while( $curr ne $self->end->id ) {
008fc8a6 643 my $rank = $ranked ? $self->reading( $curr )->rank : 0;
30ddc24c 644 my @succ = sort { $self->path_witnesses( $curr, $a )
645 <=> $self->path_witnesses( $curr, $b ) }
646 $self->sequence->successors( $curr );
647 my $next = pop @succ;
008fc8a6 648 my $nextrank = $ranked ? $self->reading( $next )->rank : 0;
30ddc24c 649 # Try to avoid lacunae in the weighted path.
008fc8a6 650 while( @succ &&
651 ( $self->reading( $next )->is_lacuna ||
652 $nextrank - $rank > 1 ) ){
30ddc24c 653 $next = pop @succ;
654 }
655 $weighted->{$curr} = $next;
656 $curr = $next;
657 }
658 return $weighted;
659}
660
027d819c 661=head2 path_witnesses( $edge )
662
663Returns the list of sigils whose witnesses are associated with the given edge.
664The edge can be passed as either an array or an arrayref of ( $source, $target ).
665
666=cut
667
3a2ebbf4 668sub path_witnesses {
669 my( $self, @edge ) = @_;
670 # If edge is an arrayref, cope.
671 if( @edge == 1 && ref( $edge[0] ) eq 'ARRAY' ) {
672 my $e = shift @edge;
673 @edge = @$e;
674 }
675 my @wits = keys %{$self->sequence->get_edge_attributes( @edge )};
508fd430 676 return @wits;
3a2ebbf4 677}
678
027d819c 679sub _path_display_label {
508fd430 680 my $self = shift;
681 my @wits = sort @_;
8f9cab7b 682 my $maj = scalar( $self->tradition->witnesses ) * 0.6;
683 if( scalar @wits > $maj ) {
f13b5582 684 # TODO break out a.c. wits
8f9cab7b 685 return 'majority';
686 } else {
687 return join( ', ', @wits );
688 }
689}
690
691
4e5a7b2c 692=head2 as_graphml
8e1394aa 693
4e5a7b2c 694Returns a GraphML representation of the collation. The GraphML will contain
695two graphs. The first expresses the attributes of the readings and the witness
696paths that link them; the second expresses the relationships that link the
697readings. This is the native transfer format for a tradition.
8e1394aa 698
56eefa04 699=begin testing
700
701use Text::Tradition;
702
703my $READINGS = 311;
704my $PATHS = 361;
705
706my $datafile = 't/data/florilegium_tei_ps.xml';
707my $tradition = Text::Tradition->new( 'input' => 'TEI',
708 'name' => 'test0',
709 'file' => $datafile,
710 'linear' => 1 );
711
712ok( $tradition, "Got a tradition object" );
713is( scalar $tradition->witnesses, 13, "Found all witnesses" );
714ok( $tradition->collation, "Tradition has a collation" );
715
716my $c = $tradition->collation;
717is( scalar $c->readings, $READINGS, "Collation has all readings" );
718is( scalar $c->paths, $PATHS, "Collation has all paths" );
719is( scalar $c->relationships, 0, "Collation has all relationships" );
720
721# Add a few relationships
722$c->add_relationship( 'w123', 'w125', { 'type' => 'collated' } );
723$c->add_relationship( 'w193', 'w196', { 'type' => 'collated' } );
724$c->add_relationship( 'w257', 'w262', { 'type' => 'transposition' } );
725
726# Now write it to GraphML and parse it again.
727
728my $graphml = $c->as_graphml;
729my $st = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml );
730is( scalar $st->collation->readings, $READINGS, "Reparsed collation has all readings" );
731is( scalar $st->collation->paths, $PATHS, "Reparsed collation has all paths" );
732is( scalar $st->collation->relationships, 3, "Reparsed collation has new relationships" );
733
734=end testing
735
8e1394aa 736=cut
737
738sub as_graphml {
3a2ebbf4 739 my( $self ) = @_;
8e1394aa 740
741 # Some namespaces
742 my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns';
743 my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance';
744 my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' .
910a0a6d 745 'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
8e1394aa 746
747 # Create the document and root node
748 my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
749 my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
750 $graphml->setDocumentElement( $root );
751 $root->setNamespace( $xsi_ns, 'xsi', 0 );
752 $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
753
e309421a 754 # Add the data keys for the graph
755 my %graph_data_keys;
756 my $gdi = 0;
1d310495 757 my @graph_attributes = qw/ version wit_list_separator baselabel linear ac_label /;
e309421a 758 foreach my $datum ( @graph_attributes ) {
759 $graph_data_keys{$datum} = 'dg'.$gdi++;
760 my $key = $root->addNewChild( $graphml_ns, 'key' );
761 $key->setAttribute( 'attr.name', $datum );
762 $key->setAttribute( 'attr.type', $key eq 'linear' ? 'boolean' : 'string' );
763 $key->setAttribute( 'for', 'graph' );
764 $key->setAttribute( 'id', $graph_data_keys{$datum} );
765 }
f6066bac 766
8e1394aa 767 # Add the data keys for nodes
ef9d481f 768 my %node_data_keys;
769 my $ndi = 0;
3a2ebbf4 770 my %node_data = (
771 id => 'string',
255875b8 772 text => 'string',
3a2ebbf4 773 rank => 'string',
774 is_start => 'boolean',
775 is_end => 'boolean',
776 is_lacuna => 'boolean',
777 );
778 foreach my $datum ( keys %node_data ) {
910a0a6d 779 $node_data_keys{$datum} = 'dn'.$ndi++;
780 my $key = $root->addNewChild( $graphml_ns, 'key' );
781 $key->setAttribute( 'attr.name', $datum );
3a2ebbf4 782 $key->setAttribute( 'attr.type', $node_data{$datum} );
910a0a6d 783 $key->setAttribute( 'for', 'node' );
784 $key->setAttribute( 'id', $node_data_keys{$datum} );
8e1394aa 785 }
786
df6d9812 787 # Add the data keys for edges, i.e. witnesses
ef9d481f 788 my $edi = 0;
789 my %edge_data_keys;
3a2ebbf4 790 my %edge_data = (
f523c7a8 791 class => 'string', # Class, deprecated soon
3a2ebbf4 792 witness => 'string', # ID/label for a path
793 relationship => 'string', # ID/label for a relationship
794 extra => 'boolean', # Path key
c84275ff 795 scope => 'string', # Relationship key
3a2ebbf4 796 non_correctable => 'boolean', # Relationship key
797 non_independent => 'boolean', # Relationship key
798 );
799 foreach my $datum ( keys %edge_data ) {
800 $edge_data_keys{$datum} = 'de'.$edi++;
910a0a6d 801 my $key = $root->addNewChild( $graphml_ns, 'key' );
3a2ebbf4 802 $key->setAttribute( 'attr.name', $datum );
803 $key->setAttribute( 'attr.type', $edge_data{$datum} );
910a0a6d 804 $key->setAttribute( 'for', 'edge' );
3a2ebbf4 805 $key->setAttribute( 'id', $edge_data_keys{$datum} );
8e1394aa 806 }
3a2ebbf4 807
22222af9 808 # Add the collation graph itself
2c669bca 809 my $sgraph = $root->addNewChild( $graphml_ns, 'graph' );
810 $sgraph->setAttribute( 'edgedefault', 'directed' );
811 $sgraph->setAttribute( 'id', $self->tradition->name );
812 $sgraph->setAttribute( 'parse.edgeids', 'canonical' );
813 $sgraph->setAttribute( 'parse.edges', scalar($self->paths) );
814 $sgraph->setAttribute( 'parse.nodeids', 'canonical' );
815 $sgraph->setAttribute( 'parse.nodes', scalar($self->readings) );
816 $sgraph->setAttribute( 'parse.order', 'nodesfirst' );
22222af9 817
e309421a 818 # Collation attribute data
819 foreach my $datum ( @graph_attributes ) {
2c669bca 820 my $value = $datum eq 'version' ? '3.0' : $self->$datum;
821 _add_graphml_data( $sgraph, $graph_data_keys{$datum}, $value );
e309421a 822 }
8e1394aa 823
824 my $node_ctr = 0;
825 my %node_hash;
22222af9 826 # Add our readings to the graph
3a2ebbf4 827 foreach my $n ( sort { $a->id cmp $b->id } $self->readings ) {
2c669bca 828 # Add to the main graph
829 my $node_el = $sgraph->addNewChild( $graphml_ns, 'node' );
910a0a6d 830 my $node_xmlid = 'n' . $node_ctr++;
3a2ebbf4 831 $node_hash{ $n->id } = $node_xmlid;
910a0a6d 832 $node_el->setAttribute( 'id', $node_xmlid );
255875b8 833 foreach my $d ( keys %node_data ) {
834 my $nval = $n->$d;
835 _add_graphml_data( $node_el, $node_data_keys{$d}, $nval )
836 if defined $nval;
837 }
b15511bf 838 }
839
2c669bca 840 # Add the path edges to the sequence graph
df6d9812 841 my $edge_ctr = 0;
3a2ebbf4 842 foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->sequence->edges() ) {
843 # We add an edge in the graphml for every witness in $e.
508fd430 844 foreach my $wit ( sort $self->path_witnesses( $e ) ) {
3a2ebbf4 845 my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
846 $node_hash{ $e->[0] },
847 $node_hash{ $e->[1] } );
2c669bca 848 my $edge_el = $sgraph->addNewChild( $graphml_ns, 'edge' );
3a2ebbf4 849 $edge_el->setAttribute( 'source', $from );
850 $edge_el->setAttribute( 'target', $to );
851 $edge_el->setAttribute( 'id', $id );
3a2ebbf4 852
853 # It's a witness path, so add the witness
854 my $base = $wit;
855 my $key = $edge_data_keys{'witness'};
856 # Is this an ante-corr witness?
857 my $aclabel = $self->ac_label;
858 if( $wit =~ /^(.*)\Q$aclabel\E$/ ) {
859 # Keep the base witness
860 $base = $1;
861 # ...and record that this is an 'extra' reading path
862 _add_graphml_data( $edge_el, $edge_data_keys{'extra'}, $aclabel );
863 }
864 _add_graphml_data( $edge_el, $edge_data_keys{'witness'}, $base );
f523c7a8 865 _add_graphml_data( $edge_el, $edge_data_keys{'class'}, 'path' );
3a2ebbf4 866 }
867 }
868
22222af9 869 # Add the relationship graph to the XML
027d819c 870 $self->relations->_as_graphml( $graphml_ns, $root, \%node_hash,
2626f709 871 $node_data_keys{'id'}, \%edge_data_keys );
8e1394aa 872
94c00c71 873 # Save and return the thing
874 my $result = decode_utf8( $graphml->toString(1) );
94c00c71 875 return $result;
df6d9812 876}
877
b15511bf 878sub _add_graphml_data {
879 my( $el, $key, $value ) = @_;
b15511bf 880 return unless defined $value;
c9bf3dbf 881 my $data_el = $el->addNewChild( $el->namespaceURI, 'data' );
b15511bf 882 $data_el->setAttribute( 'key', $key );
883 $data_el->appendText( $value );
8e1394aa 884}
885
4e5a7b2c 886=head2 as_csv
910a0a6d 887
888Returns a CSV alignment table representation of the collation graph, one
2c669bca 889row per witness (or witness uncorrected.)
910a0a6d 890
891=cut
892
893sub as_csv {
3a2ebbf4 894 my( $self ) = @_;
910a0a6d 895 my $table = $self->make_alignment_table;
896 my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } );
897 my @result;
2c669bca 898 # Make the header row
899 $csv->combine( map { $_->{'witness'} } @{$table->{'alignment'}} );
900 push( @result, decode_utf8( $csv->string ) );
901 # Make the rest of the rows
902 foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
566f4595 903 my @rowobjs = map { $_->{'tokens'}->[$idx] } @{$table->{'alignment'}};
2c669bca 904 my @row = map { $_ ? $_->{'t'} : $_ } @rowobjs;
905 $csv->combine( @row );
910a0a6d 906 push( @result, decode_utf8( $csv->string ) );
907 }
3a2ebbf4 908 return join( "\n", @result );
910a0a6d 909}
910
4e5a7b2c 911=head2 make_alignment_table( $use_refs, $include_witnesses )
2c669bca 912
566f4595 913Return a reference to an alignment table, in a slightly enhanced CollateX
914format which looks like this:
915
916 $table = { alignment => [ { witness => "SIGIL",
4e5a7b2c 917 tokens => [ { t => "TEXT" }, ... ] },
566f4595 918 { witness => "SIG2",
4e5a7b2c 919 tokens => [ { t => "TEXT" }, ... ] },
566f4595 920 ... ],
921 length => TEXTLEN };
922
923If $use_refs is set to 1, the reading object is returned in the table
924instead of READINGTEXT; if not, the text of the reading is returned.
4e5a7b2c 925
926If $include_witnesses is set to a hashref, only the witnesses whose sigil
566f4595 927keys have a true hash value will be included.
2c669bca 928
929=cut
9f3ba6f7 930
910a0a6d 931sub make_alignment_table {
08e0fb85 932 my( $self, $noderefs, $include ) = @_;
0ecb975c 933 # Make sure we can do this
934 throw( "Need a linear graph in order to make an alignment table" )
935 unless $self->linear;
936 $self->calculate_ranks unless $self->end->has_rank;
937
2c669bca 938 my $table = { 'alignment' => [], 'length' => $self->end->rank - 1 };
3a2ebbf4 939 my @all_pos = ( 1 .. $self->end->rank - 1 );
68454b71 940 foreach my $wit ( sort { $a->sigil cmp $b->sigil } $self->tradition->witnesses ) {
2c669bca 941 if( $include ) {
566f4595 942 next unless $include->{$wit->sigil};
2c669bca 943 }
eca16057 944 # print STDERR "Making witness row(s) for " . $wit->sigil . "\n";
1f7aa795 945 my @wit_path = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
946 my @row = _make_witness_row( \@wit_path, \@all_pos, $noderefs );
2c669bca 947 push( @{$table->{'alignment'}},
948 { 'witness' => $wit->sigil, 'tokens' => \@row } );
1f7aa795 949 if( $wit->is_layered ) {
950 my @wit_ac_path = $self->reading_sequence( $self->start, $self->end,
861c3e27 951 $wit->sigil.$self->ac_label );
1f7aa795 952 my @ac_row = _make_witness_row( \@wit_ac_path, \@all_pos, $noderefs );
2c669bca 953 push( @{$table->{'alignment'}},
954 { 'witness' => $wit->sigil.$self->ac_label, 'tokens' => \@ac_row } );
910a0a6d 955 }
956 }
2c669bca 957 return $table;
910a0a6d 958}
959
960sub _make_witness_row {
0e476982 961 my( $path, $positions, $noderefs ) = @_;
910a0a6d 962 my %char_hash;
963 map { $char_hash{$_} = undef } @$positions;
2c669bca 964 my $debug = 0;
910a0a6d 965 foreach my $rdg ( @$path ) {
eca16057 966 my $rtext = $rdg->text;
967 $rtext = '#LACUNA#' if $rdg->is_lacuna;
2c669bca 968 print STDERR "rank " . $rdg->rank . "\n" if $debug;
3a2ebbf4 969 # print STDERR "No rank for " . $rdg->id . "\n" unless defined $rdg->rank;
2c669bca 970 $char_hash{$rdg->rank} = $noderefs ? { 't' => $rdg }
971 : { 't' => $rtext };
910a0a6d 972 }
973 my @row = map { $char_hash{$_} } @$positions;
eca16057 974 # Fill in lacuna markers for undef spots in the row
975 my $last_el = shift @row;
976 my @filled_row = ( $last_el );
977 foreach my $el ( @row ) {
0e476982 978 # If we are using node reference, make the lacuna node appear many times
979 # in the table. If not, use the lacuna tag.
980 if( $last_el && _el_is_lacuna( $last_el ) && !defined $el ) {
566f4595 981 $el = $noderefs ? $last_el : { 't' => '#LACUNA#' };
eca16057 982 }
983 push( @filled_row, $el );
984 $last_el = $el;
985 }
986 return @filled_row;
910a0a6d 987}
988
0e476982 989# Tiny utility function to say if a table element is a lacuna
990sub _el_is_lacuna {
991 my $el = shift;
2c669bca 992 return 1 if $el->{'t'} eq '#LACUNA#';
993 return 1 if ref( $el->{'t'} ) eq 'Text::Tradition::Collation::Reading'
994 && $el->{'t'}->is_lacuna;
0e476982 995 return 0;
996}
997
910a0a6d 998# Helper to turn the witnesses along columns rather than rows. Assumes
999# equal-sized rows.
1000sub _turn_table {
1001 my( $table ) = @_;
1002 my $result = [];
1003 return $result unless scalar @$table;
1004 my $nrows = scalar @{$table->[0]};
1005 foreach my $idx ( 0 .. $nrows - 1 ) {
1006 foreach my $wit ( 0 .. $#{$table} ) {
1007 $result->[$idx]->[$wit] = $table->[$wit]->[$idx];
1008 }
1009 }
1010 return $result;
1011}
1012
4e5a7b2c 1013=head1 NAVIGATION METHODS
910a0a6d 1014
4e5a7b2c 1015=head2 reading_sequence( $first, $last, $sigil, $backup )
e2902068 1016
1017Returns the ordered list of readings, starting with $first and ending
4e5a7b2c 1018with $last, for the witness given in $sigil. If a $backup sigil is
1019specified (e.g. when walking a layered witness), it will be used wherever
1020no $sigil path exists. If there is a base text reading, that will be
1021used wherever no path exists for $sigil or $backup.
e2902068 1022
1023=cut
1024
910a0a6d 1025# TODO Think about returning some lazy-eval iterator.
b0b4421a 1026# TODO Get rid of backup; we should know from what witness is whether we need it.
910a0a6d 1027
e2902068 1028sub reading_sequence {
861c3e27 1029 my( $self, $start, $end, $witness ) = @_;
e2902068 1030
930ff666 1031 $witness = $self->baselabel unless $witness;
e2902068 1032 my @readings = ( $start );
1033 my %seen;
1034 my $n = $start;
3a2ebbf4 1035 while( $n && $n->id ne $end->id ) {
1036 if( exists( $seen{$n->id} ) ) {
63778331 1037 throw( "Detected loop for $witness at " . $n->id );
910a0a6d 1038 }
3a2ebbf4 1039 $seen{$n->id} = 1;
910a0a6d 1040
861c3e27 1041 my $next = $self->next_reading( $n, $witness );
44771cf2 1042 unless( $next ) {
63778331 1043 throw( "Did not find any path for $witness from reading " . $n->id );
44771cf2 1044 }
910a0a6d 1045 push( @readings, $next );
1046 $n = $next;
e2902068 1047 }
1048 # Check that the last reading is our end reading.
1049 my $last = $readings[$#readings];
63778331 1050 throw( "Last reading found from " . $start->text .
1051 " for witness $witness is not the end!" ) # TODO do we get this far?
3a2ebbf4 1052 unless $last->id eq $end->id;
e2902068 1053
1054 return @readings;
1055}
1056
4e5a7b2c 1057=head2 next_reading( $reading, $sigil );
8e1394aa 1058
4a8828f0 1059Returns the reading that follows the given reading along the given witness
930ff666 1060path.
8e1394aa 1061
1062=cut
1063
4a8828f0 1064sub next_reading {
e2902068 1065 # Return the successor via the corresponding path.
8e1394aa 1066 my $self = shift;
3a2ebbf4 1067 my $answer = $self->_find_linked_reading( 'next', @_ );
2c669bca 1068 return undef unless $answer;
3a2ebbf4 1069 return $self->reading( $answer );
8e1394aa 1070}
1071
4e5a7b2c 1072=head2 prior_reading( $reading, $sigil )
8e1394aa 1073
4a8828f0 1074Returns the reading that precedes the given reading along the given witness
930ff666 1075path.
8e1394aa 1076
1077=cut
1078
4a8828f0 1079sub prior_reading {
e2902068 1080 # Return the predecessor via the corresponding path.
8e1394aa 1081 my $self = shift;
3a2ebbf4 1082 my $answer = $self->_find_linked_reading( 'prior', @_ );
1083 return $self->reading( $answer );
8e1394aa 1084}
1085
4a8828f0 1086sub _find_linked_reading {
861c3e27 1087 my( $self, $direction, $node, $path ) = @_;
1088
1089 # Get a backup if we are dealing with a layered witness
1090 my $alt_path;
1091 my $aclabel = $self->ac_label;
1092 if( $path && $path =~ /^(.*)\Q$aclabel\E$/ ) {
1093 $alt_path = $1;
1094 }
1095
e2902068 1096 my @linked_paths = $direction eq 'next'
3a2ebbf4 1097 ? $self->sequence->edges_from( $node )
1098 : $self->sequence->edges_to( $node );
e2902068 1099 return undef unless scalar( @linked_paths );
8e1394aa 1100
e2902068 1101 # We have to find the linked path that contains all of the
1102 # witnesses supplied in $path.
1103 my( @path_wits, @alt_path_wits );
4e5a7b2c 1104 @path_wits = sort( $self->_witnesses_of_label( $path ) ) if $path;
1105 @alt_path_wits = sort( $self->_witnesses_of_label( $alt_path ) ) if $alt_path;
e2902068 1106 my $base_le;
1107 my $alt_le;
1108 foreach my $le ( @linked_paths ) {
3a2ebbf4 1109 if( $self->sequence->has_edge_attribute( @$le, $self->baselabel ) ) {
910a0a6d 1110 $base_le = $le;
910a0a6d 1111 }
508fd430 1112 my @le_wits = sort $self->path_witnesses( $le );
3a2ebbf4 1113 if( _is_within( \@path_wits, \@le_wits ) ) {
1114 # This is the right path.
1115 return $direction eq 'next' ? $le->[1] : $le->[0];
1116 } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) {
1117 $alt_le = $le;
1118 }
8e1394aa 1119 }
e2902068 1120 # Got this far? Return the alternate path if it exists.
3a2ebbf4 1121 return $direction eq 'next' ? $alt_le->[1] : $alt_le->[0]
910a0a6d 1122 if $alt_le;
e2902068 1123
1124 # Got this far? Return the base path if it exists.
3a2ebbf4 1125 return $direction eq 'next' ? $base_le->[1] : $base_le->[0]
910a0a6d 1126 if $base_le;
e2902068 1127
1128 # Got this far? We have no appropriate path.
2c669bca 1129 warn "Could not find $direction node from " . $node->id
910a0a6d 1130 . " along path $path";
8e1394aa 1131 return undef;
1132}
1133
4a8828f0 1134# Some set logic.
1135sub _is_within {
1136 my( $set1, $set2 ) = @_;
7854e12e 1137 my $ret = @$set1; # will be 0, i.e. false, if set1 is empty
4a8828f0 1138 foreach my $el ( @$set1 ) {
910a0a6d 1139 $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
4a8828f0 1140 }
1141 return $ret;
1142}
1143
4e5a7b2c 1144# Return the string that joins together a list of witnesses for
1145# display on a single path.
1146sub _witnesses_of_label {
1147 my( $self, $label ) = @_;
1148 my $regex = $self->wit_list_separator;
1149 my @answer = split( /\Q$regex\E/, $label );
1150 return @answer;
b0b4421a 1151}
1152
d4b75f44 1153=head2 common_readings
1154
1155Returns the list of common readings in the graph (i.e. those readings that are
1156shared by all non-lacunose witnesses.)
1157
1158=cut
1159
1160sub common_readings {
1161 my $self = shift;
1162 my @common = grep { $_->is_common } $self->readings;
1163 return @common;
1164}
1165
b0b4421a 1166=head2 path_text( $sigil, $mainsigil [, $start, $end ] )
1167
1168Returns the text of a witness (plus its backup, if we are using a layer)
1169as stored in the collation. The text is returned as a string, where the
1170individual readings are joined with spaces and the meta-readings (e.g.
1171lacunae) are omitted. Optional specification of $start and $end allows
1172the generation of a subset of the witness text.
4e5a7b2c 1173
b0b4421a 1174=cut
1175
1176sub path_text {
861c3e27 1177 my( $self, $wit, $start, $end ) = @_;
b0b4421a 1178 $start = $self->start unless $start;
1179 $end = $self->end unless $end;
861c3e27 1180 my @path = grep { !$_->is_meta } $self->reading_sequence( $start, $end, $wit );
b0b4421a 1181 return join( ' ', map { $_->text } @path );
1182}
4e5a7b2c 1183
1184=head1 INITIALIZATION METHODS
1185
1186These are mostly for use by parsers.
1187
1188=head2 make_witness_path( $witness )
1189
1190Link the array of readings contained in $witness->path (and in
1191$witness->uncorrected_path if it exists) into collation paths.
1192Clear out the arrays when finished.
de51424a 1193
4e5a7b2c 1194=head2 make_witness_paths
1195
1196Call make_witness_path for all witnesses in the tradition.
1197
1198=cut
930ff666 1199
7e450e44 1200# For use when a collation is constructed from a base text and an apparatus.
1201# We have the sequences of readings and just need to add path edges.
1f7aa795 1202# When we are done, clear out the witness path attributes, as they are no
1203# longer needed.
1204# TODO Find a way to replace the witness path attributes with encapsulated functions?
e2902068 1205
6a222840 1206sub make_witness_paths {
1207 my( $self ) = @_;
910a0a6d 1208 foreach my $wit ( $self->tradition->witnesses ) {
0068967c 1209 # print STDERR "Making path for " . $wit->sigil . "\n";
910a0a6d 1210 $self->make_witness_path( $wit );
7854e12e 1211 }
7854e12e 1212}
1213
6a222840 1214sub make_witness_path {
7854e12e 1215 my( $self, $wit ) = @_;
1216 my @chain = @{$wit->path};
15d2d3df 1217 my $sig = $wit->sigil;
7854e12e 1218 foreach my $idx ( 0 .. $#chain-1 ) {
910a0a6d 1219 $self->add_path( $chain[$idx], $chain[$idx+1], $sig );
7854e12e 1220 }
1f7aa795 1221 if( $wit->is_layered ) {
d9e873d0 1222 @chain = @{$wit->uncorrected_path};
1223 foreach my $idx( 0 .. $#chain-1 ) {
1224 my $source = $chain[$idx];
1225 my $target = $chain[$idx+1];
1226 $self->add_path( $source, $target, $sig.$self->ac_label )
1227 unless $self->has_path( $source, $target, $sig );
1228 }
15d2d3df 1229 }
1f7aa795 1230 $wit->clear_path;
1231 $wit->clear_uncorrected_path;
e2902068 1232}
1233
4e5a7b2c 1234=head2 calculate_ranks
1235
1236Calculate the reading ranks (that is, their aligned positions relative
1237to each other) for the graph. This can only be called on linear collations.
1238
b365fbae 1239=begin testing
1240
1241use Text::Tradition;
1242
1243my $cxfile = 't/data/Collatex-16.xml';
1244my $t = Text::Tradition->new(
1245 'name' => 'inline',
1246 'input' => 'CollateX',
1247 'file' => $cxfile,
1248 );
1249my $c = $t->collation;
1250
1251# Make an svg
1252my $svg = $c->as_svg;
1253is( substr( $svg, 0, 5 ), '<?xml', "Got XML doc for svg" );
1254ok( $c->has_cached_svg, "SVG was cached" );
1255is( $c->as_svg, $svg, "Cached SVG returned upon second call" );
1256$c->calculate_ranks;
1257is( $c->as_svg, $svg, "Cached SVG retained with no rank change" );
1258$c->add_relationship( 'n9', 'n23', { 'type' => 'spelling' } );
1259isnt( $c->as_svg, $svg, "SVG changed after relationship add" );
1260
1261=end testing
1262
4e5a7b2c 1263=cut
1264
910a0a6d 1265sub calculate_ranks {
1266 my $self = shift;
b365fbae 1267 # Save the existing ranks, in case we need to invalidate the cached SVG.
1268 my %existing_ranks;
910a0a6d 1269 # Walk a version of the graph where every node linked by a relationship
1270 # edge is fundamentally the same node, and do a topological ranking on
1271 # the nodes in this graph.
c9bf3dbf 1272 my $topo_graph = Graph->new();
910a0a6d 1273 my %rel_containers;
1274 my $rel_ctr = 0;
1275 # Add the nodes
1276 foreach my $r ( $self->readings ) {
3a2ebbf4 1277 next if exists $rel_containers{$r->id};
910a0a6d 1278 my @rels = $r->related_readings( 'colocated' );
1279 if( @rels ) {
1280 # Make a relationship container.
1281 push( @rels, $r );
c9bf3dbf 1282 my $rn = 'rel_container_' . $rel_ctr++;
1283 $topo_graph->add_vertex( $rn );
910a0a6d 1284 foreach( @rels ) {
3a2ebbf4 1285 $rel_containers{$_->id} = $rn;
910a0a6d 1286 }
1287 } else {
1288 # Add a new node to mirror the old node.
3a2ebbf4 1289 $rel_containers{$r->id} = $r->id;
1290 $topo_graph->add_vertex( $r->id );
910a0a6d 1291 }
4a8828f0 1292 }
3a1f2523 1293
3a2ebbf4 1294 # Add the edges.
910a0a6d 1295 foreach my $r ( $self->readings ) {
b365fbae 1296 $existing_ranks{$r} = $r->rank;
3a2ebbf4 1297 foreach my $n ( $self->sequence->successors( $r->id ) ) {
1298 my( $tfrom, $tto ) = ( $rel_containers{$r->id},
1299 $rel_containers{$n} );
4e5a7b2c 1300 # $DB::single = 1 unless $tfrom && $tto;
3a2ebbf4 1301 $topo_graph->add_edge( $tfrom, $tto );
910a0a6d 1302 }
1303 }
1304
1305 # Now do the rankings, starting with the start node.
3a2ebbf4 1306 my $topo_start = $rel_containers{$self->start->id};
c9bf3dbf 1307 my $node_ranks = { $topo_start => 0 };
910a0a6d 1308 my @curr_origin = ( $topo_start );
1309 # A little iterative function.
1310 while( @curr_origin ) {
c9bf3dbf 1311 @curr_origin = _assign_rank( $topo_graph, $node_ranks, @curr_origin );
910a0a6d 1312 }
1313 # Transfer our rankings from the topological graph to the real one.
1314 foreach my $r ( $self->readings ) {
3a2ebbf4 1315 if( defined $node_ranks->{$rel_containers{$r->id}} ) {
1316 $r->rank( $node_ranks->{$rel_containers{$r->id}} );
67da8d6c 1317 } else {
63778331 1318 # Die. Find the last rank we calculated.
1319 my @all_defined = sort { $node_ranks->{$rel_containers{$a->id}}
1320 <=> $node_ranks->{$rel_containers{$b->id}} }
1321 $self->readings;
1322 my $last = pop @all_defined;
1323 throw( "Ranks not calculated after $last - do you have a cycle in the graph?" );
67da8d6c 1324 }
de51424a 1325 }
b365fbae 1326 # Do we need to invalidate the cached SVG?
1327 if( $self->has_cached_svg ) {
1328 foreach my $r ( $self->readings ) {
1329 next if $existing_ranks{$r} == $r->rank;
1330 $self->wipe_svg;
1331 last;
1332 }
1333 }
8e1394aa 1334}
3a1f2523 1335
910a0a6d 1336sub _assign_rank {
c9bf3dbf 1337 my( $graph, $node_ranks, @current_nodes ) = @_;
910a0a6d 1338 # Look at each of the children of @current_nodes. If all the child's
1339 # parents have a rank, assign it the highest rank + 1 and add it to
c9bf3dbf 1340 # @next_nodes. Otherwise skip it; we will return when the highest-ranked
1341 # parent gets a rank.
910a0a6d 1342 my @next_nodes;
1343 foreach my $c ( @current_nodes ) {
c9bf3dbf 1344 warn "Current reading $c has no rank!"
1345 unless exists $node_ranks->{$c};
1346 # print STDERR "Looking at child of node $c, rank "
1347 # . $node_ranks->{$c} . "\n";
1348 foreach my $child ( $graph->successors( $c ) ) {
1349 next if exists $node_ranks->{$child};
910a0a6d 1350 my $highest_rank = -1;
1351 my $skip = 0;
c9bf3dbf 1352 foreach my $parent ( $graph->predecessors( $child ) ) {
1353 if( exists $node_ranks->{$parent} ) {
1354 $highest_rank = $node_ranks->{$parent}
1355 if $highest_rank <= $node_ranks->{$parent};
910a0a6d 1356 } else {
1357 $skip = 1;
1358 last;
1359 }
1360 }
1361 next if $skip;
c9bf3dbf 1362 my $c_rank = $highest_rank + 1;
1363 # print STDERR "Assigning rank $c_rank to node $child \n";
1364 $node_ranks->{$child} = $c_rank;
910a0a6d 1365 push( @next_nodes, $child );
1366 }
1367 }
1368 return @next_nodes;
4cdd82f1 1369}
910a0a6d 1370
4e5a7b2c 1371=head2 flatten_ranks
1372
1373A convenience method for parsing collation data. Searches the graph for readings
1374with the same text at the same rank, and merges any that are found.
1375
1376=cut
1377
0e476982 1378sub flatten_ranks {
1379 my $self = shift;
1380 my %unique_rank_rdg;
1381 foreach my $rdg ( $self->readings ) {
1382 next unless $rdg->has_rank;
1383 my $key = $rdg->rank . "||" . $rdg->text;
1384 if( exists $unique_rank_rdg{$key} ) {
1385 # Combine!
56eefa04 1386 # print STDERR "Combining readings at same rank: $key\n";
0e476982 1387 $self->merge_readings( $unique_rank_rdg{$key}, $rdg );
0ecb975c 1388 # TODO see if this now makes a common point.
0e476982 1389 } else {
1390 $unique_rank_rdg{$key} = $rdg;
1391 }
1392 }
1393}
1394
4633f9e4 1395=head2 remove_collations
1396
1397Another convenience method for parsing. Removes all 'collation' relationships
1398that were defined in order to get the reading ranks to be correct.
1399
1400=begin testing
1401
1402use Text::Tradition;
1403
1404my $cxfile = 't/data/Collatex-16.xml';
1405my $t = Text::Tradition->new(
1406 'name' => 'inline',
1407 'input' => 'CollateX',
1408 'file' => $cxfile,
1409 );
1410my $c = $t->collation;
1411
1412isnt( $c->reading('n23')->rank, $c->reading('n9')->rank, "Rank skew exists" );
1413$c->add_relationship( 'n23', 'n9', { 'type' => 'collated', 'scope' => 'local' } );
1414is( scalar $c->relationships, 4, "Found all expected relationships" );
1415$c->remove_collations;
1416is( scalar $c->relationships, 3, "Collated relationships now gone" );
1417is( $c->reading('n23')->rank, $c->reading('n9')->rank, "Aligned ranks were preserved" );
1418
1419=end testing
1420
1421=cut
1422
1423sub remove_collations {
1424 my $self = shift;
1425 foreach my $reledge ( $self->relationships ) {
1426 my $relobj = $self->relations->get_relationship( $reledge );
1427 if( $relobj && $relobj->type eq 'collated' ) {
1428 $self->relations->delete_relationship( $reledge );
1429 }
1430 }
1431}
1432
1433
d4b75f44 1434=head2 calculate_common_readings
1435
1436Goes through the graph identifying the readings that appear in every witness
1437(apart from those with lacunae at that spot.) Marks them as common and returns
1438the list.
1439
1440=begin testing
1441
1442use Text::Tradition;
1443
1444my $cxfile = 't/data/Collatex-16.xml';
1445my $t = Text::Tradition->new(
1446 'name' => 'inline',
1447 'input' => 'CollateX',
1448 'file' => $cxfile,
1449 );
1450my $c = $t->collation;
1451
1452my @common = $c->calculate_common_readings();
1453is( scalar @common, 8, "Found correct number of common readings" );
1454my @marked = sort $c->common_readings();
1455is( scalar @common, 8, "All common readings got marked as such" );
1456my @expected = qw/ n1 n12 n16 n19 n20 n5 n6 n7 /;
1457is_deeply( \@marked, \@expected, "Found correct list of common readings" );
1458
1459=end testing
1460
1461=cut
1462
1463sub calculate_common_readings {
1464 my $self = shift;
1465 my @common;
1466 my $table = $self->make_alignment_table( 1 );
1467 foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
1468 my @row = map { $_->{'tokens'}->[$idx]->{'t'} } @{$table->{'alignment'}};
1469 my %hash;
1470 foreach my $r ( @row ) {
1471 if( $r ) {
1472 $hash{$r->id} = $r unless $r->is_meta;
1473 } else {
1474 $hash{'UNDEF'} = $r;
1475 }
1476 }
1477 if( keys %hash == 1 && !exists $hash{'UNDEF'} ) {
1478 my( $r ) = values %hash;
1479 $r->is_common( 1 );
1480 push( @common, $r );
1481 }
1482 }
1483 return @common;
1484}
1485
861c3e27 1486=head2 text_from_paths
1487
1488Calculate the text array for all witnesses from the path, for later consistency
1489checking. Only to be used if there is no non-graph-based way to know the
1490original texts.
1491
1492=cut
1493
1494sub text_from_paths {
1495 my $self = shift;
1496 foreach my $wit ( $self->tradition->witnesses ) {
1497 my @text = split( /\s+/,
1498 $self->reading_sequence( $self->start, $self->end, $wit->sigil ) );
1499 $wit->text( \@text );
1500 if( $wit->is_layered ) {
1501 my @uctext = split( /\s+/,
1502 $self->reading_sequence( $self->start, $self->end,
1503 $wit->sigil.$self->ac_label ) );
1504 $wit->text( \@uctext );
1505 }
1506 }
1507}
0e476982 1508
4e5a7b2c 1509=head1 UTILITY FUNCTIONS
1510
1511=head2 common_predecessor( $reading_a, $reading_b )
8e1394aa 1512
4e5a7b2c 1513Find the last reading that occurs in sequence before both the given readings.
1514
1515=head2 common_successor( $reading_a, $reading_b )
1516
1517Find the first reading that occurs in sequence after both the given readings.
1518
22222af9 1519=begin testing
1520
1521use Text::Tradition;
1522
1523my $cxfile = 't/data/Collatex-16.xml';
1524my $t = Text::Tradition->new(
1525 'name' => 'inline',
1526 'input' => 'CollateX',
1527 'file' => $cxfile,
1528 );
1529my $c = $t->collation;
1530
4e5a7b2c 1531is( $c->common_predecessor( 'n9', 'n23' )->id,
22222af9 1532 'n20', "Found correct common predecessor" );
4e5a7b2c 1533is( $c->common_successor( 'n9', 'n23' )->id,
22222af9 1534 '#END#', "Found correct common successor" );
1535
4e5a7b2c 1536is( $c->common_predecessor( 'n19', 'n17' )->id,
22222af9 1537 'n16', "Found correct common predecessor for readings on same path" );
4e5a7b2c 1538is( $c->common_successor( 'n21', 'n26' )->id,
22222af9 1539 '#END#', "Found correct common successor for readings on same path" );
1540
1541=end testing
1542
1543=cut
1544
1545## Return the closest reading that is a predecessor of both the given readings.
1546sub common_predecessor {
1547 my $self = shift;
4e5a7b2c 1548 my( $r1, $r2 ) = $self->_objectify_args( @_ );
027d819c 1549 return $self->_common_in_path( $r1, $r2, 'predecessors' );
22222af9 1550}
1551
1552sub common_successor {
1553 my $self = shift;
4e5a7b2c 1554 my( $r1, $r2 ) = $self->_objectify_args( @_ );
027d819c 1555 return $self->_common_in_path( $r1, $r2, 'successors' );
22222af9 1556}
1557
027d819c 1558sub _common_in_path {
22222af9 1559 my( $self, $r1, $r2, $dir ) = @_;
1560 my $iter = $r1->rank > $r2->rank ? $r1->rank : $r2->rank;
1561 $iter = $self->end->rank - $iter if $dir eq 'successors';
1562 my @candidates;
1563 my @last_checked = ( $r1, $r2 );
1564 my %all_seen;
1565 while( !@candidates ) {
1566 my @new_lc;
1567 foreach my $lc ( @last_checked ) {
1568 foreach my $p ( $lc->$dir ) {
1569 if( $all_seen{$p->id} ) {
1570 push( @candidates, $p );
1571 } else {
1572 $all_seen{$p->id} = 1;
1573 push( @new_lc, $p );
1574 }
1575 }
1576 }
1577 @last_checked = @new_lc;
1578 }
1579 my @answer = sort { $a->rank <=> $b->rank } @candidates;
1580 return $dir eq 'predecessors' ? pop( @answer ) : shift ( @answer );
1581}
1582
63778331 1583sub throw {
1584 Text::Tradition::Error->throw(
1585 'ident' => 'Collation error',
1586 'message' => $_[0],
1587 );
1588}
1589
dd3b58b0 1590no Moose;
1591__PACKAGE__->meta->make_immutable;
e867486f 1592
027d819c 1593=head1 LICENSE
e867486f 1594
027d819c 1595This package is free software and is provided "as is" without express
1596or implied warranty. You can redistribute it and/or modify it under
1597the same terms as Perl itself.
e867486f 1598
027d819c 1599=head1 AUTHOR
e867486f 1600
027d819c 1601Tara L Andrews E<lt>aurum@cpan.orgE<gt>