post-processing script to get rid of punctuation
[scpubgit/stemmatology.git] / lib / Text / Tradition / Collation.pm
CommitLineData
dd3b58b0 1package Text::Tradition::Collation;
d047cd52 2
910a0a6d 3use Encode qw( decode_utf8 );
4use File::Temp;
c9bf3dbf 5use Graph;
8e1394aa 6use IPC::Run qw( run binary );
910a0a6d 7use Text::CSV_XS;
b15511bf 8use Text::Tradition::Collation::Reading;
22222af9 9use Text::Tradition::Collation::RelationshipStore;
63778331 10use Text::Tradition::Error;
df6d9812 11use XML::LibXML;
a344ffcf 12use XML::LibXML::XPathContext;
dd3b58b0 13use Moose;
14
3a2ebbf4 15has 'sequence' => (
d047cd52 16 is => 'ro',
3a2ebbf4 17 isa => 'Graph',
18 default => sub { Graph->new() },
d047cd52 19 handles => {
3a2ebbf4 20 paths => 'edges',
d047cd52 21 },
d047cd52 22 );
3a2ebbf4 23
24has 'relations' => (
25 is => 'ro',
22222af9 26 isa => 'Text::Tradition::Collation::RelationshipStore',
27 handles => {
28 relationships => 'relationships',
29 related_readings => 'related_readings',
30 },
31 writer => '_set_relations',
3a2ebbf4 32 );
dd3b58b0 33
3a2ebbf4 34has 'tradition' => (
35 is => 'ro',
d047cd52 36 isa => 'Text::Tradition',
8d9a1cd8 37 weak_ref => 1,
d047cd52 38 );
dd3b58b0 39
3a2ebbf4 40has 'readings' => (
41 isa => 'HashRef[Text::Tradition::Collation::Reading]',
42 traits => ['Hash'],
43 handles => {
44 reading => 'get',
45 _add_reading => 'set',
46 del_reading => 'delete',
47 has_reading => 'exists',
48 readings => 'values',
49 },
50 default => sub { {} },
51 );
910a0a6d 52
4a8828f0 53has 'wit_list_separator' => (
7854e12e 54 is => 'rw',
55 isa => 'Str',
56 default => ', ',
57 );
58
59has 'baselabel' => (
60 is => 'rw',
61 isa => 'Str',
62 default => 'base text',
63 );
4a8828f0 64
15d2d3df 65has 'linear' => (
66 is => 'rw',
67 isa => 'Bool',
68 default => 1,
69 );
c84275ff 70
ef9d481f 71has 'ac_label' => (
72 is => 'rw',
73 isa => 'Str',
74 default => ' (a.c.)',
75 );
3a2ebbf4 76
77has 'start' => (
78 is => 'ro',
79 isa => 'Text::Tradition::Collation::Reading',
80 writer => '_set_start',
81 weak_ref => 1,
82 );
83
84has 'end' => (
85 is => 'ro',
86 isa => 'Text::Tradition::Collation::Reading',
87 writer => '_set_end',
88 weak_ref => 1,
89 );
1f563ac3 90
4e5a7b2c 91=head1 NAME
92
93Text::Tradition::Collation - a software model for a text collation
94
95=head1 SYNOPSIS
96
97 use Text::Tradition;
98 my $t = Text::Tradition->new(
99 'name' => 'this is a text',
100 'input' => 'TEI',
101 'file' => '/path/to/tei_parallel_seg_file.xml' );
102
103 my $c = $t->collation;
104 my @readings = $c->readings;
105 my @paths = $c->paths;
106 my @relationships = $c->relationships;
107
108 my $svg_variant_graph = $t->collation->as_svg();
109
110=head1 DESCRIPTION
111
112Text::Tradition is a library for representation and analysis of collated
113texts, particularly medieval ones. The Collation is the central feature of
114a Tradition, where the text, its sequence of readings, and its relationships
115between readings are actually kept.
116
117=head1 CONSTRUCTOR
118
119=head2 new
120
121The constructor. Takes a hash or hashref of the following arguments:
122
123=over
124
125=item * tradition - The Text::Tradition object to which the collation
126belongs. Required.
127
128=item * linear - Whether the collation should be linear; that is, whether
129transposed readings should be treated as two linked readings rather than one,
130and therefore whether the collation graph is acyclic. Defaults to true.
131
4e5a7b2c 132=item * baselabel - The default label for the path taken by a base text
133(if any). Defaults to 'base text'.
134
135=item * wit_list_separator - The string to join a list of witnesses for
136purposes of making labels in display graphs. Defaults to ', '.
137
138=item * ac_label - The extra label to tack onto a witness sigil when
139representing another layer of path for the given witness - that is, when
140a text has more than one possible reading due to scribal corrections or
141the like. Defaults to ' (a.c.)'.
142
143=back
144
145=head1 ACCESSORS
146
147=head2 tradition
148
149=head2 linear
150
4e5a7b2c 151=head2 wit_list_separator
152
153=head2 baselabel
154
155=head2 ac_label
156
157Simple accessors for collation attributes.
158
159=head2 start
160
161The meta-reading at the start of every witness path.
162
163=head2 end
164
165The meta-reading at the end of every witness path.
166
167=head2 readings
168
169Returns all Reading objects in the graph.
170
171=head2 reading( $id )
172
173Returns the Reading object corresponding to the given ID.
174
175=head2 add_reading( $reading_args )
176
177Adds a new reading object to the collation.
178See L<Text::Tradition::Collation::Reading> for the available arguments.
179
180=head2 del_reading( $object_or_id )
181
182Removes the given reading from the collation, implicitly removing its
183paths and relationships.
184
185=head2 merge_readings( $main, $second )
186
187Merges the $second reading into the $main one.
188The arguments may be either readings or reading IDs.
189
190=head2 has_reading( $id )
191
192Predicate to see whether a given reading ID is in the graph.
193
194=head2 reading_witnesses( $object_or_id )
195
196Returns a list of sigils whose witnesses contain the reading.
197
198=head2 paths
199
200Returns all reading paths within the document - that is, all edges in the
201collation graph. Each path is an arrayref of [ $source, $target ] reading IDs.
202
203=head2 add_path( $source, $target, $sigil )
204
205Links the given readings in the collation in sequence, under the given witness
206sigil. The readings may be specified by object or ID.
207
208=head2 del_path( $source, $target, $sigil )
209
210Links the given readings in the collation in sequence, under the given witness
211sigil. The readings may be specified by object or ID.
212
213=head2 has_path( $source, $target );
214
215Returns true if the two readings are linked in sequence in any witness.
216The readings may be specified by object or ID.
217
218=head2 relationships
219
220Returns all Relationship objects in the collation.
221
222=head2 add_relationship( $reading, $other_reading, $options )
223
224Adds a new relationship of the type given in $options between the two readings,
225which may be specified by object or ID. Returns a value of ( $status, @vectors)
226where $status is true on success, and @vectors is a list of relationship edges
227that were ultimately added.
228See L<Text::Tradition::Collation::Relationship> for the available options.
229
230=cut
dd3b58b0 231
d047cd52 232sub BUILD {
3a2ebbf4 233 my $self = shift;
22222af9 234 $self->_set_relations( Text::Tradition::Collation::RelationshipStore->new( 'collation' => $self ) );
3a2ebbf4 235 $self->_set_start( $self->add_reading( { 'collation' => $self, 'is_start' => 1 } ) );
236 $self->_set_end( $self->add_reading( { 'collation' => $self, 'is_end' => 1 } ) );
d047cd52 237}
784877d9 238
3a2ebbf4 239### Reading construct/destruct functions
240
241sub add_reading {
242 my( $self, $reading ) = @_;
243 unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
244 my %args = %$reading;
245 $reading = Text::Tradition::Collation::Reading->new(
246 'collation' => $self,
247 %args );
248 }
249 # First check to see if a reading with this ID exists.
250 if( $self->reading( $reading->id ) ) {
63778331 251 throw( "Collation already has a reading with id " . $reading->id );
3a2ebbf4 252 }
253 $self->_add_reading( $reading->id => $reading );
254 # Once the reading has been added, put it in both graphs.
255 $self->sequence->add_vertex( $reading->id );
22222af9 256 $self->relations->add_reading( $reading->id );
3a2ebbf4 257 return $reading;
eca16057 258};
259
3a2ebbf4 260around del_reading => sub {
261 my $orig = shift;
262 my $self = shift;
263 my $arg = shift;
264
265 if( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
266 $arg = $arg->id;
267 }
3a2ebbf4 268 # Remove the reading from the graphs.
269 $self->sequence->delete_vertex( $arg );
22222af9 270 $self->relations->delete_reading( $arg );
3a2ebbf4 271
272 # Carry on.
273 $self->$orig( $arg );
274};
7854e12e 275
3a2ebbf4 276# merge_readings( $main, $to_be_deleted );
7854e12e 277
3a2ebbf4 278sub merge_readings {
279 my $self = shift;
280
281 # We only need the IDs for adding paths to the graph, not the reading
282 # objects themselves.
49d4f2ac 283 my( $kept, $deleted, $combine_char ) = $self->_stringify_args( @_ );
3a2ebbf4 284
285 # The kept reading should inherit the paths and the relationships
286 # of the deleted reading.
287 foreach my $path ( $self->sequence->edges_at( $deleted ) ) {
288 my @vector = ( $kept );
289 push( @vector, $path->[1] ) if $path->[0] eq $deleted;
290 unshift( @vector, $path->[0] ) if $path->[1] eq $deleted;
49d4f2ac 291 next if $vector[0] eq $vector[1]; # Don't add a self loop
3a2ebbf4 292 my %wits = %{$self->sequence->get_edge_attributes( @$path )};
293 $self->sequence->add_edge( @vector );
294 my $fwits = $self->sequence->get_edge_attributes( @vector );
295 @wits{keys %$fwits} = values %$fwits;
296 $self->sequence->set_edge_attributes( @vector, \%wits );
297 }
22222af9 298 $self->relations->merge_readings( $kept, $deleted, $combine_char );
3a2ebbf4 299
300 # Do the deletion deed.
49d4f2ac 301 if( $combine_char ) {
302 my $kept_obj = $self->reading( $kept );
303 my $new_text = join( $combine_char, $kept_obj->text,
304 $self->reading( $deleted )->text );
305 $kept_obj->alter_text( $new_text );
306 }
3a2ebbf4 307 $self->del_reading( $deleted );
308}
7854e12e 309
3265b0ce 310
3a2ebbf4 311# Helper function for manipulating the graph.
312sub _stringify_args {
313 my( $self, $first, $second, $arg ) = @_;
314 $first = $first->id
315 if ref( $first ) eq 'Text::Tradition::Collation::Reading';
316 $second = $second->id
317 if ref( $second ) eq 'Text::Tradition::Collation::Reading';
318 return( $first, $second, $arg );
319}
df6d9812 320
4e5a7b2c 321# Helper function for manipulating the graph.
322sub _objectify_args {
323 my( $self, $first, $second, $arg ) = @_;
324 $first = $self->reading( $first )
325 unless ref( $first ) eq 'Text::Tradition::Collation::Reading';
326 $second = $self->reading( $second )
327 unless ref( $second ) eq 'Text::Tradition::Collation::Reading';
328 return( $first, $second, $arg );
329}
3a2ebbf4 330### Path logic
331
332sub add_path {
333 my $self = shift;
334
335 # We only need the IDs for adding paths to the graph, not the reading
336 # objects themselves.
337 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
338
339 # Connect the readings
340 $self->sequence->add_edge( $source, $target );
341 # Note the witness in question
342 $self->sequence->set_edge_attribute( $source, $target, $wit, 1 );
b15511bf 343};
344
3a2ebbf4 345sub del_path {
346 my $self = shift;
49d4f2ac 347 my @args;
348 if( ref( $_[0] ) eq 'ARRAY' ) {
349 my $e = shift @_;
350 @args = ( @$e, @_ );
351 } else {
352 @args = @_;
353 }
3a2ebbf4 354
355 # We only need the IDs for adding paths to the graph, not the reading
356 # objects themselves.
49d4f2ac 357 my( $source, $target, $wit ) = $self->_stringify_args( @args );
3a2ebbf4 358
359 if( $self->sequence->has_edge_attribute( $source, $target, $wit ) ) {
49d4f2ac 360 $self->sequence->delete_edge_attribute( $source, $target, $wit );
3a2ebbf4 361 }
362 unless( keys %{$self->sequence->get_edge_attributes( $source, $target )} ) {
363 $self->sequence->delete_edge( $source, $target );
364 }
784877d9 365}
366
3a2ebbf4 367
15d2d3df 368# Extra graph-alike utility
369sub has_path {
3a2ebbf4 370 my $self = shift;
371 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
372 return undef unless $self->sequence->has_edge( $source, $target );
373 return $self->sequence->has_edge_attribute( $source, $target, $wit );
b15511bf 374}
375
4e5a7b2c 376=head2 clear_witness( @sigil_list )
3a2ebbf4 377
4e5a7b2c 378Clear the given witnesses out of the collation entirely, removing references
379to them in paths, and removing readings that belong only to them. Should only
380be called via $tradition->del_witness.
3a2ebbf4 381
382=cut
383
4e5a7b2c 384sub clear_witness {
385 my( $self, @sigils ) = @_;
386
387 # Clear the witness(es) out of the paths
388 foreach my $e ( $self->paths ) {
389 foreach my $sig ( @sigils ) {
390 $self->del_path( $e, $sig );
391 }
392 }
393
394 # Clear out the newly unused readings
395 foreach my $r ( $self->readings ) {
396 unless( $self->reading_witnesses( $r ) ) {
397 $self->del_reading( $r );
398 }
399 }
400}
3a2ebbf4 401
402sub add_relationship {
403 my $self = shift;
22222af9 404 my( $source, $target, $opts ) = $self->_stringify_args( @_ );
63778331 405 my( @vectors ) = $self->relations->add_relationship( $source,
64ae6270 406 $self->reading( $source ), $target, $self->reading( $target ), $opts );
407 # Force a full rank recalculation every time. Yuck.
63778331 408 $self->calculate_ranks() if $self->end->has_rank;
409 return @vectors;
22222af9 410}
ef9d481f 411
22222af9 412=head2 reading_witnesses( $reading )
910a0a6d 413
22222af9 414Return a list of sigils corresponding to the witnesses in which the reading appears.
3265b0ce 415
22222af9 416=cut
1d310495 417
1d310495 418sub reading_witnesses {
419 my( $self, $reading ) = @_;
420 # We need only check either the incoming or the outgoing edges; I have
96dc90ec 421 # arbitrarily chosen "incoming". Thus, special-case the start node.
422 if( $reading eq $self->start ) {
423 return map { $_->sigil } $self->tradition->witnesses;
424 }
1d310495 425 my %all_witnesses;
426 foreach my $e ( $self->sequence->edges_to( $reading ) ) {
427 my $wits = $self->sequence->get_edge_attributes( @$e );
428 @all_witnesses{ keys %$wits } = 1;
429 }
430 return keys %all_witnesses;
910a0a6d 431}
432
4e5a7b2c 433=head1 OUTPUT METHODS
8e1394aa 434
0ecb975c 435=head2 as_svg( \%options )
8e1394aa 436
0068967c 437Returns an SVG string that represents the graph, via as_dot and graphviz.
0ecb975c 438See as_dot for a list of options.
8e1394aa 439
440=cut
441
442sub as_svg {
0ecb975c 443 my( $self, $opts ) = @_;
3a2ebbf4 444
8e1394aa 445 my @cmd = qw/dot -Tsvg/;
446 my( $svg, $err );
910a0a6d 447 my $dotfile = File::Temp->new();
a344ffcf 448 ## USE FOR DEBUGGING
eca16057 449 # $dotfile->unlink_on_destroy(0);
910a0a6d 450 binmode $dotfile, ':utf8';
0ecb975c 451 print $dotfile $self->as_dot( $opts );
910a0a6d 452 push( @cmd, $dotfile->filename );
453 run( \@cmd, ">", binary(), \$svg );
364c71b1 454 return decode_utf8( $svg );
8e1394aa 455}
456
b22576c6 457
0ecb975c 458=head2 as_dot( \%options )
b22576c6 459
0ecb975c 460Returns a string that is the collation graph expressed in dot
461(i.e. GraphViz) format. Options include:
b22576c6 462
0ecb975c 463=over 4
b22576c6 464
0ecb975c 465=item * from
b22576c6 466
0ecb975c 467=item * to
df6d9812 468
0ecb975c 469=item * color_common
470
471=back
df6d9812 472
473=cut
474
475sub as_dot {
0ecb975c 476 my( $self, $opts ) = @_;
477 my $startrank = $opts->{'from'} if $opts;
478 my $endrank = $opts->{'to'} if $opts;
479 my $color_common = $opts->{'color_common'} if $opts;
b22576c6 480
481 # Check the arguments
482 if( $startrank ) {
483 return if $endrank && $startrank > $endrank;
484 return if $startrank > $self->end->rank;
485 }
486 if( defined $endrank ) {
487 return if $endrank < 0;
f1b3b33a 488 $endrank = undef if $endrank == $self->end->rank;
b22576c6 489 }
490
df6d9812 491 # TODO consider making some of these things configurable
67da8d6c 492 my $graph_name = $self->tradition->name;
493 $graph_name =~ s/[^\w\s]//g;
494 $graph_name = join( '_', split( /\s+/, $graph_name ) );
f13b5582 495
496 my %graph_attrs = (
497 'rankdir' => 'LR',
498 'bgcolor' => 'none',
499 );
500 my %node_attrs = (
b8990398 501 'fontsize' => 14,
f13b5582 502 'fillcolor' => 'white',
503 'style' => 'filled',
504 'shape' => 'ellipse'
505 );
506 my %edge_attrs = (
507 'arrowhead' => 'open',
508 'color' => '#000000',
509 'fontcolor' => '#000000',
510 );
511
67da8d6c 512 my $dot = sprintf( "digraph %s {\n", $graph_name );
f13b5582 513 $dot .= "\tgraph " . _dot_attr_string( \%graph_attrs ) . ";\n";
514 $dot .= "\tnode " . _dot_attr_string( \%node_attrs ) . ";\n";
df6d9812 515
b22576c6 516 # Output substitute start/end readings if necessary
517 if( $startrank ) {
518 $dot .= "\t\"#SUBSTART#\" [ label=\"...\" ];\n";
519 }
520 if( $endrank ) {
521 $dot .= "\t\"#SUBEND#\" [ label=\"...\" ];\n";
522 }
30ddc24c 523
b22576c6 524 my %used; # Keep track of the readings that actually appear in the graph
30ddc24c 525 # Sort the readings by rank if we have ranks; this speeds layout.
526 my @all_readings = $self->end->has_rank
527 ? sort { $a->rank <=> $b->rank } $self->readings
528 : $self->readings;
529 foreach my $reading ( @all_readings ) {
b22576c6 530 # Only output readings within our rank range.
531 next if $startrank && $reading->rank < $startrank;
532 next if $endrank && $reading->rank > $endrank;
533 $used{$reading->id} = 1;
910a0a6d 534 # Need not output nodes without separate labels
3a2ebbf4 535 next if $reading->id eq $reading->text;
d4b75f44 536 my $rattrs;
30f0df34 537 my $label = $reading->text;
8f9cab7b 538 $label =~ s/\"/\\\"/g;
d4b75f44 539 $rattrs->{'label'} = $label;
0ecb975c 540 $rattrs->{'fillcolor'} = '#b3f36d' if $reading->is_common && $color_common;
d4b75f44 541 $dot .= sprintf( "\t\"%s\" %s;\n", $reading->id, _dot_attr_string( $rattrs ) );
df6d9812 542 }
3a2ebbf4 543
30ddc24c 544 # Add the real edges. Need to weight one edge per rank jump, in a
545 # continuous line.
546 my $weighted = $self->_add_edge_weights;
b22576c6 547 my @edges = $self->paths;
3bdec618 548 my( %substart, %subend );
b22576c6 549 foreach my $edge ( @edges ) {
550 # Do we need to output this edge?
508fd430 551 if( $used{$edge->[0]} && $used{$edge->[1]} ) {
027d819c 552 my $label = $self->_path_display_label( $self->path_witnesses( $edge ) );
f13b5582 553 my $variables = { %edge_attrs, 'label' => $label };
30ddc24c 554
b22576c6 555 # Account for the rank gap if necessary
30ddc24c 556 my $rank0 = $self->reading( $edge->[0] )->rank
557 if $self->reading( $edge->[0] )->has_rank;
558 my $rank1 = $self->reading( $edge->[1] )->rank
559 if $self->reading( $edge->[1] )->has_rank;
560 if( defined $rank0 && defined $rank1 && $rank1 - $rank0 > 1 ) {
561 $variables->{'minlen'} = $rank1 - $rank0;
562 }
563
564 # Add the calculated edge weights
565 if( exists $weighted->{$edge->[0]}
566 && $weighted->{$edge->[0]} eq $edge->[1] ) {
567 # $variables->{'color'} = 'red';
568 $variables->{'weight'} = 3.0;
b0b4421a 569 }
30ddc24c 570
508fd430 571 # EXPERIMENTAL: make edge width reflect no. of witnesses
572 my $extrawidth = scalar( $self->path_witnesses( $edge ) ) * 0.2;
573 $variables->{'penwidth'} = $extrawidth + 0.8; # gives 1 for a single wit
574
f13b5582 575 my $varopts = _dot_attr_string( $variables );
576 $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
577 $edge->[0], $edge->[1], $varopts );
3bdec618 578 } elsif( $used{$edge->[0]} ) {
579 $subend{$edge->[0]} = 1;
580 } elsif( $used{$edge->[1]} ) {
581 $substart{$edge->[1]} = 1;
b22576c6 582 }
df6d9812 583 }
3bdec618 584 # Add substitute start and end edges if necessary
585 foreach my $node ( keys %substart ) {
027d819c 586 my $witstr = $self->_path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) );
f13b5582 587 my $variables = { %edge_attrs, 'label' => $witstr };
588 my $varopts = _dot_attr_string( $variables );
589 $dot .= "\t\"#SUBSTART#\" -> \"$node\" $varopts;";
3bdec618 590 }
591 foreach my $node ( keys %subend ) {
027d819c 592 my $witstr = $self->_path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) );
f13b5582 593 my $variables = { %edge_attrs, 'label' => $witstr };
594 my $varopts = _dot_attr_string( $variables );
595 $dot .= "\t\"$node\" -> \"#SUBEND#\" $varopts;";
3bdec618 596 }
30ddc24c 597
df6d9812 598 $dot .= "}\n";
599 return $dot;
600}
601
f13b5582 602sub _dot_attr_string {
603 my( $hash ) = @_;
604 my @attrs;
605 foreach my $k ( sort keys %$hash ) {
606 my $v = $hash->{$k};
607 push( @attrs, $k.'="'.$v.'"' );
608 }
609 return( '[ ' . join( ', ', @attrs ) . ' ]' );
610}
611
30ddc24c 612sub _add_edge_weights {
613 my $self = shift;
614 # Walk the graph from START to END, choosing the successor node with
615 # the largest number of witness paths each time.
616 my $weighted = {};
617 my $curr = $self->start->id;
618 while( $curr ne $self->end->id ) {
619 my @succ = sort { $self->path_witnesses( $curr, $a )
620 <=> $self->path_witnesses( $curr, $b ) }
621 $self->sequence->successors( $curr );
622 my $next = pop @succ;
623 # Try to avoid lacunae in the weighted path.
624 while( $self->reading( $next )->is_lacuna && @succ ) {
625 $next = pop @succ;
626 }
627 $weighted->{$curr} = $next;
628 $curr = $next;
629 }
630 return $weighted;
631}
632
027d819c 633=head2 path_witnesses( $edge )
634
635Returns the list of sigils whose witnesses are associated with the given edge.
636The edge can be passed as either an array or an arrayref of ( $source, $target ).
637
638=cut
639
3a2ebbf4 640sub path_witnesses {
641 my( $self, @edge ) = @_;
642 # If edge is an arrayref, cope.
643 if( @edge == 1 && ref( $edge[0] ) eq 'ARRAY' ) {
644 my $e = shift @edge;
645 @edge = @$e;
646 }
647 my @wits = keys %{$self->sequence->get_edge_attributes( @edge )};
508fd430 648 return @wits;
3a2ebbf4 649}
650
027d819c 651sub _path_display_label {
508fd430 652 my $self = shift;
653 my @wits = sort @_;
8f9cab7b 654 my $maj = scalar( $self->tradition->witnesses ) * 0.6;
655 if( scalar @wits > $maj ) {
f13b5582 656 # TODO break out a.c. wits
8f9cab7b 657 return 'majority';
658 } else {
659 return join( ', ', @wits );
660 }
661}
662
663
4e5a7b2c 664=head2 as_graphml
8e1394aa 665
4e5a7b2c 666Returns a GraphML representation of the collation. The GraphML will contain
667two graphs. The first expresses the attributes of the readings and the witness
668paths that link them; the second expresses the relationships that link the
669readings. This is the native transfer format for a tradition.
8e1394aa 670
56eefa04 671=begin testing
672
673use Text::Tradition;
674
675my $READINGS = 311;
676my $PATHS = 361;
677
678my $datafile = 't/data/florilegium_tei_ps.xml';
679my $tradition = Text::Tradition->new( 'input' => 'TEI',
680 'name' => 'test0',
681 'file' => $datafile,
682 'linear' => 1 );
683
684ok( $tradition, "Got a tradition object" );
685is( scalar $tradition->witnesses, 13, "Found all witnesses" );
686ok( $tradition->collation, "Tradition has a collation" );
687
688my $c = $tradition->collation;
689is( scalar $c->readings, $READINGS, "Collation has all readings" );
690is( scalar $c->paths, $PATHS, "Collation has all paths" );
691is( scalar $c->relationships, 0, "Collation has all relationships" );
692
693# Add a few relationships
694$c->add_relationship( 'w123', 'w125', { 'type' => 'collated' } );
695$c->add_relationship( 'w193', 'w196', { 'type' => 'collated' } );
696$c->add_relationship( 'w257', 'w262', { 'type' => 'transposition' } );
697
698# Now write it to GraphML and parse it again.
699
700my $graphml = $c->as_graphml;
701my $st = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml );
702is( scalar $st->collation->readings, $READINGS, "Reparsed collation has all readings" );
703is( scalar $st->collation->paths, $PATHS, "Reparsed collation has all paths" );
704is( scalar $st->collation->relationships, 3, "Reparsed collation has new relationships" );
705
706=end testing
707
8e1394aa 708=cut
709
710sub as_graphml {
3a2ebbf4 711 my( $self ) = @_;
8e1394aa 712
713 # Some namespaces
714 my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns';
715 my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance';
716 my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' .
910a0a6d 717 'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
8e1394aa 718
719 # Create the document and root node
720 my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
721 my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
722 $graphml->setDocumentElement( $root );
723 $root->setNamespace( $xsi_ns, 'xsi', 0 );
724 $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
725
e309421a 726 # Add the data keys for the graph
727 my %graph_data_keys;
728 my $gdi = 0;
1d310495 729 my @graph_attributes = qw/ version wit_list_separator baselabel linear ac_label /;
e309421a 730 foreach my $datum ( @graph_attributes ) {
731 $graph_data_keys{$datum} = 'dg'.$gdi++;
732 my $key = $root->addNewChild( $graphml_ns, 'key' );
733 $key->setAttribute( 'attr.name', $datum );
734 $key->setAttribute( 'attr.type', $key eq 'linear' ? 'boolean' : 'string' );
735 $key->setAttribute( 'for', 'graph' );
736 $key->setAttribute( 'id', $graph_data_keys{$datum} );
737 }
f6066bac 738
8e1394aa 739 # Add the data keys for nodes
ef9d481f 740 my %node_data_keys;
741 my $ndi = 0;
3a2ebbf4 742 my %node_data = (
743 id => 'string',
255875b8 744 text => 'string',
3a2ebbf4 745 rank => 'string',
746 is_start => 'boolean',
747 is_end => 'boolean',
748 is_lacuna => 'boolean',
749 );
750 foreach my $datum ( keys %node_data ) {
910a0a6d 751 $node_data_keys{$datum} = 'dn'.$ndi++;
752 my $key = $root->addNewChild( $graphml_ns, 'key' );
753 $key->setAttribute( 'attr.name', $datum );
3a2ebbf4 754 $key->setAttribute( 'attr.type', $node_data{$datum} );
910a0a6d 755 $key->setAttribute( 'for', 'node' );
756 $key->setAttribute( 'id', $node_data_keys{$datum} );
8e1394aa 757 }
758
df6d9812 759 # Add the data keys for edges, i.e. witnesses
ef9d481f 760 my $edi = 0;
761 my %edge_data_keys;
3a2ebbf4 762 my %edge_data = (
f523c7a8 763 class => 'string', # Class, deprecated soon
3a2ebbf4 764 witness => 'string', # ID/label for a path
765 relationship => 'string', # ID/label for a relationship
766 extra => 'boolean', # Path key
c84275ff 767 scope => 'string', # Relationship key
3a2ebbf4 768 non_correctable => 'boolean', # Relationship key
769 non_independent => 'boolean', # Relationship key
770 );
771 foreach my $datum ( keys %edge_data ) {
772 $edge_data_keys{$datum} = 'de'.$edi++;
910a0a6d 773 my $key = $root->addNewChild( $graphml_ns, 'key' );
3a2ebbf4 774 $key->setAttribute( 'attr.name', $datum );
775 $key->setAttribute( 'attr.type', $edge_data{$datum} );
910a0a6d 776 $key->setAttribute( 'for', 'edge' );
3a2ebbf4 777 $key->setAttribute( 'id', $edge_data_keys{$datum} );
8e1394aa 778 }
3a2ebbf4 779
22222af9 780 # Add the collation graph itself
2c669bca 781 my $sgraph = $root->addNewChild( $graphml_ns, 'graph' );
782 $sgraph->setAttribute( 'edgedefault', 'directed' );
783 $sgraph->setAttribute( 'id', $self->tradition->name );
784 $sgraph->setAttribute( 'parse.edgeids', 'canonical' );
785 $sgraph->setAttribute( 'parse.edges', scalar($self->paths) );
786 $sgraph->setAttribute( 'parse.nodeids', 'canonical' );
787 $sgraph->setAttribute( 'parse.nodes', scalar($self->readings) );
788 $sgraph->setAttribute( 'parse.order', 'nodesfirst' );
22222af9 789
e309421a 790 # Collation attribute data
791 foreach my $datum ( @graph_attributes ) {
2c669bca 792 my $value = $datum eq 'version' ? '3.0' : $self->$datum;
793 _add_graphml_data( $sgraph, $graph_data_keys{$datum}, $value );
e309421a 794 }
8e1394aa 795
796 my $node_ctr = 0;
797 my %node_hash;
22222af9 798 # Add our readings to the graph
3a2ebbf4 799 foreach my $n ( sort { $a->id cmp $b->id } $self->readings ) {
2c669bca 800 # Add to the main graph
801 my $node_el = $sgraph->addNewChild( $graphml_ns, 'node' );
910a0a6d 802 my $node_xmlid = 'n' . $node_ctr++;
3a2ebbf4 803 $node_hash{ $n->id } = $node_xmlid;
910a0a6d 804 $node_el->setAttribute( 'id', $node_xmlid );
255875b8 805 foreach my $d ( keys %node_data ) {
806 my $nval = $n->$d;
807 _add_graphml_data( $node_el, $node_data_keys{$d}, $nval )
808 if defined $nval;
809 }
b15511bf 810 }
811
2c669bca 812 # Add the path edges to the sequence graph
df6d9812 813 my $edge_ctr = 0;
3a2ebbf4 814 foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->sequence->edges() ) {
815 # We add an edge in the graphml for every witness in $e.
508fd430 816 foreach my $wit ( sort $self->path_witnesses( $e ) ) {
3a2ebbf4 817 my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
818 $node_hash{ $e->[0] },
819 $node_hash{ $e->[1] } );
2c669bca 820 my $edge_el = $sgraph->addNewChild( $graphml_ns, 'edge' );
3a2ebbf4 821 $edge_el->setAttribute( 'source', $from );
822 $edge_el->setAttribute( 'target', $to );
823 $edge_el->setAttribute( 'id', $id );
3a2ebbf4 824
825 # It's a witness path, so add the witness
826 my $base = $wit;
827 my $key = $edge_data_keys{'witness'};
828 # Is this an ante-corr witness?
829 my $aclabel = $self->ac_label;
830 if( $wit =~ /^(.*)\Q$aclabel\E$/ ) {
831 # Keep the base witness
832 $base = $1;
833 # ...and record that this is an 'extra' reading path
834 _add_graphml_data( $edge_el, $edge_data_keys{'extra'}, $aclabel );
835 }
836 _add_graphml_data( $edge_el, $edge_data_keys{'witness'}, $base );
f523c7a8 837 _add_graphml_data( $edge_el, $edge_data_keys{'class'}, 'path' );
3a2ebbf4 838 }
839 }
840
22222af9 841 # Add the relationship graph to the XML
027d819c 842 $self->relations->_as_graphml( $graphml_ns, $root, \%node_hash,
2626f709 843 $node_data_keys{'id'}, \%edge_data_keys );
8e1394aa 844
94c00c71 845 # Save and return the thing
846 my $result = decode_utf8( $graphml->toString(1) );
94c00c71 847 return $result;
df6d9812 848}
849
b15511bf 850sub _add_graphml_data {
851 my( $el, $key, $value ) = @_;
b15511bf 852 return unless defined $value;
c9bf3dbf 853 my $data_el = $el->addNewChild( $el->namespaceURI, 'data' );
b15511bf 854 $data_el->setAttribute( 'key', $key );
855 $data_el->appendText( $value );
8e1394aa 856}
857
4e5a7b2c 858=head2 as_csv
910a0a6d 859
860Returns a CSV alignment table representation of the collation graph, one
2c669bca 861row per witness (or witness uncorrected.)
910a0a6d 862
863=cut
864
865sub as_csv {
3a2ebbf4 866 my( $self ) = @_;
910a0a6d 867 my $table = $self->make_alignment_table;
868 my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } );
869 my @result;
2c669bca 870 # Make the header row
871 $csv->combine( map { $_->{'witness'} } @{$table->{'alignment'}} );
872 push( @result, decode_utf8( $csv->string ) );
873 # Make the rest of the rows
874 foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
566f4595 875 my @rowobjs = map { $_->{'tokens'}->[$idx] } @{$table->{'alignment'}};
2c669bca 876 my @row = map { $_ ? $_->{'t'} : $_ } @rowobjs;
877 $csv->combine( @row );
910a0a6d 878 push( @result, decode_utf8( $csv->string ) );
879 }
3a2ebbf4 880 return join( "\n", @result );
910a0a6d 881}
882
4e5a7b2c 883=head2 make_alignment_table( $use_refs, $include_witnesses )
2c669bca 884
566f4595 885Return a reference to an alignment table, in a slightly enhanced CollateX
886format which looks like this:
887
888 $table = { alignment => [ { witness => "SIGIL",
4e5a7b2c 889 tokens => [ { t => "TEXT" }, ... ] },
566f4595 890 { witness => "SIG2",
4e5a7b2c 891 tokens => [ { t => "TEXT" }, ... ] },
566f4595 892 ... ],
893 length => TEXTLEN };
894
895If $use_refs is set to 1, the reading object is returned in the table
896instead of READINGTEXT; if not, the text of the reading is returned.
4e5a7b2c 897
898If $include_witnesses is set to a hashref, only the witnesses whose sigil
566f4595 899keys have a true hash value will be included.
2c669bca 900
901=cut
9f3ba6f7 902
910a0a6d 903sub make_alignment_table {
08e0fb85 904 my( $self, $noderefs, $include ) = @_;
0ecb975c 905 # Make sure we can do this
906 throw( "Need a linear graph in order to make an alignment table" )
907 unless $self->linear;
908 $self->calculate_ranks unless $self->end->has_rank;
909
2c669bca 910 my $table = { 'alignment' => [], 'length' => $self->end->rank - 1 };
3a2ebbf4 911 my @all_pos = ( 1 .. $self->end->rank - 1 );
68454b71 912 foreach my $wit ( sort { $a->sigil cmp $b->sigil } $self->tradition->witnesses ) {
2c669bca 913 if( $include ) {
566f4595 914 next unless $include->{$wit->sigil};
2c669bca 915 }
eca16057 916 # print STDERR "Making witness row(s) for " . $wit->sigil . "\n";
1f7aa795 917 my @wit_path = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
918 my @row = _make_witness_row( \@wit_path, \@all_pos, $noderefs );
2c669bca 919 push( @{$table->{'alignment'}},
920 { 'witness' => $wit->sigil, 'tokens' => \@row } );
1f7aa795 921 if( $wit->is_layered ) {
922 my @wit_ac_path = $self->reading_sequence( $self->start, $self->end,
861c3e27 923 $wit->sigil.$self->ac_label );
1f7aa795 924 my @ac_row = _make_witness_row( \@wit_ac_path, \@all_pos, $noderefs );
2c669bca 925 push( @{$table->{'alignment'}},
926 { 'witness' => $wit->sigil.$self->ac_label, 'tokens' => \@ac_row } );
910a0a6d 927 }
928 }
2c669bca 929 return $table;
910a0a6d 930}
931
932sub _make_witness_row {
0e476982 933 my( $path, $positions, $noderefs ) = @_;
910a0a6d 934 my %char_hash;
935 map { $char_hash{$_} = undef } @$positions;
2c669bca 936 my $debug = 0;
910a0a6d 937 foreach my $rdg ( @$path ) {
eca16057 938 my $rtext = $rdg->text;
939 $rtext = '#LACUNA#' if $rdg->is_lacuna;
2c669bca 940 print STDERR "rank " . $rdg->rank . "\n" if $debug;
3a2ebbf4 941 # print STDERR "No rank for " . $rdg->id . "\n" unless defined $rdg->rank;
2c669bca 942 $char_hash{$rdg->rank} = $noderefs ? { 't' => $rdg }
943 : { 't' => $rtext };
910a0a6d 944 }
945 my @row = map { $char_hash{$_} } @$positions;
eca16057 946 # Fill in lacuna markers for undef spots in the row
947 my $last_el = shift @row;
948 my @filled_row = ( $last_el );
949 foreach my $el ( @row ) {
0e476982 950 # If we are using node reference, make the lacuna node appear many times
951 # in the table. If not, use the lacuna tag.
952 if( $last_el && _el_is_lacuna( $last_el ) && !defined $el ) {
566f4595 953 $el = $noderefs ? $last_el : { 't' => '#LACUNA#' };
eca16057 954 }
955 push( @filled_row, $el );
956 $last_el = $el;
957 }
958 return @filled_row;
910a0a6d 959}
960
0e476982 961# Tiny utility function to say if a table element is a lacuna
962sub _el_is_lacuna {
963 my $el = shift;
2c669bca 964 return 1 if $el->{'t'} eq '#LACUNA#';
965 return 1 if ref( $el->{'t'} ) eq 'Text::Tradition::Collation::Reading'
966 && $el->{'t'}->is_lacuna;
0e476982 967 return 0;
968}
969
910a0a6d 970# Helper to turn the witnesses along columns rather than rows. Assumes
971# equal-sized rows.
972sub _turn_table {
973 my( $table ) = @_;
974 my $result = [];
975 return $result unless scalar @$table;
976 my $nrows = scalar @{$table->[0]};
977 foreach my $idx ( 0 .. $nrows - 1 ) {
978 foreach my $wit ( 0 .. $#{$table} ) {
979 $result->[$idx]->[$wit] = $table->[$wit]->[$idx];
980 }
981 }
982 return $result;
983}
984
4e5a7b2c 985=head1 NAVIGATION METHODS
910a0a6d 986
4e5a7b2c 987=head2 reading_sequence( $first, $last, $sigil, $backup )
e2902068 988
989Returns the ordered list of readings, starting with $first and ending
4e5a7b2c 990with $last, for the witness given in $sigil. If a $backup sigil is
991specified (e.g. when walking a layered witness), it will be used wherever
992no $sigil path exists. If there is a base text reading, that will be
993used wherever no path exists for $sigil or $backup.
e2902068 994
995=cut
996
910a0a6d 997# TODO Think about returning some lazy-eval iterator.
b0b4421a 998# TODO Get rid of backup; we should know from what witness is whether we need it.
910a0a6d 999
e2902068 1000sub reading_sequence {
861c3e27 1001 my( $self, $start, $end, $witness ) = @_;
e2902068 1002
930ff666 1003 $witness = $self->baselabel unless $witness;
e2902068 1004 my @readings = ( $start );
1005 my %seen;
1006 my $n = $start;
3a2ebbf4 1007 while( $n && $n->id ne $end->id ) {
1008 if( exists( $seen{$n->id} ) ) {
63778331 1009 throw( "Detected loop for $witness at " . $n->id );
910a0a6d 1010 }
3a2ebbf4 1011 $seen{$n->id} = 1;
910a0a6d 1012
861c3e27 1013 my $next = $self->next_reading( $n, $witness );
44771cf2 1014 unless( $next ) {
63778331 1015 throw( "Did not find any path for $witness from reading " . $n->id );
44771cf2 1016 }
910a0a6d 1017 push( @readings, $next );
1018 $n = $next;
e2902068 1019 }
1020 # Check that the last reading is our end reading.
1021 my $last = $readings[$#readings];
63778331 1022 throw( "Last reading found from " . $start->text .
1023 " for witness $witness is not the end!" ) # TODO do we get this far?
3a2ebbf4 1024 unless $last->id eq $end->id;
e2902068 1025
1026 return @readings;
1027}
1028
4e5a7b2c 1029=head2 next_reading( $reading, $sigil );
8e1394aa 1030
4a8828f0 1031Returns the reading that follows the given reading along the given witness
930ff666 1032path.
8e1394aa 1033
1034=cut
1035
4a8828f0 1036sub next_reading {
e2902068 1037 # Return the successor via the corresponding path.
8e1394aa 1038 my $self = shift;
3a2ebbf4 1039 my $answer = $self->_find_linked_reading( 'next', @_ );
2c669bca 1040 return undef unless $answer;
3a2ebbf4 1041 return $self->reading( $answer );
8e1394aa 1042}
1043
4e5a7b2c 1044=head2 prior_reading( $reading, $sigil )
8e1394aa 1045
4a8828f0 1046Returns the reading that precedes the given reading along the given witness
930ff666 1047path.
8e1394aa 1048
1049=cut
1050
4a8828f0 1051sub prior_reading {
e2902068 1052 # Return the predecessor via the corresponding path.
8e1394aa 1053 my $self = shift;
3a2ebbf4 1054 my $answer = $self->_find_linked_reading( 'prior', @_ );
1055 return $self->reading( $answer );
8e1394aa 1056}
1057
4a8828f0 1058sub _find_linked_reading {
861c3e27 1059 my( $self, $direction, $node, $path ) = @_;
1060
1061 # Get a backup if we are dealing with a layered witness
1062 my $alt_path;
1063 my $aclabel = $self->ac_label;
1064 if( $path && $path =~ /^(.*)\Q$aclabel\E$/ ) {
1065 $alt_path = $1;
1066 }
1067
e2902068 1068 my @linked_paths = $direction eq 'next'
3a2ebbf4 1069 ? $self->sequence->edges_from( $node )
1070 : $self->sequence->edges_to( $node );
e2902068 1071 return undef unless scalar( @linked_paths );
8e1394aa 1072
e2902068 1073 # We have to find the linked path that contains all of the
1074 # witnesses supplied in $path.
1075 my( @path_wits, @alt_path_wits );
4e5a7b2c 1076 @path_wits = sort( $self->_witnesses_of_label( $path ) ) if $path;
1077 @alt_path_wits = sort( $self->_witnesses_of_label( $alt_path ) ) if $alt_path;
e2902068 1078 my $base_le;
1079 my $alt_le;
1080 foreach my $le ( @linked_paths ) {
3a2ebbf4 1081 if( $self->sequence->has_edge_attribute( @$le, $self->baselabel ) ) {
910a0a6d 1082 $base_le = $le;
910a0a6d 1083 }
508fd430 1084 my @le_wits = sort $self->path_witnesses( $le );
3a2ebbf4 1085 if( _is_within( \@path_wits, \@le_wits ) ) {
1086 # This is the right path.
1087 return $direction eq 'next' ? $le->[1] : $le->[0];
1088 } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) {
1089 $alt_le = $le;
1090 }
8e1394aa 1091 }
e2902068 1092 # Got this far? Return the alternate path if it exists.
3a2ebbf4 1093 return $direction eq 'next' ? $alt_le->[1] : $alt_le->[0]
910a0a6d 1094 if $alt_le;
e2902068 1095
1096 # Got this far? Return the base path if it exists.
3a2ebbf4 1097 return $direction eq 'next' ? $base_le->[1] : $base_le->[0]
910a0a6d 1098 if $base_le;
e2902068 1099
1100 # Got this far? We have no appropriate path.
2c669bca 1101 warn "Could not find $direction node from " . $node->id
910a0a6d 1102 . " along path $path";
8e1394aa 1103 return undef;
1104}
1105
4a8828f0 1106# Some set logic.
1107sub _is_within {
1108 my( $set1, $set2 ) = @_;
7854e12e 1109 my $ret = @$set1; # will be 0, i.e. false, if set1 is empty
4a8828f0 1110 foreach my $el ( @$set1 ) {
910a0a6d 1111 $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
4a8828f0 1112 }
1113 return $ret;
1114}
1115
4e5a7b2c 1116# Return the string that joins together a list of witnesses for
1117# display on a single path.
1118sub _witnesses_of_label {
1119 my( $self, $label ) = @_;
1120 my $regex = $self->wit_list_separator;
1121 my @answer = split( /\Q$regex\E/, $label );
1122 return @answer;
b0b4421a 1123}
1124
d4b75f44 1125=head2 common_readings
1126
1127Returns the list of common readings in the graph (i.e. those readings that are
1128shared by all non-lacunose witnesses.)
1129
1130=cut
1131
1132sub common_readings {
1133 my $self = shift;
1134 my @common = grep { $_->is_common } $self->readings;
1135 return @common;
1136}
1137
b0b4421a 1138=head2 path_text( $sigil, $mainsigil [, $start, $end ] )
1139
1140Returns the text of a witness (plus its backup, if we are using a layer)
1141as stored in the collation. The text is returned as a string, where the
1142individual readings are joined with spaces and the meta-readings (e.g.
1143lacunae) are omitted. Optional specification of $start and $end allows
1144the generation of a subset of the witness text.
4e5a7b2c 1145
b0b4421a 1146=cut
1147
1148sub path_text {
861c3e27 1149 my( $self, $wit, $start, $end ) = @_;
b0b4421a 1150 $start = $self->start unless $start;
1151 $end = $self->end unless $end;
861c3e27 1152 my @path = grep { !$_->is_meta } $self->reading_sequence( $start, $end, $wit );
b0b4421a 1153 return join( ' ', map { $_->text } @path );
1154}
4e5a7b2c 1155
1156=head1 INITIALIZATION METHODS
1157
1158These are mostly for use by parsers.
1159
1160=head2 make_witness_path( $witness )
1161
1162Link the array of readings contained in $witness->path (and in
1163$witness->uncorrected_path if it exists) into collation paths.
1164Clear out the arrays when finished.
de51424a 1165
4e5a7b2c 1166=head2 make_witness_paths
1167
1168Call make_witness_path for all witnesses in the tradition.
1169
1170=cut
930ff666 1171
7e450e44 1172# For use when a collation is constructed from a base text and an apparatus.
1173# We have the sequences of readings and just need to add path edges.
1f7aa795 1174# When we are done, clear out the witness path attributes, as they are no
1175# longer needed.
1176# TODO Find a way to replace the witness path attributes with encapsulated functions?
e2902068 1177
6a222840 1178sub make_witness_paths {
1179 my( $self ) = @_;
910a0a6d 1180 foreach my $wit ( $self->tradition->witnesses ) {
0068967c 1181 # print STDERR "Making path for " . $wit->sigil . "\n";
910a0a6d 1182 $self->make_witness_path( $wit );
7854e12e 1183 }
7854e12e 1184}
1185
6a222840 1186sub make_witness_path {
7854e12e 1187 my( $self, $wit ) = @_;
1188 my @chain = @{$wit->path};
15d2d3df 1189 my $sig = $wit->sigil;
7854e12e 1190 foreach my $idx ( 0 .. $#chain-1 ) {
910a0a6d 1191 $self->add_path( $chain[$idx], $chain[$idx+1], $sig );
7854e12e 1192 }
1f7aa795 1193 if( $wit->is_layered ) {
d9e873d0 1194 @chain = @{$wit->uncorrected_path};
1195 foreach my $idx( 0 .. $#chain-1 ) {
1196 my $source = $chain[$idx];
1197 my $target = $chain[$idx+1];
1198 $self->add_path( $source, $target, $sig.$self->ac_label )
1199 unless $self->has_path( $source, $target, $sig );
1200 }
15d2d3df 1201 }
1f7aa795 1202 $wit->clear_path;
1203 $wit->clear_uncorrected_path;
e2902068 1204}
1205
4e5a7b2c 1206=head2 calculate_ranks
1207
1208Calculate the reading ranks (that is, their aligned positions relative
1209to each other) for the graph. This can only be called on linear collations.
1210
1211=cut
1212
910a0a6d 1213sub calculate_ranks {
1214 my $self = shift;
1215 # Walk a version of the graph where every node linked by a relationship
1216 # edge is fundamentally the same node, and do a topological ranking on
1217 # the nodes in this graph.
c9bf3dbf 1218 my $topo_graph = Graph->new();
910a0a6d 1219 my %rel_containers;
1220 my $rel_ctr = 0;
1221 # Add the nodes
1222 foreach my $r ( $self->readings ) {
3a2ebbf4 1223 next if exists $rel_containers{$r->id};
910a0a6d 1224 my @rels = $r->related_readings( 'colocated' );
1225 if( @rels ) {
1226 # Make a relationship container.
1227 push( @rels, $r );
c9bf3dbf 1228 my $rn = 'rel_container_' . $rel_ctr++;
1229 $topo_graph->add_vertex( $rn );
910a0a6d 1230 foreach( @rels ) {
3a2ebbf4 1231 $rel_containers{$_->id} = $rn;
910a0a6d 1232 }
1233 } else {
1234 # Add a new node to mirror the old node.
3a2ebbf4 1235 $rel_containers{$r->id} = $r->id;
1236 $topo_graph->add_vertex( $r->id );
910a0a6d 1237 }
4a8828f0 1238 }
3a1f2523 1239
3a2ebbf4 1240 # Add the edges.
910a0a6d 1241 foreach my $r ( $self->readings ) {
3a2ebbf4 1242 foreach my $n ( $self->sequence->successors( $r->id ) ) {
1243 my( $tfrom, $tto ) = ( $rel_containers{$r->id},
1244 $rel_containers{$n} );
4e5a7b2c 1245 # $DB::single = 1 unless $tfrom && $tto;
3a2ebbf4 1246 $topo_graph->add_edge( $tfrom, $tto );
910a0a6d 1247 }
1248 }
1249
1250 # Now do the rankings, starting with the start node.
3a2ebbf4 1251 my $topo_start = $rel_containers{$self->start->id};
c9bf3dbf 1252 my $node_ranks = { $topo_start => 0 };
910a0a6d 1253 my @curr_origin = ( $topo_start );
1254 # A little iterative function.
1255 while( @curr_origin ) {
c9bf3dbf 1256 @curr_origin = _assign_rank( $topo_graph, $node_ranks, @curr_origin );
910a0a6d 1257 }
1258 # Transfer our rankings from the topological graph to the real one.
1259 foreach my $r ( $self->readings ) {
3a2ebbf4 1260 if( defined $node_ranks->{$rel_containers{$r->id}} ) {
1261 $r->rank( $node_ranks->{$rel_containers{$r->id}} );
67da8d6c 1262 } else {
63778331 1263 # Die. Find the last rank we calculated.
1264 my @all_defined = sort { $node_ranks->{$rel_containers{$a->id}}
1265 <=> $node_ranks->{$rel_containers{$b->id}} }
1266 $self->readings;
1267 my $last = pop @all_defined;
1268 throw( "Ranks not calculated after $last - do you have a cycle in the graph?" );
67da8d6c 1269 }
de51424a 1270 }
8e1394aa 1271}
3a1f2523 1272
910a0a6d 1273sub _assign_rank {
c9bf3dbf 1274 my( $graph, $node_ranks, @current_nodes ) = @_;
910a0a6d 1275 # Look at each of the children of @current_nodes. If all the child's
1276 # parents have a rank, assign it the highest rank + 1 and add it to
c9bf3dbf 1277 # @next_nodes. Otherwise skip it; we will return when the highest-ranked
1278 # parent gets a rank.
910a0a6d 1279 my @next_nodes;
1280 foreach my $c ( @current_nodes ) {
c9bf3dbf 1281 warn "Current reading $c has no rank!"
1282 unless exists $node_ranks->{$c};
1283 # print STDERR "Looking at child of node $c, rank "
1284 # . $node_ranks->{$c} . "\n";
1285 foreach my $child ( $graph->successors( $c ) ) {
1286 next if exists $node_ranks->{$child};
910a0a6d 1287 my $highest_rank = -1;
1288 my $skip = 0;
c9bf3dbf 1289 foreach my $parent ( $graph->predecessors( $child ) ) {
1290 if( exists $node_ranks->{$parent} ) {
1291 $highest_rank = $node_ranks->{$parent}
1292 if $highest_rank <= $node_ranks->{$parent};
910a0a6d 1293 } else {
1294 $skip = 1;
1295 last;
1296 }
1297 }
1298 next if $skip;
c9bf3dbf 1299 my $c_rank = $highest_rank + 1;
1300 # print STDERR "Assigning rank $c_rank to node $child \n";
1301 $node_ranks->{$child} = $c_rank;
910a0a6d 1302 push( @next_nodes, $child );
1303 }
1304 }
1305 return @next_nodes;
4cdd82f1 1306}
910a0a6d 1307
4e5a7b2c 1308=head2 flatten_ranks
1309
1310A convenience method for parsing collation data. Searches the graph for readings
1311with the same text at the same rank, and merges any that are found.
1312
1313=cut
1314
0e476982 1315sub flatten_ranks {
1316 my $self = shift;
1317 my %unique_rank_rdg;
1318 foreach my $rdg ( $self->readings ) {
1319 next unless $rdg->has_rank;
1320 my $key = $rdg->rank . "||" . $rdg->text;
1321 if( exists $unique_rank_rdg{$key} ) {
1322 # Combine!
56eefa04 1323 # print STDERR "Combining readings at same rank: $key\n";
0e476982 1324 $self->merge_readings( $unique_rank_rdg{$key}, $rdg );
0ecb975c 1325 # TODO see if this now makes a common point.
0e476982 1326 } else {
1327 $unique_rank_rdg{$key} = $rdg;
1328 }
1329 }
1330}
1331
d4b75f44 1332=head2 calculate_common_readings
1333
1334Goes through the graph identifying the readings that appear in every witness
1335(apart from those with lacunae at that spot.) Marks them as common and returns
1336the list.
1337
1338=begin testing
1339
1340use Text::Tradition;
1341
1342my $cxfile = 't/data/Collatex-16.xml';
1343my $t = Text::Tradition->new(
1344 'name' => 'inline',
1345 'input' => 'CollateX',
1346 'file' => $cxfile,
1347 );
1348my $c = $t->collation;
1349
1350my @common = $c->calculate_common_readings();
1351is( scalar @common, 8, "Found correct number of common readings" );
1352my @marked = sort $c->common_readings();
1353is( scalar @common, 8, "All common readings got marked as such" );
1354my @expected = qw/ n1 n12 n16 n19 n20 n5 n6 n7 /;
1355is_deeply( \@marked, \@expected, "Found correct list of common readings" );
1356
1357=end testing
1358
1359=cut
1360
1361sub calculate_common_readings {
1362 my $self = shift;
1363 my @common;
1364 my $table = $self->make_alignment_table( 1 );
1365 foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
1366 my @row = map { $_->{'tokens'}->[$idx]->{'t'} } @{$table->{'alignment'}};
1367 my %hash;
1368 foreach my $r ( @row ) {
1369 if( $r ) {
1370 $hash{$r->id} = $r unless $r->is_meta;
1371 } else {
1372 $hash{'UNDEF'} = $r;
1373 }
1374 }
1375 if( keys %hash == 1 && !exists $hash{'UNDEF'} ) {
1376 my( $r ) = values %hash;
1377 $r->is_common( 1 );
1378 push( @common, $r );
1379 }
1380 }
1381 return @common;
1382}
1383
861c3e27 1384=head2 text_from_paths
1385
1386Calculate the text array for all witnesses from the path, for later consistency
1387checking. Only to be used if there is no non-graph-based way to know the
1388original texts.
1389
1390=cut
1391
1392sub text_from_paths {
1393 my $self = shift;
1394 foreach my $wit ( $self->tradition->witnesses ) {
1395 my @text = split( /\s+/,
1396 $self->reading_sequence( $self->start, $self->end, $wit->sigil ) );
1397 $wit->text( \@text );
1398 if( $wit->is_layered ) {
1399 my @uctext = split( /\s+/,
1400 $self->reading_sequence( $self->start, $self->end,
1401 $wit->sigil.$self->ac_label ) );
1402 $wit->text( \@uctext );
1403 }
1404 }
1405}
0e476982 1406
4e5a7b2c 1407=head1 UTILITY FUNCTIONS
1408
1409=head2 common_predecessor( $reading_a, $reading_b )
8e1394aa 1410
4e5a7b2c 1411Find the last reading that occurs in sequence before both the given readings.
1412
1413=head2 common_successor( $reading_a, $reading_b )
1414
1415Find the first reading that occurs in sequence after both the given readings.
1416
22222af9 1417=begin testing
1418
1419use Text::Tradition;
1420
1421my $cxfile = 't/data/Collatex-16.xml';
1422my $t = Text::Tradition->new(
1423 'name' => 'inline',
1424 'input' => 'CollateX',
1425 'file' => $cxfile,
1426 );
1427my $c = $t->collation;
1428
4e5a7b2c 1429is( $c->common_predecessor( 'n9', 'n23' )->id,
22222af9 1430 'n20', "Found correct common predecessor" );
4e5a7b2c 1431is( $c->common_successor( 'n9', 'n23' )->id,
22222af9 1432 '#END#', "Found correct common successor" );
1433
4e5a7b2c 1434is( $c->common_predecessor( 'n19', 'n17' )->id,
22222af9 1435 'n16', "Found correct common predecessor for readings on same path" );
4e5a7b2c 1436is( $c->common_successor( 'n21', 'n26' )->id,
22222af9 1437 '#END#', "Found correct common successor for readings on same path" );
1438
1439=end testing
1440
1441=cut
1442
1443## Return the closest reading that is a predecessor of both the given readings.
1444sub common_predecessor {
1445 my $self = shift;
4e5a7b2c 1446 my( $r1, $r2 ) = $self->_objectify_args( @_ );
027d819c 1447 return $self->_common_in_path( $r1, $r2, 'predecessors' );
22222af9 1448}
1449
1450sub common_successor {
1451 my $self = shift;
4e5a7b2c 1452 my( $r1, $r2 ) = $self->_objectify_args( @_ );
027d819c 1453 return $self->_common_in_path( $r1, $r2, 'successors' );
22222af9 1454}
1455
027d819c 1456sub _common_in_path {
22222af9 1457 my( $self, $r1, $r2, $dir ) = @_;
1458 my $iter = $r1->rank > $r2->rank ? $r1->rank : $r2->rank;
1459 $iter = $self->end->rank - $iter if $dir eq 'successors';
1460 my @candidates;
1461 my @last_checked = ( $r1, $r2 );
1462 my %all_seen;
1463 while( !@candidates ) {
1464 my @new_lc;
1465 foreach my $lc ( @last_checked ) {
1466 foreach my $p ( $lc->$dir ) {
1467 if( $all_seen{$p->id} ) {
1468 push( @candidates, $p );
1469 } else {
1470 $all_seen{$p->id} = 1;
1471 push( @new_lc, $p );
1472 }
1473 }
1474 }
1475 @last_checked = @new_lc;
1476 }
1477 my @answer = sort { $a->rank <=> $b->rank } @candidates;
1478 return $dir eq 'predecessors' ? pop( @answer ) : shift ( @answer );
1479}
1480
63778331 1481sub throw {
1482 Text::Tradition::Error->throw(
1483 'ident' => 'Collation error',
1484 'message' => $_[0],
1485 );
1486}
1487
dd3b58b0 1488no Moose;
1489__PACKAGE__->meta->make_immutable;
e867486f 1490
027d819c 1491=head1 LICENSE
e867486f 1492
027d819c 1493This package is free software and is provided "as is" without express
1494or implied warranty. You can redistribute it and/or modify it under
1495the same terms as Perl itself.
e867486f 1496
027d819c 1497=head1 AUTHOR
e867486f 1498
027d819c 1499Tara L Andrews E<lt>aurum@cpan.orgE<gt>