set up proper garbage-collecting deletion of traditions from the directory
[scpubgit/stemmatology.git] / lib / Text / Tradition / Collation.pm
CommitLineData
dd3b58b0 1package Text::Tradition::Collation;
d047cd52 2
910a0a6d 3use Encode qw( decode_utf8 );
4use File::Temp;
c9bf3dbf 5use Graph;
8e1394aa 6use IPC::Run qw( run binary );
910a0a6d 7use Text::CSV_XS;
b15511bf 8use Text::Tradition::Collation::Reading;
df6d9812 9use XML::LibXML;
dd3b58b0 10use Moose;
11
3a2ebbf4 12has 'sequence' => (
d047cd52 13 is => 'ro',
3a2ebbf4 14 isa => 'Graph',
15 default => sub { Graph->new() },
d047cd52 16 handles => {
3a2ebbf4 17 paths => 'edges',
d047cd52 18 },
d047cd52 19 );
3a2ebbf4 20
21has 'relations' => (
22 is => 'ro',
23 isa => 'Graph',
24 default => sub { Graph->new( undirected => 1 ) },
25 handles => {
26 relationships => 'edges',
27 },
28 );
dd3b58b0 29
3a2ebbf4 30has 'tradition' => (
31 is => 'ro',
d047cd52 32 isa => 'Text::Tradition',
8d9a1cd8 33 weak_ref => 1,
d047cd52 34 );
dd3b58b0 35
3a2ebbf4 36has 'readings' => (
37 isa => 'HashRef[Text::Tradition::Collation::Reading]',
38 traits => ['Hash'],
39 handles => {
40 reading => 'get',
41 _add_reading => 'set',
42 del_reading => 'delete',
43 has_reading => 'exists',
44 readings => 'values',
45 },
46 default => sub { {} },
47 );
910a0a6d 48
4a8828f0 49has 'wit_list_separator' => (
7854e12e 50 is => 'rw',
51 isa => 'Str',
52 default => ', ',
53 );
54
55has 'baselabel' => (
56 is => 'rw',
57 isa => 'Str',
58 default => 'base text',
59 );
4a8828f0 60
15d2d3df 61has 'linear' => (
62 is => 'rw',
63 isa => 'Bool',
64 default => 1,
65 );
1f563ac3 66
ef9d481f 67has 'ac_label' => (
68 is => 'rw',
69 isa => 'Str',
70 default => ' (a.c.)',
71 );
3a2ebbf4 72
73has 'start' => (
74 is => 'ro',
75 isa => 'Text::Tradition::Collation::Reading',
76 writer => '_set_start',
77 weak_ref => 1,
78 );
79
80has 'end' => (
81 is => 'ro',
82 isa => 'Text::Tradition::Collation::Reading',
83 writer => '_set_end',
84 weak_ref => 1,
85 );
1f563ac3 86
dd3b58b0 87# The collation can be created two ways:
88# 1. Collate a set of witnesses (with CollateX I guess) and process
89# the results as in 2.
90# 2. Read a pre-prepared collation in one of a variety of formats,
91# and make the graph from that.
92
93# The graph itself will (for now) be immutable, and the positions
94# within the graph will also be immutable. We need to calculate those
95# positions upon graph construction. The equivalences between graph
96# nodes will be mutable, entirely determined by the user (or possibly
97# by some semantic pre-processing provided by the user.) So the
98# constructor should just make an empty equivalences object. The
99# constructor will also need to make the witness objects, if we didn't
100# come through option 1.
101
d047cd52 102sub BUILD {
3a2ebbf4 103 my $self = shift;
104 $self->_set_start( $self->add_reading( { 'collation' => $self, 'is_start' => 1 } ) );
105 $self->_set_end( $self->add_reading( { 'collation' => $self, 'is_end' => 1 } ) );
d047cd52 106}
784877d9 107
3a2ebbf4 108### Reading construct/destruct functions
109
110sub add_reading {
111 my( $self, $reading ) = @_;
112 unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
113 my %args = %$reading;
114 $reading = Text::Tradition::Collation::Reading->new(
115 'collation' => $self,
116 %args );
117 }
118 # First check to see if a reading with this ID exists.
119 if( $self->reading( $reading->id ) ) {
120 warn "Collation already has a reading with id " . $reading->id;
121 return undef;
122 }
123 $self->_add_reading( $reading->id => $reading );
124 # Once the reading has been added, put it in both graphs.
125 $self->sequence->add_vertex( $reading->id );
126 $self->relations->add_vertex( $reading->id );
127 return $reading;
eca16057 128};
129
3a2ebbf4 130around del_reading => sub {
131 my $orig = shift;
132 my $self = shift;
133 my $arg = shift;
134
135 if( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
136 $arg = $arg->id;
137 }
3a2ebbf4 138 # Remove the reading from the graphs.
139 $self->sequence->delete_vertex( $arg );
140 $self->relations->delete_vertex( $arg );
141
142 # Carry on.
143 $self->$orig( $arg );
144};
7854e12e 145
3a2ebbf4 146# merge_readings( $main, $to_be_deleted );
7854e12e 147
3a2ebbf4 148sub merge_readings {
149 my $self = shift;
150
151 # We only need the IDs for adding paths to the graph, not the reading
152 # objects themselves.
49d4f2ac 153 my( $kept, $deleted, $combine_char ) = $self->_stringify_args( @_ );
3a2ebbf4 154
155 # The kept reading should inherit the paths and the relationships
156 # of the deleted reading.
157 foreach my $path ( $self->sequence->edges_at( $deleted ) ) {
158 my @vector = ( $kept );
159 push( @vector, $path->[1] ) if $path->[0] eq $deleted;
160 unshift( @vector, $path->[0] ) if $path->[1] eq $deleted;
49d4f2ac 161 next if $vector[0] eq $vector[1]; # Don't add a self loop
3a2ebbf4 162 my %wits = %{$self->sequence->get_edge_attributes( @$path )};
163 $self->sequence->add_edge( @vector );
164 my $fwits = $self->sequence->get_edge_attributes( @vector );
165 @wits{keys %$fwits} = values %$fwits;
166 $self->sequence->set_edge_attributes( @vector, \%wits );
167 }
168 foreach my $rel ( $self->relations->edges_at( $deleted ) ) {
169 my @vector = ( $kept );
170 push( @vector, $rel->[0] eq $deleted ? $rel->[1] : $rel->[0] );
49d4f2ac 171 next if $vector[0] eq $vector[1]; # Don't add a self loop
3a2ebbf4 172 # Is there a relationship here already? If so, keep it.
173 # TODO Warn about conflicting relationships
174 next if $self->relations->has_edge( @vector );
175 # If not, adopt the relationship that would be deleted.
176 $self->relations->add_edge( @vector );
177 my $attr = $self->relations->get_edge_attributes( @$rel );
178 $self->relations->set_edge_attributes( @vector, $attr );
179 }
180
181 # Do the deletion deed.
49d4f2ac 182 if( $combine_char ) {
183 my $kept_obj = $self->reading( $kept );
184 my $new_text = join( $combine_char, $kept_obj->text,
185 $self->reading( $deleted )->text );
186 $kept_obj->alter_text( $new_text );
187 }
3a2ebbf4 188 $self->del_reading( $deleted );
189}
7854e12e 190
3265b0ce 191
3a2ebbf4 192# Helper function for manipulating the graph.
193sub _stringify_args {
194 my( $self, $first, $second, $arg ) = @_;
195 $first = $first->id
196 if ref( $first ) eq 'Text::Tradition::Collation::Reading';
197 $second = $second->id
198 if ref( $second ) eq 'Text::Tradition::Collation::Reading';
199 return( $first, $second, $arg );
200}
df6d9812 201
3a2ebbf4 202### Path logic
203
204sub add_path {
205 my $self = shift;
206
207 # We only need the IDs for adding paths to the graph, not the reading
208 # objects themselves.
209 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
210
211 # Connect the readings
212 $self->sequence->add_edge( $source, $target );
213 # Note the witness in question
214 $self->sequence->set_edge_attribute( $source, $target, $wit, 1 );
b15511bf 215};
216
3a2ebbf4 217sub del_path {
218 my $self = shift;
49d4f2ac 219 my @args;
220 if( ref( $_[0] ) eq 'ARRAY' ) {
221 my $e = shift @_;
222 @args = ( @$e, @_ );
223 } else {
224 @args = @_;
225 }
3a2ebbf4 226
227 # We only need the IDs for adding paths to the graph, not the reading
228 # objects themselves.
49d4f2ac 229 my( $source, $target, $wit ) = $self->_stringify_args( @args );
3a2ebbf4 230
231 if( $self->sequence->has_edge_attribute( $source, $target, $wit ) ) {
49d4f2ac 232 $self->sequence->delete_edge_attribute( $source, $target, $wit );
3a2ebbf4 233 }
234 unless( keys %{$self->sequence->get_edge_attributes( $source, $target )} ) {
235 $self->sequence->delete_edge( $source, $target );
236 }
784877d9 237}
238
3a2ebbf4 239
15d2d3df 240# Extra graph-alike utility
241sub has_path {
3a2ebbf4 242 my $self = shift;
243 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
244 return undef unless $self->sequence->has_edge( $source, $target );
245 return $self->sequence->has_edge_attribute( $source, $target, $wit );
b15511bf 246}
247
3a2ebbf4 248### Relationship logic
3265b0ce 249
3a2ebbf4 250=head2 add_relationship( $reading1, $reading2, $definition )
251
252Adds the specified relationship between the two readings. A relationship
253is transitive (i.e. undirected), and must have the following attributes
254specified in the hashref $definition:
255
256=over 4
257
ad1291ee 258=item * type - Can be one of spelling, orthographic, grammatical, meaning, lexical, collated, repetition, transposition. All but the last two are only valid relationships between readings that occur at the same point in the text.
3a2ebbf4 259
260=item * non_correctable - (Optional) True if the reading would not have been corrected independently.
ef9d481f 261
3a2ebbf4 262=item * non_independent - (Optional) True if the variant is unlikely to have occurred independently in unrelated witnesses.
263
264=item * global - (Optional) A meta-attribute, to set the same relationship between readings with the same text whenever they occur in the same place.
265
266=back
267
268=cut
269
270# Wouldn't it be lovely if edges could be objects, and all this type checking
271# and attribute management could be done via Moose?
272
273sub add_relationship {
274 my $self = shift;
275 my( $source, $target, $options ) = $self->_stringify_args( @_ );
276
277 # Check the options
278 if( !defined $options->{'type'} ||
131fa659 279 $options->{'type'} !~ /^(spelling|orthographic|grammatical|meaning|lexical|collated|repetition|transposition)$/i ) {
3a2ebbf4 280 my $t = $options->{'type'} ? $options->{'type'} : '';
49d4f2ac 281 return( undef, "Invalid or missing type " . $options->{'type'} );
3a2ebbf4 282 }
49d4f2ac 283 unless ( $options->{'type'} =~ /^(repetition|transposition)$/ ) {
3a2ebbf4 284 $options->{'colocated'} = 1;
285 }
286
ef9d481f 287 # Make sure there is not another relationship between these two
94c00c71 288 # readings already
3a2ebbf4 289 if( $self->relations->has_edge( $source, $target ) ) {
290 return ( undef, "Relationship already exists between these readings" );
4cdd82f1 291 }
1d310495 292 if( !$self->relationship_valid( $source, $target, $options->{'type'} ) ) {
910a0a6d 293 return ( undef, 'Relationship creates witness loop' );
ef9d481f 294 }
295
3a2ebbf4 296 my @vector = ( $source, $target );
297 $self->relations->add_edge( @vector );
298 $self->relations->set_edge_attributes( @vector, $options );
910a0a6d 299
300 # TODO Handle global relationship setting
301
3a2ebbf4 302 return( 1, @vector );
3265b0ce 303}
304
910a0a6d 305sub relationship_valid {
1d310495 306 my( $self, $source, $target, $rel ) = @_;
307 if( $rel eq 'repetition' ) {
308 return 1;
309 } elsif ( $rel eq 'transposition' ) {
310 # Check that the two readings do not appear in the same witness.
311 my %seen_wits;
312 map { $seen_wits{$_} = 1 } $self->reading_witnesses( $source );
313 foreach my $w ( $self->reading_witnesses( $target ) ) {
314 return 0 if $seen_wits{$w};
315 }
316 return 1;
317 } else {
318 # Check that linking the source and target in a relationship won't lead
319 # to a path loop for any witness. First make a lookup table of all the
320 # readings related to either the source or the target.
321 my @proposed_related = ( $source, $target );
49d4f2ac 322 push( @proposed_related, $self->related_readings( $source, 'colocated' ) );
323 push( @proposed_related, $self->related_readings( $target, 'colocated' ) );
1d310495 324 my %pr_ids;
49d4f2ac 325 map { $pr_ids{ $_ } = 1 } @proposed_related;
1d310495 326
327 # None of these proposed related readings should have a neighbor that
328 # is also in proposed_related.
329 foreach my $pr ( keys %pr_ids ) {
330 foreach my $neighbor( $self->sequence->neighbors( $pr ) ) {
331 return 0 if exists $pr_ids{$neighbor};
332 }
333 }
334 return 1;
335 }
336}
337
338# Return a list of the witnesses in which the reading appears.
339sub reading_witnesses {
340 my( $self, $reading ) = @_;
341 # We need only check either the incoming or the outgoing edges; I have
342 # arbitrarily chosen "incoming".
343 my %all_witnesses;
344 foreach my $e ( $self->sequence->edges_to( $reading ) ) {
345 my $wits = $self->sequence->get_edge_attributes( @$e );
346 @all_witnesses{ keys %$wits } = 1;
347 }
348 return keys %all_witnesses;
910a0a6d 349}
350
3a2ebbf4 351sub related_readings {
352 my( $self, $reading, $colocated ) = @_;
a753cc84 353 my $return_object;
354 if( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
355 $reading = $reading->id;
356 $return_object = 1;
7035e3a6 357# print STDERR "Returning related objects\n";
358# } else {
359# print STDERR "Returning related object names\n";
a753cc84 360 }
3a2ebbf4 361 my @related = $self->relations->all_reachable( $reading );
362 if( $colocated ) {
363 my @colo = grep { $self->relations->has_edge_attribute( $reading, $_, 'colocated' ) } @related;
a753cc84 364 @related = @colo;
365 }
366 return $return_object ? map { $self->reading( $_ ) } @related : @related;
3a2ebbf4 367}
368
8e1394aa 369=head2 Output method(s)
370
371=over
372
373=item B<as_svg>
374
375print $graph->as_svg( $recalculate );
376
0068967c 377Returns an SVG string that represents the graph, via as_dot and graphviz.
8e1394aa 378
379=cut
380
381sub as_svg {
3a2ebbf4 382 my( $self ) = @_;
383
8e1394aa 384 my @cmd = qw/dot -Tsvg/;
385 my( $svg, $err );
910a0a6d 386 my $dotfile = File::Temp->new();
d9e873d0 387 ## TODO REMOVE
eca16057 388 # $dotfile->unlink_on_destroy(0);
910a0a6d 389 binmode $dotfile, ':utf8';
390 print $dotfile $self->as_dot();
391 push( @cmd, $dotfile->filename );
392 run( \@cmd, ">", binary(), \$svg );
393 $svg = decode_utf8( $svg );
8e1394aa 394 return $svg;
395}
396
df6d9812 397=item B<as_dot>
398
399print $graph->as_dot( $view, $recalculate );
400
401Returns a string that is the collation graph expressed in dot
402(i.e. GraphViz) format. The 'view' argument determines what kind of
403graph is produced.
404 * 'path': a graph of witness paths through the collation (DEFAULT)
405 * 'relationship': a graph of how collation readings relate to
406 each other
407
408=cut
409
410sub as_dot {
411 my( $self, $view ) = @_;
3a2ebbf4 412 $view = 'sequence' unless $view;
df6d9812 413 # TODO consider making some of these things configurable
67da8d6c 414 my $graph_name = $self->tradition->name;
415 $graph_name =~ s/[^\w\s]//g;
416 $graph_name = join( '_', split( /\s+/, $graph_name ) );
417 my $dot = sprintf( "digraph %s {\n", $graph_name );
df6d9812 418 $dot .= "\tedge [ arrowhead=open ];\n";
419 $dot .= "\tgraph [ rankdir=LR ];\n";
420 $dot .= sprintf( "\tnode [ fontsize=%d, fillcolor=%s, style=%s, shape=%s ];\n",
3a2ebbf4 421 11, "white", "filled", "ellipse" );
df6d9812 422
423 foreach my $reading ( $self->readings ) {
910a0a6d 424 # Need not output nodes without separate labels
3a2ebbf4 425 next if $reading->id eq $reading->text;
8f9cab7b 426 my $label = $reading->text;
427 $label =~ s/\"/\\\"/g;
428 $dot .= sprintf( "\t\"%s\" [ label=\"%s\" ];\n", $reading->id, $label );
df6d9812 429 }
3a2ebbf4 430
431 # TODO do something sensible for relationships
df6d9812 432
3a2ebbf4 433 my @edges = $self->paths;
df6d9812 434 foreach my $edge ( @edges ) {
910a0a6d 435 my %variables = ( 'color' => '#000000',
436 'fontcolor' => '#000000',
8f9cab7b 437 'label' => join( ', ', $self->path_display_label( $edge ) ),
910a0a6d 438 );
439 my $varopts = join( ', ', map { $_.'="'.$variables{$_}.'"' } sort keys %variables );
440 $dot .= sprintf( "\t\"%s\" -> \"%s\" [ %s ];\n",
3a2ebbf4 441 $edge->[0], $edge->[1], $varopts );
df6d9812 442 }
df6d9812 443 $dot .= "}\n";
444 return $dot;
445}
446
3a2ebbf4 447sub path_witnesses {
448 my( $self, @edge ) = @_;
449 # If edge is an arrayref, cope.
450 if( @edge == 1 && ref( $edge[0] ) eq 'ARRAY' ) {
451 my $e = shift @edge;
452 @edge = @$e;
453 }
454 my @wits = keys %{$self->sequence->get_edge_attributes( @edge )};
455 return sort @wits;
456}
457
8f9cab7b 458sub path_display_label {
459 my( $self, $edge ) = @_;
460 my @wits = $self->path_witnesses( $edge );
461 my $maj = scalar( $self->tradition->witnesses ) * 0.6;
462 if( scalar @wits > $maj ) {
463 return 'majority';
464 } else {
465 return join( ', ', @wits );
466 }
467}
468
469
8e1394aa 470=item B<as_graphml>
471
472print $graph->as_graphml( $recalculate )
473
474Returns a GraphML representation of the collation graph, with
475transposition information and position information. Unless
476$recalculate is passed (and is a true value), the method will return a
477cached copy of the SVG after the first call to the method.
478
479=cut
480
481sub as_graphml {
3a2ebbf4 482 my( $self ) = @_;
8e1394aa 483
484 # Some namespaces
485 my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns';
486 my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance';
487 my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' .
910a0a6d 488 'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
8e1394aa 489
490 # Create the document and root node
491 my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
492 my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
493 $graphml->setDocumentElement( $root );
494 $root->setNamespace( $xsi_ns, 'xsi', 0 );
495 $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
496
e309421a 497 # Add the data keys for the graph
498 my %graph_data_keys;
499 my $gdi = 0;
1d310495 500 my @graph_attributes = qw/ version wit_list_separator baselabel linear ac_label /;
e309421a 501 foreach my $datum ( @graph_attributes ) {
502 $graph_data_keys{$datum} = 'dg'.$gdi++;
503 my $key = $root->addNewChild( $graphml_ns, 'key' );
504 $key->setAttribute( 'attr.name', $datum );
505 $key->setAttribute( 'attr.type', $key eq 'linear' ? 'boolean' : 'string' );
506 $key->setAttribute( 'for', 'graph' );
507 $key->setAttribute( 'id', $graph_data_keys{$datum} );
508 }
f6066bac 509
8e1394aa 510 # Add the data keys for nodes
ef9d481f 511 my %node_data_keys;
512 my $ndi = 0;
3a2ebbf4 513 my %node_data = (
514 id => 'string',
255875b8 515 text => 'string',
3a2ebbf4 516 rank => 'string',
517 is_start => 'boolean',
518 is_end => 'boolean',
519 is_lacuna => 'boolean',
520 );
521 foreach my $datum ( keys %node_data ) {
910a0a6d 522 $node_data_keys{$datum} = 'dn'.$ndi++;
523 my $key = $root->addNewChild( $graphml_ns, 'key' );
524 $key->setAttribute( 'attr.name', $datum );
3a2ebbf4 525 $key->setAttribute( 'attr.type', $node_data{$datum} );
910a0a6d 526 $key->setAttribute( 'for', 'node' );
527 $key->setAttribute( 'id', $node_data_keys{$datum} );
8e1394aa 528 }
529
df6d9812 530 # Add the data keys for edges, i.e. witnesses
ef9d481f 531 my $edi = 0;
532 my %edge_data_keys;
3a2ebbf4 533 my %edge_data = (
534 class => 'string', # Path or relationship?
535 witness => 'string', # ID/label for a path
536 relationship => 'string', # ID/label for a relationship
537 extra => 'boolean', # Path key
538 colocated => 'boolean', # Relationship key
539 non_correctable => 'boolean', # Relationship key
540 non_independent => 'boolean', # Relationship key
541 );
542 foreach my $datum ( keys %edge_data ) {
543 $edge_data_keys{$datum} = 'de'.$edi++;
910a0a6d 544 my $key = $root->addNewChild( $graphml_ns, 'key' );
3a2ebbf4 545 $key->setAttribute( 'attr.name', $datum );
546 $key->setAttribute( 'attr.type', $edge_data{$datum} );
910a0a6d 547 $key->setAttribute( 'for', 'edge' );
3a2ebbf4 548 $key->setAttribute( 'id', $edge_data_keys{$datum} );
8e1394aa 549 }
3a2ebbf4 550
551 # Add the collation graph itself
8e1394aa 552 my $graph = $root->addNewChild( $graphml_ns, 'graph' );
553 $graph->setAttribute( 'edgedefault', 'directed' );
94c00c71 554 $graph->setAttribute( 'id', $self->tradition->name );
8e1394aa 555 $graph->setAttribute( 'parse.edgeids', 'canonical' );
df6d9812 556 $graph->setAttribute( 'parse.edges', scalar($self->paths) );
8e1394aa 557 $graph->setAttribute( 'parse.nodeids', 'canonical' );
df6d9812 558 $graph->setAttribute( 'parse.nodes', scalar($self->readings) );
8e1394aa 559 $graph->setAttribute( 'parse.order', 'nodesfirst' );
e309421a 560
561 # Collation attribute data
562 foreach my $datum ( @graph_attributes ) {
1d310495 563 my $value = $datum eq 'version' ? '2.0' : $self->$datum;
564 _add_graphml_data( $graph, $graph_data_keys{$datum}, $value );
e309421a 565 }
8e1394aa 566
567 my $node_ctr = 0;
568 my %node_hash;
b15511bf 569 # Add our readings to the graph
3a2ebbf4 570 foreach my $n ( sort { $a->id cmp $b->id } $self->readings ) {
910a0a6d 571 my $node_el = $graph->addNewChild( $graphml_ns, 'node' );
572 my $node_xmlid = 'n' . $node_ctr++;
3a2ebbf4 573 $node_hash{ $n->id } = $node_xmlid;
910a0a6d 574 $node_el->setAttribute( 'id', $node_xmlid );
255875b8 575 foreach my $d ( keys %node_data ) {
576 my $nval = $n->$d;
577 _add_graphml_data( $node_el, $node_data_keys{$d}, $nval )
578 if defined $nval;
579 }
b15511bf 580 }
581
3a2ebbf4 582 # Add the path edges
df6d9812 583 my $edge_ctr = 0;
3a2ebbf4 584 foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->sequence->edges() ) {
585 # We add an edge in the graphml for every witness in $e.
586 foreach my $wit ( $self->path_witnesses( $e ) ) {
587 my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
588 $node_hash{ $e->[0] },
589 $node_hash{ $e->[1] } );
590 my $edge_el = $graph->addNewChild( $graphml_ns, 'edge' );
591 $edge_el->setAttribute( 'source', $from );
592 $edge_el->setAttribute( 'target', $to );
593 $edge_el->setAttribute( 'id', $id );
594 # Add the edge class
595 _add_graphml_data( $edge_el, $edge_data_keys{'class'}, 'path' );
596
597 # It's a witness path, so add the witness
598 my $base = $wit;
599 my $key = $edge_data_keys{'witness'};
600 # Is this an ante-corr witness?
601 my $aclabel = $self->ac_label;
602 if( $wit =~ /^(.*)\Q$aclabel\E$/ ) {
603 # Keep the base witness
604 $base = $1;
605 # ...and record that this is an 'extra' reading path
606 _add_graphml_data( $edge_el, $edge_data_keys{'extra'}, $aclabel );
607 }
608 _add_graphml_data( $edge_el, $edge_data_keys{'witness'}, $base );
609 }
610 }
611
612 # Add the relationship edges
613 foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->relationships ) {
614 my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
615 $node_hash{ $e->[0] },
616 $node_hash{ $e->[1] } );
617 my $edge_el = $graph->addNewChild( $graphml_ns, 'edge' );
618 $edge_el->setAttribute( 'source', $from );
619 $edge_el->setAttribute( 'target', $to );
620 $edge_el->setAttribute( 'id', $id );
621 # Add the edge class
622 _add_graphml_data( $edge_el, $edge_data_keys{'class'}, 'relationship' );
623
624 my $data = $self->relations->get_edge_attributes( @$e );
625 # It's a relationship, so save the relationship data
626 _add_graphml_data( $edge_el, $edge_data_keys{'relationship'}, $data->{type} );
627 _add_graphml_data( $edge_el, $edge_data_keys{'colocated'}, $data->{colocated} );
628 if( exists $data->{non_correctable} ) {
629 _add_graphml_data( $edge_el, $edge_data_keys{'non_correctable'},
630 $data->{non_correctable} );
631 }
632 if( exists $data->{non_independent} ) {
633 _add_graphml_data( $edge_el, $edge_data_keys{'non_independent'},
634 $data->{non_independent} );
635 }
8e1394aa 636 }
637
94c00c71 638 # Save and return the thing
639 my $result = decode_utf8( $graphml->toString(1) );
94c00c71 640 return $result;
df6d9812 641}
642
b15511bf 643sub _add_graphml_data {
644 my( $el, $key, $value ) = @_;
b15511bf 645 return unless defined $value;
c9bf3dbf 646 my $data_el = $el->addNewChild( $el->namespaceURI, 'data' );
b15511bf 647 $data_el->setAttribute( 'key', $key );
648 $data_el->appendText( $value );
8e1394aa 649}
650
910a0a6d 651=item B<as_csv>
652
653print $graph->as_csv( $recalculate )
654
655Returns a CSV alignment table representation of the collation graph, one
656row per witness (or witness uncorrected.) Unless $recalculate is passed
657(and is a true value), the method will return a cached copy of the CSV
658after the first call to the method.
659
660=cut
661
662sub as_csv {
3a2ebbf4 663 my( $self ) = @_;
910a0a6d 664 my $table = $self->make_alignment_table;
665 my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } );
666 my @result;
667 foreach my $row ( @$table ) {
668 $csv->combine( @$row );
669 push( @result, decode_utf8( $csv->string ) );
670 }
3a2ebbf4 671 return join( "\n", @result );
910a0a6d 672}
673
0e476982 674# Make an alignment table - $noderefs controls whether the objects
675# in the table are the nodes or simply their readings.
9f3ba6f7 676
910a0a6d 677sub make_alignment_table {
08e0fb85 678 my( $self, $noderefs, $include ) = @_;
910a0a6d 679 unless( $self->linear ) {
680 warn "Need a linear graph in order to make an alignment table";
681 return;
682 }
683 my $table;
3a2ebbf4 684 my @all_pos = ( 1 .. $self->end->rank - 1 );
910a0a6d 685 foreach my $wit ( $self->tradition->witnesses ) {
eca16057 686 # print STDERR "Making witness row(s) for " . $wit->sigil . "\n";
1f7aa795 687 my @wit_path = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
688 my @row = _make_witness_row( \@wit_path, \@all_pos, $noderefs );
910a0a6d 689 unshift( @row, $wit->sigil );
690 push( @$table, \@row );
1f7aa795 691 if( $wit->is_layered ) {
692 my @wit_ac_path = $self->reading_sequence( $self->start, $self->end,
693 $wit->sigil.$self->ac_label, $wit->sigil );
694 my @ac_row = _make_witness_row( \@wit_ac_path, \@all_pos, $noderefs );
910a0a6d 695 unshift( @ac_row, $wit->sigil . $self->ac_label );
696 push( @$table, \@ac_row );
697 }
698 }
0e476982 699
08e0fb85 700 if( $include ) {
701 my $winnowed = [];
702 # Winnow out the rows for any witness not included.
703 foreach my $row ( @$table ) {
704 next unless $include->{$row->[0]};
705 push( @$winnowed, $row );
706 }
707 $table = $winnowed;
708 }
709
910a0a6d 710 # Return a table where the witnesses read in columns rather than rows.
711 my $turned = _turn_table( $table );
08e0fb85 712 # TODO We should really go through and delete empty rows.
910a0a6d 713 return $turned;
714}
715
716sub _make_witness_row {
0e476982 717 my( $path, $positions, $noderefs ) = @_;
910a0a6d 718 my %char_hash;
719 map { $char_hash{$_} = undef } @$positions;
720 foreach my $rdg ( @$path ) {
eca16057 721 my $rtext = $rdg->text;
722 $rtext = '#LACUNA#' if $rdg->is_lacuna;
3a2ebbf4 723 # print STDERR "No rank for " . $rdg->id . "\n" unless defined $rdg->rank;
0e476982 724 $char_hash{$rdg->rank} = $noderefs ? $rdg : $rtext;
910a0a6d 725 }
726 my @row = map { $char_hash{$_} } @$positions;
eca16057 727 # Fill in lacuna markers for undef spots in the row
728 my $last_el = shift @row;
729 my @filled_row = ( $last_el );
730 foreach my $el ( @row ) {
0e476982 731 # If we are using node reference, make the lacuna node appear many times
732 # in the table. If not, use the lacuna tag.
733 if( $last_el && _el_is_lacuna( $last_el ) && !defined $el ) {
734 $el = $noderefs ? $last_el : '#LACUNA#';
eca16057 735 }
736 push( @filled_row, $el );
737 $last_el = $el;
738 }
739 return @filled_row;
910a0a6d 740}
741
0e476982 742# Tiny utility function to say if a table element is a lacuna
743sub _el_is_lacuna {
744 my $el = shift;
745 return 1 if $el eq '#LACUNA#';
746 return 1 if ref( $el ) eq 'Text::Tradition::Collation::Reading'
747 && $el->is_lacuna;
748 return 0;
749}
750
910a0a6d 751# Helper to turn the witnesses along columns rather than rows. Assumes
752# equal-sized rows.
753sub _turn_table {
754 my( $table ) = @_;
755 my $result = [];
756 return $result unless scalar @$table;
757 my $nrows = scalar @{$table->[0]};
758 foreach my $idx ( 0 .. $nrows - 1 ) {
759 foreach my $wit ( 0 .. $#{$table} ) {
760 $result->[$idx]->[$wit] = $table->[$wit]->[$idx];
761 }
762 }
763 return $result;
764}
765
8e1394aa 766=back
767
de51424a 768=head2 Navigation methods
769
770=over
771
8e1394aa 772=item B<start>
773
774my $beginning = $collation->start();
775
776Returns the beginning of the collation, a meta-reading with label '#START#'.
777
910a0a6d 778=item B<end>
779
780my $end = $collation->end();
781
782Returns the end of the collation, a meta-reading with label '#END#'.
783
910a0a6d 784
e2902068 785=item B<reading_sequence>
786
787my @readings = $graph->reading_sequence( $first, $last, $path[, $alt_path] );
788
789Returns the ordered list of readings, starting with $first and ending
790with $last, along the given witness path. If no path is specified,
791assume that the path is that of the base text (if any.)
792
793=cut
794
910a0a6d 795# TODO Think about returning some lazy-eval iterator.
796
e2902068 797sub reading_sequence {
798 my( $self, $start, $end, $witness, $backup ) = @_;
799
930ff666 800 $witness = $self->baselabel unless $witness;
e2902068 801 my @readings = ( $start );
802 my %seen;
803 my $n = $start;
3a2ebbf4 804 while( $n && $n->id ne $end->id ) {
805 if( exists( $seen{$n->id} ) ) {
806 warn "Detected loop at " . $n->id;
910a0a6d 807 last;
808 }
3a2ebbf4 809 $seen{$n->id} = 1;
910a0a6d 810
811 my $next = $self->next_reading( $n, $witness, $backup );
44771cf2 812 unless( $next ) {
3a2ebbf4 813 warn "Did not find any path for $witness from reading " . $n->id;
44771cf2 814 last;
815 }
910a0a6d 816 push( @readings, $next );
817 $n = $next;
e2902068 818 }
819 # Check that the last reading is our end reading.
820 my $last = $readings[$#readings];
3a2ebbf4 821 warn "Last reading found from " . $start->text .
910a0a6d 822 " for witness $witness is not the end!"
3a2ebbf4 823 unless $last->id eq $end->id;
e2902068 824
825 return @readings;
826}
827
4a8828f0 828=item B<next_reading>
8e1394aa 829
4a8828f0 830my $next_reading = $graph->next_reading( $reading, $witpath );
8e1394aa 831
4a8828f0 832Returns the reading that follows the given reading along the given witness
930ff666 833path.
8e1394aa 834
835=cut
836
4a8828f0 837sub next_reading {
e2902068 838 # Return the successor via the corresponding path.
8e1394aa 839 my $self = shift;
3a2ebbf4 840 my $answer = $self->_find_linked_reading( 'next', @_ );
841 return $self->reading( $answer );
8e1394aa 842}
843
4a8828f0 844=item B<prior_reading>
8e1394aa 845
4a8828f0 846my $prior_reading = $graph->prior_reading( $reading, $witpath );
8e1394aa 847
4a8828f0 848Returns the reading that precedes the given reading along the given witness
930ff666 849path.
8e1394aa 850
851=cut
852
4a8828f0 853sub prior_reading {
e2902068 854 # Return the predecessor via the corresponding path.
8e1394aa 855 my $self = shift;
3a2ebbf4 856 my $answer = $self->_find_linked_reading( 'prior', @_ );
857 return $self->reading( $answer );
8e1394aa 858}
859
4a8828f0 860sub _find_linked_reading {
e2902068 861 my( $self, $direction, $node, $path, $alt_path ) = @_;
862 my @linked_paths = $direction eq 'next'
3a2ebbf4 863 ? $self->sequence->edges_from( $node )
864 : $self->sequence->edges_to( $node );
e2902068 865 return undef unless scalar( @linked_paths );
8e1394aa 866
e2902068 867 # We have to find the linked path that contains all of the
868 # witnesses supplied in $path.
869 my( @path_wits, @alt_path_wits );
3a2ebbf4 870 @path_wits = sort( $self->witnesses_of_label( $path ) ) if $path;
871 @alt_path_wits = sort( $self->witnesses_of_label( $alt_path ) ) if $alt_path;
e2902068 872 my $base_le;
873 my $alt_le;
874 foreach my $le ( @linked_paths ) {
3a2ebbf4 875 if( $self->sequence->has_edge_attribute( @$le, $self->baselabel ) ) {
910a0a6d 876 $base_le = $le;
910a0a6d 877 }
3a2ebbf4 878 my @le_wits = $self->path_witnesses( $le );
879 if( _is_within( \@path_wits, \@le_wits ) ) {
880 # This is the right path.
881 return $direction eq 'next' ? $le->[1] : $le->[0];
882 } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) {
883 $alt_le = $le;
884 }
8e1394aa 885 }
e2902068 886 # Got this far? Return the alternate path if it exists.
3a2ebbf4 887 return $direction eq 'next' ? $alt_le->[1] : $alt_le->[0]
910a0a6d 888 if $alt_le;
e2902068 889
890 # Got this far? Return the base path if it exists.
3a2ebbf4 891 return $direction eq 'next' ? $base_le->[1] : $base_le->[0]
910a0a6d 892 if $base_le;
e2902068 893
894 # Got this far? We have no appropriate path.
8e1394aa 895 warn "Could not find $direction node from " . $node->label
910a0a6d 896 . " along path $path";
8e1394aa 897 return undef;
898}
899
4a8828f0 900# Some set logic.
901sub _is_within {
902 my( $set1, $set2 ) = @_;
7854e12e 903 my $ret = @$set1; # will be 0, i.e. false, if set1 is empty
4a8828f0 904 foreach my $el ( @$set1 ) {
910a0a6d 905 $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
4a8828f0 906 }
907 return $ret;
908}
909
de51424a 910
911## INITIALIZATION METHODS - for use by parsers
930ff666 912
7e450e44 913# For use when a collation is constructed from a base text and an apparatus.
914# We have the sequences of readings and just need to add path edges.
1f7aa795 915# When we are done, clear out the witness path attributes, as they are no
916# longer needed.
917# TODO Find a way to replace the witness path attributes with encapsulated functions?
e2902068 918
6a222840 919sub make_witness_paths {
920 my( $self ) = @_;
910a0a6d 921 foreach my $wit ( $self->tradition->witnesses ) {
0068967c 922 # print STDERR "Making path for " . $wit->sigil . "\n";
910a0a6d 923 $self->make_witness_path( $wit );
7854e12e 924 }
7854e12e 925}
926
6a222840 927sub make_witness_path {
7854e12e 928 my( $self, $wit ) = @_;
929 my @chain = @{$wit->path};
15d2d3df 930 my $sig = $wit->sigil;
7854e12e 931 foreach my $idx ( 0 .. $#chain-1 ) {
910a0a6d 932 $self->add_path( $chain[$idx], $chain[$idx+1], $sig );
7854e12e 933 }
1f7aa795 934 if( $wit->is_layered ) {
d9e873d0 935 @chain = @{$wit->uncorrected_path};
936 foreach my $idx( 0 .. $#chain-1 ) {
937 my $source = $chain[$idx];
938 my $target = $chain[$idx+1];
939 $self->add_path( $source, $target, $sig.$self->ac_label )
940 unless $self->has_path( $source, $target, $sig );
941 }
15d2d3df 942 }
1f7aa795 943 $wit->clear_path;
944 $wit->clear_uncorrected_path;
e2902068 945}
946
910a0a6d 947sub calculate_ranks {
948 my $self = shift;
949 # Walk a version of the graph where every node linked by a relationship
950 # edge is fundamentally the same node, and do a topological ranking on
951 # the nodes in this graph.
c9bf3dbf 952 my $topo_graph = Graph->new();
910a0a6d 953 my %rel_containers;
954 my $rel_ctr = 0;
955 # Add the nodes
956 foreach my $r ( $self->readings ) {
3a2ebbf4 957 next if exists $rel_containers{$r->id};
910a0a6d 958 my @rels = $r->related_readings( 'colocated' );
959 if( @rels ) {
960 # Make a relationship container.
961 push( @rels, $r );
c9bf3dbf 962 my $rn = 'rel_container_' . $rel_ctr++;
963 $topo_graph->add_vertex( $rn );
910a0a6d 964 foreach( @rels ) {
3a2ebbf4 965 $rel_containers{$_->id} = $rn;
910a0a6d 966 }
967 } else {
968 # Add a new node to mirror the old node.
3a2ebbf4 969 $rel_containers{$r->id} = $r->id;
970 $topo_graph->add_vertex( $r->id );
910a0a6d 971 }
4a8828f0 972 }
3a1f2523 973
3a2ebbf4 974 # Add the edges.
910a0a6d 975 foreach my $r ( $self->readings ) {
3a2ebbf4 976 foreach my $n ( $self->sequence->successors( $r->id ) ) {
977 my( $tfrom, $tto ) = ( $rel_containers{$r->id},
978 $rel_containers{$n} );
49d4f2ac 979 $DB::single = 1 unless $tfrom && $tto;
3a2ebbf4 980 $topo_graph->add_edge( $tfrom, $tto );
910a0a6d 981 }
982 }
983
984 # Now do the rankings, starting with the start node.
3a2ebbf4 985 my $topo_start = $rel_containers{$self->start->id};
c9bf3dbf 986 my $node_ranks = { $topo_start => 0 };
910a0a6d 987 my @curr_origin = ( $topo_start );
988 # A little iterative function.
989 while( @curr_origin ) {
c9bf3dbf 990 @curr_origin = _assign_rank( $topo_graph, $node_ranks, @curr_origin );
910a0a6d 991 }
992 # Transfer our rankings from the topological graph to the real one.
993 foreach my $r ( $self->readings ) {
3a2ebbf4 994 if( defined $node_ranks->{$rel_containers{$r->id}} ) {
995 $r->rank( $node_ranks->{$rel_containers{$r->id}} );
67da8d6c 996 } else {
997 $DB::single = 1;
3a2ebbf4 998 die "No rank calculated for node " . $r->id
67da8d6c 999 . " - do you have a cycle in the graph?";
1000 }
de51424a 1001 }
8e1394aa 1002}
3a1f2523 1003
910a0a6d 1004sub _assign_rank {
c9bf3dbf 1005 my( $graph, $node_ranks, @current_nodes ) = @_;
910a0a6d 1006 # Look at each of the children of @current_nodes. If all the child's
1007 # parents have a rank, assign it the highest rank + 1 and add it to
c9bf3dbf 1008 # @next_nodes. Otherwise skip it; we will return when the highest-ranked
1009 # parent gets a rank.
910a0a6d 1010 my @next_nodes;
1011 foreach my $c ( @current_nodes ) {
c9bf3dbf 1012 warn "Current reading $c has no rank!"
1013 unless exists $node_ranks->{$c};
1014 # print STDERR "Looking at child of node $c, rank "
1015 # . $node_ranks->{$c} . "\n";
1016 foreach my $child ( $graph->successors( $c ) ) {
1017 next if exists $node_ranks->{$child};
910a0a6d 1018 my $highest_rank = -1;
1019 my $skip = 0;
c9bf3dbf 1020 foreach my $parent ( $graph->predecessors( $child ) ) {
1021 if( exists $node_ranks->{$parent} ) {
1022 $highest_rank = $node_ranks->{$parent}
1023 if $highest_rank <= $node_ranks->{$parent};
910a0a6d 1024 } else {
1025 $skip = 1;
1026 last;
1027 }
1028 }
1029 next if $skip;
c9bf3dbf 1030 my $c_rank = $highest_rank + 1;
1031 # print STDERR "Assigning rank $c_rank to node $child \n";
1032 $node_ranks->{$child} = $c_rank;
910a0a6d 1033 push( @next_nodes, $child );
1034 }
1035 }
1036 return @next_nodes;
4cdd82f1 1037}
910a0a6d 1038
0e476982 1039# Another method to make up for rough collation methods. If the same reading
1040# appears multiple times at the same rank, collapse the nodes.
1041sub flatten_ranks {
1042 my $self = shift;
1043 my %unique_rank_rdg;
1044 foreach my $rdg ( $self->readings ) {
1045 next unless $rdg->has_rank;
1046 my $key = $rdg->rank . "||" . $rdg->text;
1047 if( exists $unique_rank_rdg{$key} ) {
1048 # Combine!
0068967c 1049 # print STDERR "Combining readings at same rank: $key\n";
0e476982 1050 $self->merge_readings( $unique_rank_rdg{$key}, $rdg );
1051 } else {
1052 $unique_rank_rdg{$key} = $rdg;
1053 }
1054 }
1055}
1056
1057
fa954f4c 1058## Utility functions
de51424a 1059
4a8828f0 1060# Return the string that joins together a list of witnesses for
1061# display on a single path.
4a8828f0 1062sub witnesses_of_label {
de51424a 1063 my( $self, $label ) = @_;
4a8828f0 1064 my $regex = $self->wit_list_separator;
de51424a 1065 my @answer = split( /\Q$regex\E/, $label );
1066 return @answer;
4a8828f0 1067}
8e1394aa 1068
dd3b58b0 1069no Moose;
1070__PACKAGE__->meta->make_immutable;
e867486f 1071
1072=head1 BUGS / TODO
1073
1074=over
1075
0068967c 1076=item * Think about making Relationship objects again
e867486f 1077
1078=back