Change alignment table to CollateX format; make version 3 of GraphML output
[scpubgit/stemmatology.git] / lib / Text / Tradition / Collation.pm
CommitLineData
dd3b58b0 1package Text::Tradition::Collation;
d047cd52 2
910a0a6d 3use Encode qw( decode_utf8 );
4use File::Temp;
c9bf3dbf 5use Graph;
8e1394aa 6use IPC::Run qw( run binary );
910a0a6d 7use Text::CSV_XS;
b15511bf 8use Text::Tradition::Collation::Reading;
df6d9812 9use XML::LibXML;
dd3b58b0 10use Moose;
11
3a2ebbf4 12has 'sequence' => (
d047cd52 13 is => 'ro',
3a2ebbf4 14 isa => 'Graph',
15 default => sub { Graph->new() },
d047cd52 16 handles => {
3a2ebbf4 17 paths => 'edges',
d047cd52 18 },
d047cd52 19 );
3a2ebbf4 20
21has 'relations' => (
22 is => 'ro',
23 isa => 'Graph',
24 default => sub { Graph->new( undirected => 1 ) },
25 handles => {
26 relationships => 'edges',
27 },
28 );
dd3b58b0 29
3a2ebbf4 30has 'tradition' => (
31 is => 'ro',
d047cd52 32 isa => 'Text::Tradition',
8d9a1cd8 33 weak_ref => 1,
d047cd52 34 );
dd3b58b0 35
3a2ebbf4 36has 'readings' => (
37 isa => 'HashRef[Text::Tradition::Collation::Reading]',
38 traits => ['Hash'],
39 handles => {
40 reading => 'get',
41 _add_reading => 'set',
42 del_reading => 'delete',
43 has_reading => 'exists',
44 readings => 'values',
45 },
46 default => sub { {} },
47 );
910a0a6d 48
4a8828f0 49has 'wit_list_separator' => (
7854e12e 50 is => 'rw',
51 isa => 'Str',
52 default => ', ',
53 );
54
55has 'baselabel' => (
56 is => 'rw',
57 isa => 'Str',
58 default => 'base text',
59 );
4a8828f0 60
15d2d3df 61has 'linear' => (
62 is => 'rw',
63 isa => 'Bool',
64 default => 1,
65 );
1f563ac3 66
ef9d481f 67has 'ac_label' => (
68 is => 'rw',
69 isa => 'Str',
70 default => ' (a.c.)',
71 );
3a2ebbf4 72
73has 'start' => (
74 is => 'ro',
75 isa => 'Text::Tradition::Collation::Reading',
76 writer => '_set_start',
77 weak_ref => 1,
78 );
79
80has 'end' => (
81 is => 'ro',
82 isa => 'Text::Tradition::Collation::Reading',
83 writer => '_set_end',
84 weak_ref => 1,
85 );
1f563ac3 86
dd3b58b0 87# The collation can be created two ways:
88# 1. Collate a set of witnesses (with CollateX I guess) and process
89# the results as in 2.
90# 2. Read a pre-prepared collation in one of a variety of formats,
91# and make the graph from that.
92
93# The graph itself will (for now) be immutable, and the positions
94# within the graph will also be immutable. We need to calculate those
95# positions upon graph construction. The equivalences between graph
96# nodes will be mutable, entirely determined by the user (or possibly
97# by some semantic pre-processing provided by the user.) So the
98# constructor should just make an empty equivalences object. The
99# constructor will also need to make the witness objects, if we didn't
100# come through option 1.
101
d047cd52 102sub BUILD {
3a2ebbf4 103 my $self = shift;
104 $self->_set_start( $self->add_reading( { 'collation' => $self, 'is_start' => 1 } ) );
105 $self->_set_end( $self->add_reading( { 'collation' => $self, 'is_end' => 1 } ) );
d047cd52 106}
784877d9 107
3a2ebbf4 108### Reading construct/destruct functions
109
110sub add_reading {
111 my( $self, $reading ) = @_;
112 unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
113 my %args = %$reading;
114 $reading = Text::Tradition::Collation::Reading->new(
115 'collation' => $self,
116 %args );
117 }
118 # First check to see if a reading with this ID exists.
119 if( $self->reading( $reading->id ) ) {
120 warn "Collation already has a reading with id " . $reading->id;
121 return undef;
122 }
123 $self->_add_reading( $reading->id => $reading );
124 # Once the reading has been added, put it in both graphs.
125 $self->sequence->add_vertex( $reading->id );
126 $self->relations->add_vertex( $reading->id );
127 return $reading;
eca16057 128};
129
3a2ebbf4 130around del_reading => sub {
131 my $orig = shift;
132 my $self = shift;
133 my $arg = shift;
134
135 if( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
136 $arg = $arg->id;
137 }
3a2ebbf4 138 # Remove the reading from the graphs.
139 $self->sequence->delete_vertex( $arg );
140 $self->relations->delete_vertex( $arg );
141
142 # Carry on.
143 $self->$orig( $arg );
144};
7854e12e 145
3a2ebbf4 146# merge_readings( $main, $to_be_deleted );
7854e12e 147
3a2ebbf4 148sub merge_readings {
149 my $self = shift;
150
151 # We only need the IDs for adding paths to the graph, not the reading
152 # objects themselves.
49d4f2ac 153 my( $kept, $deleted, $combine_char ) = $self->_stringify_args( @_ );
3a2ebbf4 154
155 # The kept reading should inherit the paths and the relationships
156 # of the deleted reading.
157 foreach my $path ( $self->sequence->edges_at( $deleted ) ) {
158 my @vector = ( $kept );
159 push( @vector, $path->[1] ) if $path->[0] eq $deleted;
160 unshift( @vector, $path->[0] ) if $path->[1] eq $deleted;
49d4f2ac 161 next if $vector[0] eq $vector[1]; # Don't add a self loop
3a2ebbf4 162 my %wits = %{$self->sequence->get_edge_attributes( @$path )};
163 $self->sequence->add_edge( @vector );
164 my $fwits = $self->sequence->get_edge_attributes( @vector );
165 @wits{keys %$fwits} = values %$fwits;
166 $self->sequence->set_edge_attributes( @vector, \%wits );
167 }
168 foreach my $rel ( $self->relations->edges_at( $deleted ) ) {
169 my @vector = ( $kept );
170 push( @vector, $rel->[0] eq $deleted ? $rel->[1] : $rel->[0] );
49d4f2ac 171 next if $vector[0] eq $vector[1]; # Don't add a self loop
3a2ebbf4 172 # Is there a relationship here already? If so, keep it.
173 # TODO Warn about conflicting relationships
174 next if $self->relations->has_edge( @vector );
175 # If not, adopt the relationship that would be deleted.
176 $self->relations->add_edge( @vector );
177 my $attr = $self->relations->get_edge_attributes( @$rel );
178 $self->relations->set_edge_attributes( @vector, $attr );
179 }
180
181 # Do the deletion deed.
49d4f2ac 182 if( $combine_char ) {
183 my $kept_obj = $self->reading( $kept );
184 my $new_text = join( $combine_char, $kept_obj->text,
185 $self->reading( $deleted )->text );
186 $kept_obj->alter_text( $new_text );
187 }
3a2ebbf4 188 $self->del_reading( $deleted );
189}
7854e12e 190
3265b0ce 191
3a2ebbf4 192# Helper function for manipulating the graph.
193sub _stringify_args {
194 my( $self, $first, $second, $arg ) = @_;
195 $first = $first->id
196 if ref( $first ) eq 'Text::Tradition::Collation::Reading';
197 $second = $second->id
198 if ref( $second ) eq 'Text::Tradition::Collation::Reading';
199 return( $first, $second, $arg );
200}
df6d9812 201
3a2ebbf4 202### Path logic
203
204sub add_path {
205 my $self = shift;
206
207 # We only need the IDs for adding paths to the graph, not the reading
208 # objects themselves.
209 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
210
211 # Connect the readings
212 $self->sequence->add_edge( $source, $target );
213 # Note the witness in question
214 $self->sequence->set_edge_attribute( $source, $target, $wit, 1 );
b15511bf 215};
216
3a2ebbf4 217sub del_path {
218 my $self = shift;
49d4f2ac 219 my @args;
220 if( ref( $_[0] ) eq 'ARRAY' ) {
221 my $e = shift @_;
222 @args = ( @$e, @_ );
223 } else {
224 @args = @_;
225 }
3a2ebbf4 226
227 # We only need the IDs for adding paths to the graph, not the reading
228 # objects themselves.
49d4f2ac 229 my( $source, $target, $wit ) = $self->_stringify_args( @args );
3a2ebbf4 230
231 if( $self->sequence->has_edge_attribute( $source, $target, $wit ) ) {
49d4f2ac 232 $self->sequence->delete_edge_attribute( $source, $target, $wit );
3a2ebbf4 233 }
234 unless( keys %{$self->sequence->get_edge_attributes( $source, $target )} ) {
235 $self->sequence->delete_edge( $source, $target );
236 }
784877d9 237}
238
3a2ebbf4 239
15d2d3df 240# Extra graph-alike utility
241sub has_path {
3a2ebbf4 242 my $self = shift;
243 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
244 return undef unless $self->sequence->has_edge( $source, $target );
245 return $self->sequence->has_edge_attribute( $source, $target, $wit );
b15511bf 246}
247
3a2ebbf4 248### Relationship logic
3265b0ce 249
3a2ebbf4 250=head2 add_relationship( $reading1, $reading2, $definition )
251
252Adds the specified relationship between the two readings. A relationship
253is transitive (i.e. undirected), and must have the following attributes
254specified in the hashref $definition:
255
256=over 4
257
ad1291ee 258=item * type - Can be one of spelling, orthographic, grammatical, meaning, lexical, collated, repetition, transposition. All but the last two are only valid relationships between readings that occur at the same point in the text.
3a2ebbf4 259
260=item * non_correctable - (Optional) True if the reading would not have been corrected independently.
ef9d481f 261
3a2ebbf4 262=item * non_independent - (Optional) True if the variant is unlikely to have occurred independently in unrelated witnesses.
263
264=item * global - (Optional) A meta-attribute, to set the same relationship between readings with the same text whenever they occur in the same place.
265
266=back
267
268=cut
269
270# Wouldn't it be lovely if edges could be objects, and all this type checking
271# and attribute management could be done via Moose?
272
273sub add_relationship {
274 my $self = shift;
275 my( $source, $target, $options ) = $self->_stringify_args( @_ );
276
277 # Check the options
278 if( !defined $options->{'type'} ||
131fa659 279 $options->{'type'} !~ /^(spelling|orthographic|grammatical|meaning|lexical|collated|repetition|transposition)$/i ) {
3a2ebbf4 280 my $t = $options->{'type'} ? $options->{'type'} : '';
49d4f2ac 281 return( undef, "Invalid or missing type " . $options->{'type'} );
3a2ebbf4 282 }
49d4f2ac 283 unless ( $options->{'type'} =~ /^(repetition|transposition)$/ ) {
3a2ebbf4 284 $options->{'colocated'} = 1;
285 }
286
ef9d481f 287 # Make sure there is not another relationship between these two
94c00c71 288 # readings already
3a2ebbf4 289 if( $self->relations->has_edge( $source, $target ) ) {
290 return ( undef, "Relationship already exists between these readings" );
4cdd82f1 291 }
1d310495 292 if( !$self->relationship_valid( $source, $target, $options->{'type'} ) ) {
910a0a6d 293 return ( undef, 'Relationship creates witness loop' );
ef9d481f 294 }
295
3a2ebbf4 296 my @vector = ( $source, $target );
297 $self->relations->add_edge( @vector );
298 $self->relations->set_edge_attributes( @vector, $options );
910a0a6d 299
300 # TODO Handle global relationship setting
301
3a2ebbf4 302 return( 1, @vector );
3265b0ce 303}
304
910a0a6d 305sub relationship_valid {
1d310495 306 my( $self, $source, $target, $rel ) = @_;
307 if( $rel eq 'repetition' ) {
308 return 1;
309 } elsif ( $rel eq 'transposition' ) {
310 # Check that the two readings do not appear in the same witness.
311 my %seen_wits;
312 map { $seen_wits{$_} = 1 } $self->reading_witnesses( $source );
313 foreach my $w ( $self->reading_witnesses( $target ) ) {
314 return 0 if $seen_wits{$w};
315 }
316 return 1;
317 } else {
318 # Check that linking the source and target in a relationship won't lead
319 # to a path loop for any witness. First make a lookup table of all the
320 # readings related to either the source or the target.
321 my @proposed_related = ( $source, $target );
49d4f2ac 322 push( @proposed_related, $self->related_readings( $source, 'colocated' ) );
323 push( @proposed_related, $self->related_readings( $target, 'colocated' ) );
1d310495 324 my %pr_ids;
49d4f2ac 325 map { $pr_ids{ $_ } = 1 } @proposed_related;
1d310495 326
327 # None of these proposed related readings should have a neighbor that
328 # is also in proposed_related.
329 foreach my $pr ( keys %pr_ids ) {
330 foreach my $neighbor( $self->sequence->neighbors( $pr ) ) {
331 return 0 if exists $pr_ids{$neighbor};
332 }
333 }
334 return 1;
335 }
336}
337
338# Return a list of the witnesses in which the reading appears.
339sub reading_witnesses {
340 my( $self, $reading ) = @_;
341 # We need only check either the incoming or the outgoing edges; I have
342 # arbitrarily chosen "incoming".
343 my %all_witnesses;
344 foreach my $e ( $self->sequence->edges_to( $reading ) ) {
345 my $wits = $self->sequence->get_edge_attributes( @$e );
346 @all_witnesses{ keys %$wits } = 1;
347 }
348 return keys %all_witnesses;
910a0a6d 349}
350
3a2ebbf4 351sub related_readings {
352 my( $self, $reading, $colocated ) = @_;
a753cc84 353 my $return_object;
354 if( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
355 $reading = $reading->id;
356 $return_object = 1;
7035e3a6 357# print STDERR "Returning related objects\n";
358# } else {
359# print STDERR "Returning related object names\n";
a753cc84 360 }
3a2ebbf4 361 my @related = $self->relations->all_reachable( $reading );
362 if( $colocated ) {
363 my @colo = grep { $self->relations->has_edge_attribute( $reading, $_, 'colocated' ) } @related;
a753cc84 364 @related = @colo;
365 }
366 return $return_object ? map { $self->reading( $_ ) } @related : @related;
3a2ebbf4 367}
368
8e1394aa 369=head2 Output method(s)
370
371=over
372
373=item B<as_svg>
374
375print $graph->as_svg( $recalculate );
376
0068967c 377Returns an SVG string that represents the graph, via as_dot and graphviz.
8e1394aa 378
379=cut
380
381sub as_svg {
3a2ebbf4 382 my( $self ) = @_;
383
8e1394aa 384 my @cmd = qw/dot -Tsvg/;
385 my( $svg, $err );
910a0a6d 386 my $dotfile = File::Temp->new();
d9e873d0 387 ## TODO REMOVE
eca16057 388 # $dotfile->unlink_on_destroy(0);
910a0a6d 389 binmode $dotfile, ':utf8';
390 print $dotfile $self->as_dot();
391 push( @cmd, $dotfile->filename );
392 run( \@cmd, ">", binary(), \$svg );
393 $svg = decode_utf8( $svg );
8e1394aa 394 return $svg;
395}
396
df6d9812 397=item B<as_dot>
398
399print $graph->as_dot( $view, $recalculate );
400
401Returns a string that is the collation graph expressed in dot
402(i.e. GraphViz) format. The 'view' argument determines what kind of
403graph is produced.
404 * 'path': a graph of witness paths through the collation (DEFAULT)
405 * 'relationship': a graph of how collation readings relate to
406 each other
407
408=cut
409
410sub as_dot {
411 my( $self, $view ) = @_;
3a2ebbf4 412 $view = 'sequence' unless $view;
df6d9812 413 # TODO consider making some of these things configurable
67da8d6c 414 my $graph_name = $self->tradition->name;
415 $graph_name =~ s/[^\w\s]//g;
416 $graph_name = join( '_', split( /\s+/, $graph_name ) );
417 my $dot = sprintf( "digraph %s {\n", $graph_name );
df6d9812 418 $dot .= "\tedge [ arrowhead=open ];\n";
419 $dot .= "\tgraph [ rankdir=LR ];\n";
420 $dot .= sprintf( "\tnode [ fontsize=%d, fillcolor=%s, style=%s, shape=%s ];\n",
3a2ebbf4 421 11, "white", "filled", "ellipse" );
df6d9812 422
423 foreach my $reading ( $self->readings ) {
910a0a6d 424 # Need not output nodes without separate labels
3a2ebbf4 425 next if $reading->id eq $reading->text;
8f9cab7b 426 my $label = $reading->text;
427 $label =~ s/\"/\\\"/g;
428 $dot .= sprintf( "\t\"%s\" [ label=\"%s\" ];\n", $reading->id, $label );
df6d9812 429 }
3a2ebbf4 430
431 # TODO do something sensible for relationships
df6d9812 432
3a2ebbf4 433 my @edges = $self->paths;
df6d9812 434 foreach my $edge ( @edges ) {
910a0a6d 435 my %variables = ( 'color' => '#000000',
436 'fontcolor' => '#000000',
8f9cab7b 437 'label' => join( ', ', $self->path_display_label( $edge ) ),
910a0a6d 438 );
439 my $varopts = join( ', ', map { $_.'="'.$variables{$_}.'"' } sort keys %variables );
440 $dot .= sprintf( "\t\"%s\" -> \"%s\" [ %s ];\n",
3a2ebbf4 441 $edge->[0], $edge->[1], $varopts );
df6d9812 442 }
df6d9812 443 $dot .= "}\n";
444 return $dot;
445}
446
3a2ebbf4 447sub path_witnesses {
448 my( $self, @edge ) = @_;
449 # If edge is an arrayref, cope.
450 if( @edge == 1 && ref( $edge[0] ) eq 'ARRAY' ) {
451 my $e = shift @edge;
452 @edge = @$e;
453 }
454 my @wits = keys %{$self->sequence->get_edge_attributes( @edge )};
455 return sort @wits;
456}
457
8f9cab7b 458sub path_display_label {
459 my( $self, $edge ) = @_;
460 my @wits = $self->path_witnesses( $edge );
461 my $maj = scalar( $self->tradition->witnesses ) * 0.6;
462 if( scalar @wits > $maj ) {
463 return 'majority';
464 } else {
465 return join( ', ', @wits );
466 }
467}
468
469
8e1394aa 470=item B<as_graphml>
471
472print $graph->as_graphml( $recalculate )
473
474Returns a GraphML representation of the collation graph, with
475transposition information and position information. Unless
476$recalculate is passed (and is a true value), the method will return a
477cached copy of the SVG after the first call to the method.
478
479=cut
480
481sub as_graphml {
3a2ebbf4 482 my( $self ) = @_;
8e1394aa 483
484 # Some namespaces
485 my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns';
486 my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance';
487 my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' .
910a0a6d 488 'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
8e1394aa 489
490 # Create the document and root node
491 my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
492 my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
493 $graphml->setDocumentElement( $root );
494 $root->setNamespace( $xsi_ns, 'xsi', 0 );
495 $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
496
e309421a 497 # Add the data keys for the graph
498 my %graph_data_keys;
499 my $gdi = 0;
1d310495 500 my @graph_attributes = qw/ version wit_list_separator baselabel linear ac_label /;
e309421a 501 foreach my $datum ( @graph_attributes ) {
502 $graph_data_keys{$datum} = 'dg'.$gdi++;
503 my $key = $root->addNewChild( $graphml_ns, 'key' );
504 $key->setAttribute( 'attr.name', $datum );
505 $key->setAttribute( 'attr.type', $key eq 'linear' ? 'boolean' : 'string' );
506 $key->setAttribute( 'for', 'graph' );
507 $key->setAttribute( 'id', $graph_data_keys{$datum} );
508 }
f6066bac 509
8e1394aa 510 # Add the data keys for nodes
ef9d481f 511 my %node_data_keys;
512 my $ndi = 0;
3a2ebbf4 513 my %node_data = (
514 id => 'string',
255875b8 515 text => 'string',
3a2ebbf4 516 rank => 'string',
517 is_start => 'boolean',
518 is_end => 'boolean',
519 is_lacuna => 'boolean',
520 );
521 foreach my $datum ( keys %node_data ) {
910a0a6d 522 $node_data_keys{$datum} = 'dn'.$ndi++;
523 my $key = $root->addNewChild( $graphml_ns, 'key' );
524 $key->setAttribute( 'attr.name', $datum );
3a2ebbf4 525 $key->setAttribute( 'attr.type', $node_data{$datum} );
910a0a6d 526 $key->setAttribute( 'for', 'node' );
527 $key->setAttribute( 'id', $node_data_keys{$datum} );
8e1394aa 528 }
529
df6d9812 530 # Add the data keys for edges, i.e. witnesses
ef9d481f 531 my $edi = 0;
532 my %edge_data_keys;
3a2ebbf4 533 my %edge_data = (
3a2ebbf4 534 witness => 'string', # ID/label for a path
535 relationship => 'string', # ID/label for a relationship
536 extra => 'boolean', # Path key
537 colocated => 'boolean', # Relationship key
538 non_correctable => 'boolean', # Relationship key
539 non_independent => 'boolean', # Relationship key
540 );
541 foreach my $datum ( keys %edge_data ) {
542 $edge_data_keys{$datum} = 'de'.$edi++;
910a0a6d 543 my $key = $root->addNewChild( $graphml_ns, 'key' );
3a2ebbf4 544 $key->setAttribute( 'attr.name', $datum );
545 $key->setAttribute( 'attr.type', $edge_data{$datum} );
910a0a6d 546 $key->setAttribute( 'for', 'edge' );
3a2ebbf4 547 $key->setAttribute( 'id', $edge_data_keys{$datum} );
8e1394aa 548 }
3a2ebbf4 549
2c669bca 550 # Add the collation graphs themselves
551 my $sgraph = $root->addNewChild( $graphml_ns, 'graph' );
552 $sgraph->setAttribute( 'edgedefault', 'directed' );
553 $sgraph->setAttribute( 'id', $self->tradition->name );
554 $sgraph->setAttribute( 'parse.edgeids', 'canonical' );
555 $sgraph->setAttribute( 'parse.edges', scalar($self->paths) );
556 $sgraph->setAttribute( 'parse.nodeids', 'canonical' );
557 $sgraph->setAttribute( 'parse.nodes', scalar($self->readings) );
558 $sgraph->setAttribute( 'parse.order', 'nodesfirst' );
559
560 my $rgraph = $root->addNewChild( $graphml_ns, 'graph' );
561 $rgraph->setAttribute( 'edgedefault', 'undirected' );
562 $rgraph->setAttribute( 'id', 'relationships' );
563 $rgraph->setAttribute( 'parse.edgeids', 'canonical' );
564 $rgraph->setAttribute( 'parse.edges', scalar($self->relationships) );
565 $rgraph->setAttribute( 'parse.nodeids', 'canonical' );
566 $rgraph->setAttribute( 'parse.nodes', scalar($self->readings) );
567 $rgraph->setAttribute( 'parse.order', 'nodesfirst' );
e309421a 568
569 # Collation attribute data
570 foreach my $datum ( @graph_attributes ) {
2c669bca 571 my $value = $datum eq 'version' ? '3.0' : $self->$datum;
572 _add_graphml_data( $sgraph, $graph_data_keys{$datum}, $value );
e309421a 573 }
8e1394aa 574
575 my $node_ctr = 0;
576 my %node_hash;
2c669bca 577 # Add our readings to the graphs
3a2ebbf4 578 foreach my $n ( sort { $a->id cmp $b->id } $self->readings ) {
2c669bca 579 # Add to the main graph
580 my $node_el = $sgraph->addNewChild( $graphml_ns, 'node' );
910a0a6d 581 my $node_xmlid = 'n' . $node_ctr++;
3a2ebbf4 582 $node_hash{ $n->id } = $node_xmlid;
910a0a6d 583 $node_el->setAttribute( 'id', $node_xmlid );
255875b8 584 foreach my $d ( keys %node_data ) {
585 my $nval = $n->$d;
586 _add_graphml_data( $node_el, $node_data_keys{$d}, $nval )
587 if defined $nval;
588 }
2c669bca 589 # Add to the relationships graph
590 my $rnode_el = $rgraph->addNewChild( $graphml_ns, 'node' );
591 $rnode_el->setAttribute( 'id', $node_xmlid );
b15511bf 592 }
593
2c669bca 594 # Add the path edges to the sequence graph
df6d9812 595 my $edge_ctr = 0;
3a2ebbf4 596 foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->sequence->edges() ) {
597 # We add an edge in the graphml for every witness in $e.
598 foreach my $wit ( $self->path_witnesses( $e ) ) {
599 my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
600 $node_hash{ $e->[0] },
601 $node_hash{ $e->[1] } );
2c669bca 602 my $edge_el = $sgraph->addNewChild( $graphml_ns, 'edge' );
3a2ebbf4 603 $edge_el->setAttribute( 'source', $from );
604 $edge_el->setAttribute( 'target', $to );
605 $edge_el->setAttribute( 'id', $id );
3a2ebbf4 606
607 # It's a witness path, so add the witness
608 my $base = $wit;
609 my $key = $edge_data_keys{'witness'};
610 # Is this an ante-corr witness?
611 my $aclabel = $self->ac_label;
612 if( $wit =~ /^(.*)\Q$aclabel\E$/ ) {
613 # Keep the base witness
614 $base = $1;
615 # ...and record that this is an 'extra' reading path
616 _add_graphml_data( $edge_el, $edge_data_keys{'extra'}, $aclabel );
617 }
618 _add_graphml_data( $edge_el, $edge_data_keys{'witness'}, $base );
619 }
620 }
621
2c669bca 622 # Add the relationship edges to the relationships graph
3a2ebbf4 623 foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->relationships ) {
624 my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
625 $node_hash{ $e->[0] },
626 $node_hash{ $e->[1] } );
2c669bca 627 my $edge_el = $rgraph->addNewChild( $graphml_ns, 'edge' );
3a2ebbf4 628 $edge_el->setAttribute( 'source', $from );
629 $edge_el->setAttribute( 'target', $to );
630 $edge_el->setAttribute( 'id', $id );
3a2ebbf4 631
632 my $data = $self->relations->get_edge_attributes( @$e );
633 # It's a relationship, so save the relationship data
634 _add_graphml_data( $edge_el, $edge_data_keys{'relationship'}, $data->{type} );
635 _add_graphml_data( $edge_el, $edge_data_keys{'colocated'}, $data->{colocated} );
636 if( exists $data->{non_correctable} ) {
637 _add_graphml_data( $edge_el, $edge_data_keys{'non_correctable'},
638 $data->{non_correctable} );
639 }
640 if( exists $data->{non_independent} ) {
641 _add_graphml_data( $edge_el, $edge_data_keys{'non_independent'},
642 $data->{non_independent} );
643 }
8e1394aa 644 }
645
94c00c71 646 # Save and return the thing
647 my $result = decode_utf8( $graphml->toString(1) );
94c00c71 648 return $result;
df6d9812 649}
650
b15511bf 651sub _add_graphml_data {
652 my( $el, $key, $value ) = @_;
b15511bf 653 return unless defined $value;
c9bf3dbf 654 my $data_el = $el->addNewChild( $el->namespaceURI, 'data' );
b15511bf 655 $data_el->setAttribute( 'key', $key );
656 $data_el->appendText( $value );
8e1394aa 657}
658
910a0a6d 659=item B<as_csv>
660
661print $graph->as_csv( $recalculate )
662
663Returns a CSV alignment table representation of the collation graph, one
2c669bca 664row per witness (or witness uncorrected.)
910a0a6d 665
666=cut
667
668sub as_csv {
3a2ebbf4 669 my( $self ) = @_;
910a0a6d 670 my $table = $self->make_alignment_table;
671 my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } );
672 my @result;
2c669bca 673 # Make the header row
674 $csv->combine( map { $_->{'witness'} } @{$table->{'alignment'}} );
675 push( @result, decode_utf8( $csv->string ) );
676 # Make the rest of the rows
677 foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
678 my @rowobjs = map { $_->[$idx] } @{$table->{'alignment'}};
679 my @row = map { $_ ? $_->{'t'} : $_ } @rowobjs;
680 $csv->combine( @row );
910a0a6d 681 push( @result, decode_utf8( $csv->string ) );
682 }
3a2ebbf4 683 return join( "\n", @result );
910a0a6d 684}
685
2c669bca 686=item B<make_alignment_table>
687
688my $table = $graph->make_alignment_table( $use_refs, \@wits_to_include )
689
690Return a reference to an alignment table, in the format described at
691L<http://gregor.middell.net/collatex>. If $use_refs is set to 1, the reading
692object is returned in the table; if not, the text of the reading is returned.
693If $wits_to_include is set to an arrayref, only the witnesses listed will be
694included in the table.
695
696=cut
9f3ba6f7 697
910a0a6d 698sub make_alignment_table {
08e0fb85 699 my( $self, $noderefs, $include ) = @_;
910a0a6d 700 unless( $self->linear ) {
701 warn "Need a linear graph in order to make an alignment table";
702 return;
703 }
2c669bca 704 my $table = { 'alignment' => [], 'length' => $self->end->rank - 1 };
3a2ebbf4 705 my @all_pos = ( 1 .. $self->end->rank - 1 );
910a0a6d 706 foreach my $wit ( $self->tradition->witnesses ) {
2c669bca 707 if( $include ) {
708 next unless grep { $_ eq $wit->sigil } @$include;
709 }
710 $DB::single = 1 if $wit->sigil eq 'U';
eca16057 711 # print STDERR "Making witness row(s) for " . $wit->sigil . "\n";
1f7aa795 712 my @wit_path = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
713 my @row = _make_witness_row( \@wit_path, \@all_pos, $noderefs );
2c669bca 714 push( @{$table->{'alignment'}},
715 { 'witness' => $wit->sigil, 'tokens' => \@row } );
1f7aa795 716 if( $wit->is_layered ) {
717 my @wit_ac_path = $self->reading_sequence( $self->start, $self->end,
718 $wit->sigil.$self->ac_label, $wit->sigil );
719 my @ac_row = _make_witness_row( \@wit_ac_path, \@all_pos, $noderefs );
2c669bca 720 push( @{$table->{'alignment'}},
721 { 'witness' => $wit->sigil.$self->ac_label, 'tokens' => \@ac_row } );
910a0a6d 722 }
723 }
2c669bca 724 return $table;
910a0a6d 725}
726
727sub _make_witness_row {
0e476982 728 my( $path, $positions, $noderefs ) = @_;
910a0a6d 729 my %char_hash;
730 map { $char_hash{$_} = undef } @$positions;
2c669bca 731 my $debug = 0;
910a0a6d 732 foreach my $rdg ( @$path ) {
eca16057 733 my $rtext = $rdg->text;
734 $rtext = '#LACUNA#' if $rdg->is_lacuna;
2c669bca 735 print STDERR "rank " . $rdg->rank . "\n" if $debug;
3a2ebbf4 736 # print STDERR "No rank for " . $rdg->id . "\n" unless defined $rdg->rank;
2c669bca 737 $char_hash{$rdg->rank} = $noderefs ? { 't' => $rdg }
738 : { 't' => $rtext };
910a0a6d 739 }
740 my @row = map { $char_hash{$_} } @$positions;
eca16057 741 # Fill in lacuna markers for undef spots in the row
742 my $last_el = shift @row;
743 my @filled_row = ( $last_el );
744 foreach my $el ( @row ) {
0e476982 745 # If we are using node reference, make the lacuna node appear many times
746 # in the table. If not, use the lacuna tag.
747 if( $last_el && _el_is_lacuna( $last_el ) && !defined $el ) {
2c669bca 748 $el = $noderefs ? { 't' => $last_el } : { 't' => '#LACUNA#' };
eca16057 749 }
750 push( @filled_row, $el );
751 $last_el = $el;
752 }
753 return @filled_row;
910a0a6d 754}
755
0e476982 756# Tiny utility function to say if a table element is a lacuna
757sub _el_is_lacuna {
758 my $el = shift;
2c669bca 759 return 1 if $el->{'t'} eq '#LACUNA#';
760 return 1 if ref( $el->{'t'} ) eq 'Text::Tradition::Collation::Reading'
761 && $el->{'t'}->is_lacuna;
0e476982 762 return 0;
763}
764
910a0a6d 765# Helper to turn the witnesses along columns rather than rows. Assumes
766# equal-sized rows.
767sub _turn_table {
768 my( $table ) = @_;
769 my $result = [];
770 return $result unless scalar @$table;
771 my $nrows = scalar @{$table->[0]};
772 foreach my $idx ( 0 .. $nrows - 1 ) {
773 foreach my $wit ( 0 .. $#{$table} ) {
774 $result->[$idx]->[$wit] = $table->[$wit]->[$idx];
775 }
776 }
777 return $result;
778}
779
8e1394aa 780=back
781
de51424a 782=head2 Navigation methods
783
784=over
785
8e1394aa 786=item B<start>
787
788my $beginning = $collation->start();
789
790Returns the beginning of the collation, a meta-reading with label '#START#'.
791
910a0a6d 792=item B<end>
793
794my $end = $collation->end();
795
796Returns the end of the collation, a meta-reading with label '#END#'.
797
910a0a6d 798
e2902068 799=item B<reading_sequence>
800
801my @readings = $graph->reading_sequence( $first, $last, $path[, $alt_path] );
802
803Returns the ordered list of readings, starting with $first and ending
804with $last, along the given witness path. If no path is specified,
805assume that the path is that of the base text (if any.)
806
807=cut
808
910a0a6d 809# TODO Think about returning some lazy-eval iterator.
810
e2902068 811sub reading_sequence {
812 my( $self, $start, $end, $witness, $backup ) = @_;
813
930ff666 814 $witness = $self->baselabel unless $witness;
e2902068 815 my @readings = ( $start );
816 my %seen;
817 my $n = $start;
3a2ebbf4 818 while( $n && $n->id ne $end->id ) {
819 if( exists( $seen{$n->id} ) ) {
820 warn "Detected loop at " . $n->id;
910a0a6d 821 last;
822 }
3a2ebbf4 823 $seen{$n->id} = 1;
910a0a6d 824
825 my $next = $self->next_reading( $n, $witness, $backup );
44771cf2 826 unless( $next ) {
3a2ebbf4 827 warn "Did not find any path for $witness from reading " . $n->id;
44771cf2 828 last;
829 }
910a0a6d 830 push( @readings, $next );
831 $n = $next;
e2902068 832 }
833 # Check that the last reading is our end reading.
834 my $last = $readings[$#readings];
3a2ebbf4 835 warn "Last reading found from " . $start->text .
910a0a6d 836 " for witness $witness is not the end!"
3a2ebbf4 837 unless $last->id eq $end->id;
e2902068 838
839 return @readings;
840}
841
4a8828f0 842=item B<next_reading>
8e1394aa 843
4a8828f0 844my $next_reading = $graph->next_reading( $reading, $witpath );
8e1394aa 845
4a8828f0 846Returns the reading that follows the given reading along the given witness
930ff666 847path.
8e1394aa 848
849=cut
850
4a8828f0 851sub next_reading {
e2902068 852 # Return the successor via the corresponding path.
8e1394aa 853 my $self = shift;
3a2ebbf4 854 my $answer = $self->_find_linked_reading( 'next', @_ );
2c669bca 855 return undef unless $answer;
3a2ebbf4 856 return $self->reading( $answer );
8e1394aa 857}
858
4a8828f0 859=item B<prior_reading>
8e1394aa 860
4a8828f0 861my $prior_reading = $graph->prior_reading( $reading, $witpath );
8e1394aa 862
4a8828f0 863Returns the reading that precedes the given reading along the given witness
930ff666 864path.
8e1394aa 865
866=cut
867
4a8828f0 868sub prior_reading {
e2902068 869 # Return the predecessor via the corresponding path.
8e1394aa 870 my $self = shift;
3a2ebbf4 871 my $answer = $self->_find_linked_reading( 'prior', @_ );
872 return $self->reading( $answer );
8e1394aa 873}
874
4a8828f0 875sub _find_linked_reading {
e2902068 876 my( $self, $direction, $node, $path, $alt_path ) = @_;
877 my @linked_paths = $direction eq 'next'
3a2ebbf4 878 ? $self->sequence->edges_from( $node )
879 : $self->sequence->edges_to( $node );
e2902068 880 return undef unless scalar( @linked_paths );
8e1394aa 881
e2902068 882 # We have to find the linked path that contains all of the
883 # witnesses supplied in $path.
884 my( @path_wits, @alt_path_wits );
3a2ebbf4 885 @path_wits = sort( $self->witnesses_of_label( $path ) ) if $path;
886 @alt_path_wits = sort( $self->witnesses_of_label( $alt_path ) ) if $alt_path;
e2902068 887 my $base_le;
888 my $alt_le;
889 foreach my $le ( @linked_paths ) {
3a2ebbf4 890 if( $self->sequence->has_edge_attribute( @$le, $self->baselabel ) ) {
910a0a6d 891 $base_le = $le;
910a0a6d 892 }
3a2ebbf4 893 my @le_wits = $self->path_witnesses( $le );
894 if( _is_within( \@path_wits, \@le_wits ) ) {
895 # This is the right path.
896 return $direction eq 'next' ? $le->[1] : $le->[0];
897 } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) {
898 $alt_le = $le;
899 }
8e1394aa 900 }
e2902068 901 # Got this far? Return the alternate path if it exists.
3a2ebbf4 902 return $direction eq 'next' ? $alt_le->[1] : $alt_le->[0]
910a0a6d 903 if $alt_le;
e2902068 904
905 # Got this far? Return the base path if it exists.
3a2ebbf4 906 return $direction eq 'next' ? $base_le->[1] : $base_le->[0]
910a0a6d 907 if $base_le;
e2902068 908
909 # Got this far? We have no appropriate path.
2c669bca 910 warn "Could not find $direction node from " . $node->id
910a0a6d 911 . " along path $path";
8e1394aa 912 return undef;
913}
914
4a8828f0 915# Some set logic.
916sub _is_within {
917 my( $set1, $set2 ) = @_;
7854e12e 918 my $ret = @$set1; # will be 0, i.e. false, if set1 is empty
4a8828f0 919 foreach my $el ( @$set1 ) {
910a0a6d 920 $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
4a8828f0 921 }
922 return $ret;
923}
924
de51424a 925
926## INITIALIZATION METHODS - for use by parsers
930ff666 927
7e450e44 928# For use when a collation is constructed from a base text and an apparatus.
929# We have the sequences of readings and just need to add path edges.
1f7aa795 930# When we are done, clear out the witness path attributes, as they are no
931# longer needed.
932# TODO Find a way to replace the witness path attributes with encapsulated functions?
e2902068 933
6a222840 934sub make_witness_paths {
935 my( $self ) = @_;
910a0a6d 936 foreach my $wit ( $self->tradition->witnesses ) {
0068967c 937 # print STDERR "Making path for " . $wit->sigil . "\n";
910a0a6d 938 $self->make_witness_path( $wit );
7854e12e 939 }
7854e12e 940}
941
6a222840 942sub make_witness_path {
7854e12e 943 my( $self, $wit ) = @_;
944 my @chain = @{$wit->path};
15d2d3df 945 my $sig = $wit->sigil;
7854e12e 946 foreach my $idx ( 0 .. $#chain-1 ) {
910a0a6d 947 $self->add_path( $chain[$idx], $chain[$idx+1], $sig );
7854e12e 948 }
1f7aa795 949 if( $wit->is_layered ) {
d9e873d0 950 @chain = @{$wit->uncorrected_path};
951 foreach my $idx( 0 .. $#chain-1 ) {
952 my $source = $chain[$idx];
953 my $target = $chain[$idx+1];
954 $self->add_path( $source, $target, $sig.$self->ac_label )
955 unless $self->has_path( $source, $target, $sig );
956 }
15d2d3df 957 }
1f7aa795 958 $wit->clear_path;
959 $wit->clear_uncorrected_path;
e2902068 960}
961
910a0a6d 962sub calculate_ranks {
963 my $self = shift;
964 # Walk a version of the graph where every node linked by a relationship
965 # edge is fundamentally the same node, and do a topological ranking on
966 # the nodes in this graph.
c9bf3dbf 967 my $topo_graph = Graph->new();
910a0a6d 968 my %rel_containers;
969 my $rel_ctr = 0;
970 # Add the nodes
971 foreach my $r ( $self->readings ) {
3a2ebbf4 972 next if exists $rel_containers{$r->id};
910a0a6d 973 my @rels = $r->related_readings( 'colocated' );
974 if( @rels ) {
975 # Make a relationship container.
976 push( @rels, $r );
c9bf3dbf 977 my $rn = 'rel_container_' . $rel_ctr++;
978 $topo_graph->add_vertex( $rn );
910a0a6d 979 foreach( @rels ) {
3a2ebbf4 980 $rel_containers{$_->id} = $rn;
910a0a6d 981 }
982 } else {
983 # Add a new node to mirror the old node.
3a2ebbf4 984 $rel_containers{$r->id} = $r->id;
985 $topo_graph->add_vertex( $r->id );
910a0a6d 986 }
4a8828f0 987 }
3a1f2523 988
3a2ebbf4 989 # Add the edges.
910a0a6d 990 foreach my $r ( $self->readings ) {
3a2ebbf4 991 foreach my $n ( $self->sequence->successors( $r->id ) ) {
992 my( $tfrom, $tto ) = ( $rel_containers{$r->id},
993 $rel_containers{$n} );
49d4f2ac 994 $DB::single = 1 unless $tfrom && $tto;
3a2ebbf4 995 $topo_graph->add_edge( $tfrom, $tto );
910a0a6d 996 }
997 }
998
999 # Now do the rankings, starting with the start node.
3a2ebbf4 1000 my $topo_start = $rel_containers{$self->start->id};
c9bf3dbf 1001 my $node_ranks = { $topo_start => 0 };
910a0a6d 1002 my @curr_origin = ( $topo_start );
1003 # A little iterative function.
1004 while( @curr_origin ) {
c9bf3dbf 1005 @curr_origin = _assign_rank( $topo_graph, $node_ranks, @curr_origin );
910a0a6d 1006 }
1007 # Transfer our rankings from the topological graph to the real one.
1008 foreach my $r ( $self->readings ) {
3a2ebbf4 1009 if( defined $node_ranks->{$rel_containers{$r->id}} ) {
1010 $r->rank( $node_ranks->{$rel_containers{$r->id}} );
67da8d6c 1011 } else {
1012 $DB::single = 1;
3a2ebbf4 1013 die "No rank calculated for node " . $r->id
67da8d6c 1014 . " - do you have a cycle in the graph?";
1015 }
de51424a 1016 }
8e1394aa 1017}
3a1f2523 1018
910a0a6d 1019sub _assign_rank {
c9bf3dbf 1020 my( $graph, $node_ranks, @current_nodes ) = @_;
910a0a6d 1021 # Look at each of the children of @current_nodes. If all the child's
1022 # parents have a rank, assign it the highest rank + 1 and add it to
c9bf3dbf 1023 # @next_nodes. Otherwise skip it; we will return when the highest-ranked
1024 # parent gets a rank.
910a0a6d 1025 my @next_nodes;
1026 foreach my $c ( @current_nodes ) {
c9bf3dbf 1027 warn "Current reading $c has no rank!"
1028 unless exists $node_ranks->{$c};
1029 # print STDERR "Looking at child of node $c, rank "
1030 # . $node_ranks->{$c} . "\n";
1031 foreach my $child ( $graph->successors( $c ) ) {
1032 next if exists $node_ranks->{$child};
910a0a6d 1033 my $highest_rank = -1;
1034 my $skip = 0;
c9bf3dbf 1035 foreach my $parent ( $graph->predecessors( $child ) ) {
1036 if( exists $node_ranks->{$parent} ) {
1037 $highest_rank = $node_ranks->{$parent}
1038 if $highest_rank <= $node_ranks->{$parent};
910a0a6d 1039 } else {
1040 $skip = 1;
1041 last;
1042 }
1043 }
1044 next if $skip;
c9bf3dbf 1045 my $c_rank = $highest_rank + 1;
1046 # print STDERR "Assigning rank $c_rank to node $child \n";
1047 $node_ranks->{$child} = $c_rank;
910a0a6d 1048 push( @next_nodes, $child );
1049 }
1050 }
1051 return @next_nodes;
4cdd82f1 1052}
910a0a6d 1053
0e476982 1054# Another method to make up for rough collation methods. If the same reading
1055# appears multiple times at the same rank, collapse the nodes.
1056sub flatten_ranks {
1057 my $self = shift;
1058 my %unique_rank_rdg;
1059 foreach my $rdg ( $self->readings ) {
1060 next unless $rdg->has_rank;
1061 my $key = $rdg->rank . "||" . $rdg->text;
1062 if( exists $unique_rank_rdg{$key} ) {
1063 # Combine!
0068967c 1064 # print STDERR "Combining readings at same rank: $key\n";
0e476982 1065 $self->merge_readings( $unique_rank_rdg{$key}, $rdg );
1066 } else {
1067 $unique_rank_rdg{$key} = $rdg;
1068 }
1069 }
1070}
1071
1072
fa954f4c 1073## Utility functions
de51424a 1074
4a8828f0 1075# Return the string that joins together a list of witnesses for
1076# display on a single path.
4a8828f0 1077sub witnesses_of_label {
de51424a 1078 my( $self, $label ) = @_;
4a8828f0 1079 my $regex = $self->wit_list_separator;
de51424a 1080 my @answer = split( /\Q$regex\E/, $label );
1081 return @answer;
4a8828f0 1082}
8e1394aa 1083
dd3b58b0 1084no Moose;
1085__PACKAGE__->meta->make_immutable;
e867486f 1086
1087=head1 BUGS / TODO
1088
1089=over
1090
0068967c 1091=item * Think about making Relationship objects again
e867486f 1092
1093=back