fix some bugs for alignment table with object refs
[scpubgit/stemmatology.git] / lib / Text / Tradition / Collation.pm
CommitLineData
dd3b58b0 1package Text::Tradition::Collation;
d047cd52 2
910a0a6d 3use Encode qw( decode_utf8 );
4use File::Temp;
c9bf3dbf 5use Graph;
8e1394aa 6use IPC::Run qw( run binary );
910a0a6d 7use Text::CSV_XS;
b15511bf 8use Text::Tradition::Collation::Reading;
df6d9812 9use XML::LibXML;
dd3b58b0 10use Moose;
11
3a2ebbf4 12has 'sequence' => (
d047cd52 13 is => 'ro',
3a2ebbf4 14 isa => 'Graph',
15 default => sub { Graph->new() },
d047cd52 16 handles => {
3a2ebbf4 17 paths => 'edges',
d047cd52 18 },
d047cd52 19 );
3a2ebbf4 20
21has 'relations' => (
22 is => 'ro',
23 isa => 'Graph',
24 default => sub { Graph->new( undirected => 1 ) },
25 handles => {
26 relationships => 'edges',
27 },
28 );
dd3b58b0 29
3a2ebbf4 30has 'tradition' => (
31 is => 'ro',
d047cd52 32 isa => 'Text::Tradition',
8d9a1cd8 33 weak_ref => 1,
d047cd52 34 );
dd3b58b0 35
3a2ebbf4 36has 'readings' => (
37 isa => 'HashRef[Text::Tradition::Collation::Reading]',
38 traits => ['Hash'],
39 handles => {
40 reading => 'get',
41 _add_reading => 'set',
42 del_reading => 'delete',
43 has_reading => 'exists',
44 readings => 'values',
45 },
46 default => sub { {} },
47 );
910a0a6d 48
4a8828f0 49has 'wit_list_separator' => (
7854e12e 50 is => 'rw',
51 isa => 'Str',
52 default => ', ',
53 );
54
55has 'baselabel' => (
56 is => 'rw',
57 isa => 'Str',
58 default => 'base text',
59 );
4a8828f0 60
15d2d3df 61has 'linear' => (
62 is => 'rw',
63 isa => 'Bool',
64 default => 1,
65 );
1f563ac3 66
ef9d481f 67has 'ac_label' => (
68 is => 'rw',
69 isa => 'Str',
70 default => ' (a.c.)',
71 );
3a2ebbf4 72
73has 'start' => (
74 is => 'ro',
75 isa => 'Text::Tradition::Collation::Reading',
76 writer => '_set_start',
77 weak_ref => 1,
78 );
79
80has 'end' => (
81 is => 'ro',
82 isa => 'Text::Tradition::Collation::Reading',
83 writer => '_set_end',
84 weak_ref => 1,
85 );
1f563ac3 86
dd3b58b0 87# The collation can be created two ways:
88# 1. Collate a set of witnesses (with CollateX I guess) and process
89# the results as in 2.
90# 2. Read a pre-prepared collation in one of a variety of formats,
91# and make the graph from that.
92
93# The graph itself will (for now) be immutable, and the positions
94# within the graph will also be immutable. We need to calculate those
95# positions upon graph construction. The equivalences between graph
96# nodes will be mutable, entirely determined by the user (or possibly
97# by some semantic pre-processing provided by the user.) So the
98# constructor should just make an empty equivalences object. The
99# constructor will also need to make the witness objects, if we didn't
100# come through option 1.
101
d047cd52 102sub BUILD {
3a2ebbf4 103 my $self = shift;
104 $self->_set_start( $self->add_reading( { 'collation' => $self, 'is_start' => 1 } ) );
105 $self->_set_end( $self->add_reading( { 'collation' => $self, 'is_end' => 1 } ) );
d047cd52 106}
784877d9 107
3a2ebbf4 108### Reading construct/destruct functions
109
110sub add_reading {
111 my( $self, $reading ) = @_;
112 unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
113 my %args = %$reading;
114 $reading = Text::Tradition::Collation::Reading->new(
115 'collation' => $self,
116 %args );
117 }
118 # First check to see if a reading with this ID exists.
119 if( $self->reading( $reading->id ) ) {
120 warn "Collation already has a reading with id " . $reading->id;
121 return undef;
122 }
123 $self->_add_reading( $reading->id => $reading );
124 # Once the reading has been added, put it in both graphs.
125 $self->sequence->add_vertex( $reading->id );
126 $self->relations->add_vertex( $reading->id );
127 return $reading;
eca16057 128};
129
3a2ebbf4 130around del_reading => sub {
131 my $orig = shift;
132 my $self = shift;
133 my $arg = shift;
134
135 if( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
136 $arg = $arg->id;
137 }
3a2ebbf4 138 # Remove the reading from the graphs.
139 $self->sequence->delete_vertex( $arg );
140 $self->relations->delete_vertex( $arg );
141
142 # Carry on.
143 $self->$orig( $arg );
144};
7854e12e 145
3a2ebbf4 146# merge_readings( $main, $to_be_deleted );
7854e12e 147
3a2ebbf4 148sub merge_readings {
149 my $self = shift;
150
151 # We only need the IDs for adding paths to the graph, not the reading
152 # objects themselves.
49d4f2ac 153 my( $kept, $deleted, $combine_char ) = $self->_stringify_args( @_ );
3a2ebbf4 154
155 # The kept reading should inherit the paths and the relationships
156 # of the deleted reading.
157 foreach my $path ( $self->sequence->edges_at( $deleted ) ) {
158 my @vector = ( $kept );
159 push( @vector, $path->[1] ) if $path->[0] eq $deleted;
160 unshift( @vector, $path->[0] ) if $path->[1] eq $deleted;
49d4f2ac 161 next if $vector[0] eq $vector[1]; # Don't add a self loop
3a2ebbf4 162 my %wits = %{$self->sequence->get_edge_attributes( @$path )};
163 $self->sequence->add_edge( @vector );
164 my $fwits = $self->sequence->get_edge_attributes( @vector );
165 @wits{keys %$fwits} = values %$fwits;
166 $self->sequence->set_edge_attributes( @vector, \%wits );
167 }
168 foreach my $rel ( $self->relations->edges_at( $deleted ) ) {
169 my @vector = ( $kept );
170 push( @vector, $rel->[0] eq $deleted ? $rel->[1] : $rel->[0] );
49d4f2ac 171 next if $vector[0] eq $vector[1]; # Don't add a self loop
3a2ebbf4 172 # Is there a relationship here already? If so, keep it.
173 # TODO Warn about conflicting relationships
174 next if $self->relations->has_edge( @vector );
175 # If not, adopt the relationship that would be deleted.
176 $self->relations->add_edge( @vector );
177 my $attr = $self->relations->get_edge_attributes( @$rel );
178 $self->relations->set_edge_attributes( @vector, $attr );
179 }
180
181 # Do the deletion deed.
49d4f2ac 182 if( $combine_char ) {
183 my $kept_obj = $self->reading( $kept );
184 my $new_text = join( $combine_char, $kept_obj->text,
185 $self->reading( $deleted )->text );
186 $kept_obj->alter_text( $new_text );
187 }
3a2ebbf4 188 $self->del_reading( $deleted );
189}
7854e12e 190
3265b0ce 191
3a2ebbf4 192# Helper function for manipulating the graph.
193sub _stringify_args {
194 my( $self, $first, $second, $arg ) = @_;
195 $first = $first->id
196 if ref( $first ) eq 'Text::Tradition::Collation::Reading';
197 $second = $second->id
198 if ref( $second ) eq 'Text::Tradition::Collation::Reading';
199 return( $first, $second, $arg );
200}
df6d9812 201
3a2ebbf4 202### Path logic
203
204sub add_path {
205 my $self = shift;
206
207 # We only need the IDs for adding paths to the graph, not the reading
208 # objects themselves.
209 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
210
211 # Connect the readings
212 $self->sequence->add_edge( $source, $target );
213 # Note the witness in question
214 $self->sequence->set_edge_attribute( $source, $target, $wit, 1 );
b15511bf 215};
216
3a2ebbf4 217sub del_path {
218 my $self = shift;
49d4f2ac 219 my @args;
220 if( ref( $_[0] ) eq 'ARRAY' ) {
221 my $e = shift @_;
222 @args = ( @$e, @_ );
223 } else {
224 @args = @_;
225 }
3a2ebbf4 226
227 # We only need the IDs for adding paths to the graph, not the reading
228 # objects themselves.
49d4f2ac 229 my( $source, $target, $wit ) = $self->_stringify_args( @args );
3a2ebbf4 230
231 if( $self->sequence->has_edge_attribute( $source, $target, $wit ) ) {
49d4f2ac 232 $self->sequence->delete_edge_attribute( $source, $target, $wit );
3a2ebbf4 233 }
234 unless( keys %{$self->sequence->get_edge_attributes( $source, $target )} ) {
235 $self->sequence->delete_edge( $source, $target );
236 }
784877d9 237}
238
3a2ebbf4 239
15d2d3df 240# Extra graph-alike utility
241sub has_path {
3a2ebbf4 242 my $self = shift;
243 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
244 return undef unless $self->sequence->has_edge( $source, $target );
245 return $self->sequence->has_edge_attribute( $source, $target, $wit );
b15511bf 246}
247
3a2ebbf4 248### Relationship logic
3265b0ce 249
3a2ebbf4 250=head2 add_relationship( $reading1, $reading2, $definition )
251
252Adds the specified relationship between the two readings. A relationship
253is transitive (i.e. undirected), and must have the following attributes
254specified in the hashref $definition:
255
256=over 4
257
ad1291ee 258=item * type - Can be one of spelling, orthographic, grammatical, meaning, lexical, collated, repetition, transposition. All but the last two are only valid relationships between readings that occur at the same point in the text.
3a2ebbf4 259
260=item * non_correctable - (Optional) True if the reading would not have been corrected independently.
ef9d481f 261
3a2ebbf4 262=item * non_independent - (Optional) True if the variant is unlikely to have occurred independently in unrelated witnesses.
263
264=item * global - (Optional) A meta-attribute, to set the same relationship between readings with the same text whenever they occur in the same place.
265
266=back
267
268=cut
269
270# Wouldn't it be lovely if edges could be objects, and all this type checking
271# and attribute management could be done via Moose?
272
273sub add_relationship {
274 my $self = shift;
275 my( $source, $target, $options ) = $self->_stringify_args( @_ );
276
277 # Check the options
278 if( !defined $options->{'type'} ||
131fa659 279 $options->{'type'} !~ /^(spelling|orthographic|grammatical|meaning|lexical|collated|repetition|transposition)$/i ) {
3a2ebbf4 280 my $t = $options->{'type'} ? $options->{'type'} : '';
49d4f2ac 281 return( undef, "Invalid or missing type " . $options->{'type'} );
3a2ebbf4 282 }
49d4f2ac 283 unless ( $options->{'type'} =~ /^(repetition|transposition)$/ ) {
3a2ebbf4 284 $options->{'colocated'} = 1;
285 }
286
ef9d481f 287 # Make sure there is not another relationship between these two
94c00c71 288 # readings already
3a2ebbf4 289 if( $self->relations->has_edge( $source, $target ) ) {
290 return ( undef, "Relationship already exists between these readings" );
4cdd82f1 291 }
1d310495 292 if( !$self->relationship_valid( $source, $target, $options->{'type'} ) ) {
910a0a6d 293 return ( undef, 'Relationship creates witness loop' );
ef9d481f 294 }
295
3a2ebbf4 296 my @vector = ( $source, $target );
297 $self->relations->add_edge( @vector );
298 $self->relations->set_edge_attributes( @vector, $options );
910a0a6d 299
300 # TODO Handle global relationship setting
301
3a2ebbf4 302 return( 1, @vector );
3265b0ce 303}
304
910a0a6d 305sub relationship_valid {
1d310495 306 my( $self, $source, $target, $rel ) = @_;
307 if( $rel eq 'repetition' ) {
308 return 1;
309 } elsif ( $rel eq 'transposition' ) {
310 # Check that the two readings do not appear in the same witness.
311 my %seen_wits;
312 map { $seen_wits{$_} = 1 } $self->reading_witnesses( $source );
313 foreach my $w ( $self->reading_witnesses( $target ) ) {
314 return 0 if $seen_wits{$w};
315 }
316 return 1;
317 } else {
318 # Check that linking the source and target in a relationship won't lead
319 # to a path loop for any witness. First make a lookup table of all the
320 # readings related to either the source or the target.
321 my @proposed_related = ( $source, $target );
49d4f2ac 322 push( @proposed_related, $self->related_readings( $source, 'colocated' ) );
323 push( @proposed_related, $self->related_readings( $target, 'colocated' ) );
1d310495 324 my %pr_ids;
49d4f2ac 325 map { $pr_ids{ $_ } = 1 } @proposed_related;
1d310495 326
327 # None of these proposed related readings should have a neighbor that
328 # is also in proposed_related.
329 foreach my $pr ( keys %pr_ids ) {
330 foreach my $neighbor( $self->sequence->neighbors( $pr ) ) {
331 return 0 if exists $pr_ids{$neighbor};
332 }
333 }
334 return 1;
335 }
336}
337
338# Return a list of the witnesses in which the reading appears.
339sub reading_witnesses {
340 my( $self, $reading ) = @_;
341 # We need only check either the incoming or the outgoing edges; I have
342 # arbitrarily chosen "incoming".
343 my %all_witnesses;
344 foreach my $e ( $self->sequence->edges_to( $reading ) ) {
345 my $wits = $self->sequence->get_edge_attributes( @$e );
346 @all_witnesses{ keys %$wits } = 1;
347 }
348 return keys %all_witnesses;
910a0a6d 349}
350
3a2ebbf4 351sub related_readings {
352 my( $self, $reading, $colocated ) = @_;
a753cc84 353 my $return_object;
354 if( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
355 $reading = $reading->id;
356 $return_object = 1;
7035e3a6 357# print STDERR "Returning related objects\n";
358# } else {
359# print STDERR "Returning related object names\n";
a753cc84 360 }
3a2ebbf4 361 my @related = $self->relations->all_reachable( $reading );
362 if( $colocated ) {
363 my @colo = grep { $self->relations->has_edge_attribute( $reading, $_, 'colocated' ) } @related;
a753cc84 364 @related = @colo;
365 }
366 return $return_object ? map { $self->reading( $_ ) } @related : @related;
3a2ebbf4 367}
368
8e1394aa 369=head2 Output method(s)
370
371=over
372
373=item B<as_svg>
374
375print $graph->as_svg( $recalculate );
376
0068967c 377Returns an SVG string that represents the graph, via as_dot and graphviz.
8e1394aa 378
379=cut
380
381sub as_svg {
3a2ebbf4 382 my( $self ) = @_;
383
8e1394aa 384 my @cmd = qw/dot -Tsvg/;
385 my( $svg, $err );
910a0a6d 386 my $dotfile = File::Temp->new();
d9e873d0 387 ## TODO REMOVE
eca16057 388 # $dotfile->unlink_on_destroy(0);
910a0a6d 389 binmode $dotfile, ':utf8';
390 print $dotfile $self->as_dot();
391 push( @cmd, $dotfile->filename );
392 run( \@cmd, ">", binary(), \$svg );
393 $svg = decode_utf8( $svg );
8e1394aa 394 return $svg;
395}
396
df6d9812 397=item B<as_dot>
398
399print $graph->as_dot( $view, $recalculate );
400
401Returns a string that is the collation graph expressed in dot
402(i.e. GraphViz) format. The 'view' argument determines what kind of
403graph is produced.
404 * 'path': a graph of witness paths through the collation (DEFAULT)
405 * 'relationship': a graph of how collation readings relate to
406 each other
407
408=cut
409
410sub as_dot {
411 my( $self, $view ) = @_;
3a2ebbf4 412 $view = 'sequence' unless $view;
df6d9812 413 # TODO consider making some of these things configurable
67da8d6c 414 my $graph_name = $self->tradition->name;
415 $graph_name =~ s/[^\w\s]//g;
416 $graph_name = join( '_', split( /\s+/, $graph_name ) );
417 my $dot = sprintf( "digraph %s {\n", $graph_name );
df6d9812 418 $dot .= "\tedge [ arrowhead=open ];\n";
419 $dot .= "\tgraph [ rankdir=LR ];\n";
420 $dot .= sprintf( "\tnode [ fontsize=%d, fillcolor=%s, style=%s, shape=%s ];\n",
3a2ebbf4 421 11, "white", "filled", "ellipse" );
df6d9812 422
423 foreach my $reading ( $self->readings ) {
910a0a6d 424 # Need not output nodes without separate labels
3a2ebbf4 425 next if $reading->id eq $reading->text;
8f9cab7b 426 my $label = $reading->text;
427 $label =~ s/\"/\\\"/g;
428 $dot .= sprintf( "\t\"%s\" [ label=\"%s\" ];\n", $reading->id, $label );
df6d9812 429 }
3a2ebbf4 430
431 # TODO do something sensible for relationships
df6d9812 432
3a2ebbf4 433 my @edges = $self->paths;
df6d9812 434 foreach my $edge ( @edges ) {
910a0a6d 435 my %variables = ( 'color' => '#000000',
436 'fontcolor' => '#000000',
8f9cab7b 437 'label' => join( ', ', $self->path_display_label( $edge ) ),
910a0a6d 438 );
439 my $varopts = join( ', ', map { $_.'="'.$variables{$_}.'"' } sort keys %variables );
81a4c6a0 440 # Account for the rank gap if necessary
441 my $rankgap = $self->reading( $edge->[1] )->rank
442 - $self->reading( $edge->[0] )->rank;
443 $varopts .= ", minlen=$rankgap" if $rankgap > 1;
910a0a6d 444 $dot .= sprintf( "\t\"%s\" -> \"%s\" [ %s ];\n",
3a2ebbf4 445 $edge->[0], $edge->[1], $varopts );
df6d9812 446 }
df6d9812 447 $dot .= "}\n";
448 return $dot;
449}
450
3a2ebbf4 451sub path_witnesses {
452 my( $self, @edge ) = @_;
453 # If edge is an arrayref, cope.
454 if( @edge == 1 && ref( $edge[0] ) eq 'ARRAY' ) {
455 my $e = shift @edge;
456 @edge = @$e;
457 }
458 my @wits = keys %{$self->sequence->get_edge_attributes( @edge )};
459 return sort @wits;
460}
461
8f9cab7b 462sub path_display_label {
463 my( $self, $edge ) = @_;
464 my @wits = $self->path_witnesses( $edge );
465 my $maj = scalar( $self->tradition->witnesses ) * 0.6;
466 if( scalar @wits > $maj ) {
467 return 'majority';
468 } else {
469 return join( ', ', @wits );
470 }
471}
472
473
8e1394aa 474=item B<as_graphml>
475
476print $graph->as_graphml( $recalculate )
477
478Returns a GraphML representation of the collation graph, with
479transposition information and position information. Unless
480$recalculate is passed (and is a true value), the method will return a
481cached copy of the SVG after the first call to the method.
482
483=cut
484
485sub as_graphml {
3a2ebbf4 486 my( $self ) = @_;
8e1394aa 487
488 # Some namespaces
489 my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns';
490 my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance';
491 my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' .
910a0a6d 492 'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
8e1394aa 493
494 # Create the document and root node
495 my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
496 my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
497 $graphml->setDocumentElement( $root );
498 $root->setNamespace( $xsi_ns, 'xsi', 0 );
499 $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
500
e309421a 501 # Add the data keys for the graph
502 my %graph_data_keys;
503 my $gdi = 0;
1d310495 504 my @graph_attributes = qw/ version wit_list_separator baselabel linear ac_label /;
e309421a 505 foreach my $datum ( @graph_attributes ) {
506 $graph_data_keys{$datum} = 'dg'.$gdi++;
507 my $key = $root->addNewChild( $graphml_ns, 'key' );
508 $key->setAttribute( 'attr.name', $datum );
509 $key->setAttribute( 'attr.type', $key eq 'linear' ? 'boolean' : 'string' );
510 $key->setAttribute( 'for', 'graph' );
511 $key->setAttribute( 'id', $graph_data_keys{$datum} );
512 }
f6066bac 513
8e1394aa 514 # Add the data keys for nodes
ef9d481f 515 my %node_data_keys;
516 my $ndi = 0;
3a2ebbf4 517 my %node_data = (
518 id => 'string',
255875b8 519 text => 'string',
3a2ebbf4 520 rank => 'string',
521 is_start => 'boolean',
522 is_end => 'boolean',
523 is_lacuna => 'boolean',
524 );
525 foreach my $datum ( keys %node_data ) {
910a0a6d 526 $node_data_keys{$datum} = 'dn'.$ndi++;
527 my $key = $root->addNewChild( $graphml_ns, 'key' );
528 $key->setAttribute( 'attr.name', $datum );
3a2ebbf4 529 $key->setAttribute( 'attr.type', $node_data{$datum} );
910a0a6d 530 $key->setAttribute( 'for', 'node' );
531 $key->setAttribute( 'id', $node_data_keys{$datum} );
8e1394aa 532 }
533
df6d9812 534 # Add the data keys for edges, i.e. witnesses
ef9d481f 535 my $edi = 0;
536 my %edge_data_keys;
3a2ebbf4 537 my %edge_data = (
3a2ebbf4 538 witness => 'string', # ID/label for a path
539 relationship => 'string', # ID/label for a relationship
540 extra => 'boolean', # Path key
541 colocated => 'boolean', # Relationship key
542 non_correctable => 'boolean', # Relationship key
543 non_independent => 'boolean', # Relationship key
544 );
545 foreach my $datum ( keys %edge_data ) {
546 $edge_data_keys{$datum} = 'de'.$edi++;
910a0a6d 547 my $key = $root->addNewChild( $graphml_ns, 'key' );
3a2ebbf4 548 $key->setAttribute( 'attr.name', $datum );
549 $key->setAttribute( 'attr.type', $edge_data{$datum} );
910a0a6d 550 $key->setAttribute( 'for', 'edge' );
3a2ebbf4 551 $key->setAttribute( 'id', $edge_data_keys{$datum} );
8e1394aa 552 }
3a2ebbf4 553
2c669bca 554 # Add the collation graphs themselves
555 my $sgraph = $root->addNewChild( $graphml_ns, 'graph' );
556 $sgraph->setAttribute( 'edgedefault', 'directed' );
557 $sgraph->setAttribute( 'id', $self->tradition->name );
558 $sgraph->setAttribute( 'parse.edgeids', 'canonical' );
559 $sgraph->setAttribute( 'parse.edges', scalar($self->paths) );
560 $sgraph->setAttribute( 'parse.nodeids', 'canonical' );
561 $sgraph->setAttribute( 'parse.nodes', scalar($self->readings) );
562 $sgraph->setAttribute( 'parse.order', 'nodesfirst' );
563
564 my $rgraph = $root->addNewChild( $graphml_ns, 'graph' );
565 $rgraph->setAttribute( 'edgedefault', 'undirected' );
566 $rgraph->setAttribute( 'id', 'relationships' );
567 $rgraph->setAttribute( 'parse.edgeids', 'canonical' );
568 $rgraph->setAttribute( 'parse.edges', scalar($self->relationships) );
569 $rgraph->setAttribute( 'parse.nodeids', 'canonical' );
570 $rgraph->setAttribute( 'parse.nodes', scalar($self->readings) );
571 $rgraph->setAttribute( 'parse.order', 'nodesfirst' );
e309421a 572
573 # Collation attribute data
574 foreach my $datum ( @graph_attributes ) {
2c669bca 575 my $value = $datum eq 'version' ? '3.0' : $self->$datum;
576 _add_graphml_data( $sgraph, $graph_data_keys{$datum}, $value );
e309421a 577 }
8e1394aa 578
579 my $node_ctr = 0;
580 my %node_hash;
2c669bca 581 # Add our readings to the graphs
3a2ebbf4 582 foreach my $n ( sort { $a->id cmp $b->id } $self->readings ) {
2c669bca 583 # Add to the main graph
584 my $node_el = $sgraph->addNewChild( $graphml_ns, 'node' );
910a0a6d 585 my $node_xmlid = 'n' . $node_ctr++;
3a2ebbf4 586 $node_hash{ $n->id } = $node_xmlid;
910a0a6d 587 $node_el->setAttribute( 'id', $node_xmlid );
255875b8 588 foreach my $d ( keys %node_data ) {
589 my $nval = $n->$d;
590 _add_graphml_data( $node_el, $node_data_keys{$d}, $nval )
591 if defined $nval;
592 }
2c669bca 593 # Add to the relationships graph
594 my $rnode_el = $rgraph->addNewChild( $graphml_ns, 'node' );
595 $rnode_el->setAttribute( 'id', $node_xmlid );
b15511bf 596 }
597
2c669bca 598 # Add the path edges to the sequence graph
df6d9812 599 my $edge_ctr = 0;
3a2ebbf4 600 foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->sequence->edges() ) {
601 # We add an edge in the graphml for every witness in $e.
602 foreach my $wit ( $self->path_witnesses( $e ) ) {
603 my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
604 $node_hash{ $e->[0] },
605 $node_hash{ $e->[1] } );
2c669bca 606 my $edge_el = $sgraph->addNewChild( $graphml_ns, 'edge' );
3a2ebbf4 607 $edge_el->setAttribute( 'source', $from );
608 $edge_el->setAttribute( 'target', $to );
609 $edge_el->setAttribute( 'id', $id );
3a2ebbf4 610
611 # It's a witness path, so add the witness
612 my $base = $wit;
613 my $key = $edge_data_keys{'witness'};
614 # Is this an ante-corr witness?
615 my $aclabel = $self->ac_label;
616 if( $wit =~ /^(.*)\Q$aclabel\E$/ ) {
617 # Keep the base witness
618 $base = $1;
619 # ...and record that this is an 'extra' reading path
620 _add_graphml_data( $edge_el, $edge_data_keys{'extra'}, $aclabel );
621 }
622 _add_graphml_data( $edge_el, $edge_data_keys{'witness'}, $base );
623 }
624 }
625
2c669bca 626 # Add the relationship edges to the relationships graph
3a2ebbf4 627 foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->relationships ) {
628 my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
629 $node_hash{ $e->[0] },
630 $node_hash{ $e->[1] } );
2c669bca 631 my $edge_el = $rgraph->addNewChild( $graphml_ns, 'edge' );
3a2ebbf4 632 $edge_el->setAttribute( 'source', $from );
633 $edge_el->setAttribute( 'target', $to );
634 $edge_el->setAttribute( 'id', $id );
3a2ebbf4 635
636 my $data = $self->relations->get_edge_attributes( @$e );
637 # It's a relationship, so save the relationship data
638 _add_graphml_data( $edge_el, $edge_data_keys{'relationship'}, $data->{type} );
639 _add_graphml_data( $edge_el, $edge_data_keys{'colocated'}, $data->{colocated} );
640 if( exists $data->{non_correctable} ) {
641 _add_graphml_data( $edge_el, $edge_data_keys{'non_correctable'},
642 $data->{non_correctable} );
643 }
644 if( exists $data->{non_independent} ) {
645 _add_graphml_data( $edge_el, $edge_data_keys{'non_independent'},
646 $data->{non_independent} );
647 }
8e1394aa 648 }
649
94c00c71 650 # Save and return the thing
651 my $result = decode_utf8( $graphml->toString(1) );
94c00c71 652 return $result;
df6d9812 653}
654
b15511bf 655sub _add_graphml_data {
656 my( $el, $key, $value ) = @_;
b15511bf 657 return unless defined $value;
c9bf3dbf 658 my $data_el = $el->addNewChild( $el->namespaceURI, 'data' );
b15511bf 659 $data_el->setAttribute( 'key', $key );
660 $data_el->appendText( $value );
8e1394aa 661}
662
910a0a6d 663=item B<as_csv>
664
665print $graph->as_csv( $recalculate )
666
667Returns a CSV alignment table representation of the collation graph, one
2c669bca 668row per witness (or witness uncorrected.)
910a0a6d 669
670=cut
671
672sub as_csv {
3a2ebbf4 673 my( $self ) = @_;
910a0a6d 674 my $table = $self->make_alignment_table;
675 my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } );
676 my @result;
2c669bca 677 # Make the header row
678 $csv->combine( map { $_->{'witness'} } @{$table->{'alignment'}} );
679 push( @result, decode_utf8( $csv->string ) );
680 # Make the rest of the rows
681 foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
566f4595 682 my @rowobjs = map { $_->{'tokens'}->[$idx] } @{$table->{'alignment'}};
2c669bca 683 my @row = map { $_ ? $_->{'t'} : $_ } @rowobjs;
684 $csv->combine( @row );
910a0a6d 685 push( @result, decode_utf8( $csv->string ) );
686 }
3a2ebbf4 687 return join( "\n", @result );
910a0a6d 688}
689
2c669bca 690=item B<make_alignment_table>
691
692my $table = $graph->make_alignment_table( $use_refs, \@wits_to_include )
693
566f4595 694Return a reference to an alignment table, in a slightly enhanced CollateX
695format which looks like this:
696
697 $table = { alignment => [ { witness => "SIGIL",
698 tokens => [ { t => "READINGTEXT" }, ... ] },
699 { witness => "SIG2",
700 tokens => [ { t => "READINGTEXT" }, ... ] },
701 ... ],
702 length => TEXTLEN };
703
704If $use_refs is set to 1, the reading object is returned in the table
705instead of READINGTEXT; if not, the text of the reading is returned.
706If $wits_to_include is set to a hashref, only the witnesses whose sigil
707keys have a true hash value will be included.
2c669bca 708
709=cut
9f3ba6f7 710
910a0a6d 711sub make_alignment_table {
08e0fb85 712 my( $self, $noderefs, $include ) = @_;
910a0a6d 713 unless( $self->linear ) {
714 warn "Need a linear graph in order to make an alignment table";
715 return;
716 }
2c669bca 717 my $table = { 'alignment' => [], 'length' => $self->end->rank - 1 };
3a2ebbf4 718 my @all_pos = ( 1 .. $self->end->rank - 1 );
910a0a6d 719 foreach my $wit ( $self->tradition->witnesses ) {
2c669bca 720 if( $include ) {
566f4595 721 next unless $include->{$wit->sigil};
2c669bca 722 }
eca16057 723 # print STDERR "Making witness row(s) for " . $wit->sigil . "\n";
1f7aa795 724 my @wit_path = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
725 my @row = _make_witness_row( \@wit_path, \@all_pos, $noderefs );
2c669bca 726 push( @{$table->{'alignment'}},
727 { 'witness' => $wit->sigil, 'tokens' => \@row } );
1f7aa795 728 if( $wit->is_layered ) {
729 my @wit_ac_path = $self->reading_sequence( $self->start, $self->end,
730 $wit->sigil.$self->ac_label, $wit->sigil );
731 my @ac_row = _make_witness_row( \@wit_ac_path, \@all_pos, $noderefs );
2c669bca 732 push( @{$table->{'alignment'}},
733 { 'witness' => $wit->sigil.$self->ac_label, 'tokens' => \@ac_row } );
910a0a6d 734 }
735 }
2c669bca 736 return $table;
910a0a6d 737}
738
739sub _make_witness_row {
0e476982 740 my( $path, $positions, $noderefs ) = @_;
910a0a6d 741 my %char_hash;
742 map { $char_hash{$_} = undef } @$positions;
2c669bca 743 my $debug = 0;
910a0a6d 744 foreach my $rdg ( @$path ) {
eca16057 745 my $rtext = $rdg->text;
746 $rtext = '#LACUNA#' if $rdg->is_lacuna;
2c669bca 747 print STDERR "rank " . $rdg->rank . "\n" if $debug;
3a2ebbf4 748 # print STDERR "No rank for " . $rdg->id . "\n" unless defined $rdg->rank;
2c669bca 749 $char_hash{$rdg->rank} = $noderefs ? { 't' => $rdg }
750 : { 't' => $rtext };
910a0a6d 751 }
752 my @row = map { $char_hash{$_} } @$positions;
eca16057 753 # Fill in lacuna markers for undef spots in the row
754 my $last_el = shift @row;
755 my @filled_row = ( $last_el );
756 foreach my $el ( @row ) {
0e476982 757 # If we are using node reference, make the lacuna node appear many times
758 # in the table. If not, use the lacuna tag.
759 if( $last_el && _el_is_lacuna( $last_el ) && !defined $el ) {
566f4595 760 $el = $noderefs ? $last_el : { 't' => '#LACUNA#' };
eca16057 761 }
762 push( @filled_row, $el );
763 $last_el = $el;
764 }
765 return @filled_row;
910a0a6d 766}
767
0e476982 768# Tiny utility function to say if a table element is a lacuna
769sub _el_is_lacuna {
770 my $el = shift;
2c669bca 771 return 1 if $el->{'t'} eq '#LACUNA#';
772 return 1 if ref( $el->{'t'} ) eq 'Text::Tradition::Collation::Reading'
773 && $el->{'t'}->is_lacuna;
0e476982 774 return 0;
775}
776
910a0a6d 777# Helper to turn the witnesses along columns rather than rows. Assumes
778# equal-sized rows.
779sub _turn_table {
780 my( $table ) = @_;
781 my $result = [];
782 return $result unless scalar @$table;
783 my $nrows = scalar @{$table->[0]};
784 foreach my $idx ( 0 .. $nrows - 1 ) {
785 foreach my $wit ( 0 .. $#{$table} ) {
786 $result->[$idx]->[$wit] = $table->[$wit]->[$idx];
787 }
788 }
789 return $result;
790}
791
8e1394aa 792=back
793
de51424a 794=head2 Navigation methods
795
796=over
797
8e1394aa 798=item B<start>
799
800my $beginning = $collation->start();
801
802Returns the beginning of the collation, a meta-reading with label '#START#'.
803
910a0a6d 804=item B<end>
805
806my $end = $collation->end();
807
808Returns the end of the collation, a meta-reading with label '#END#'.
809
910a0a6d 810
e2902068 811=item B<reading_sequence>
812
813my @readings = $graph->reading_sequence( $first, $last, $path[, $alt_path] );
814
815Returns the ordered list of readings, starting with $first and ending
816with $last, along the given witness path. If no path is specified,
817assume that the path is that of the base text (if any.)
818
819=cut
820
910a0a6d 821# TODO Think about returning some lazy-eval iterator.
822
e2902068 823sub reading_sequence {
824 my( $self, $start, $end, $witness, $backup ) = @_;
825
930ff666 826 $witness = $self->baselabel unless $witness;
e2902068 827 my @readings = ( $start );
828 my %seen;
829 my $n = $start;
3a2ebbf4 830 while( $n && $n->id ne $end->id ) {
831 if( exists( $seen{$n->id} ) ) {
832 warn "Detected loop at " . $n->id;
910a0a6d 833 last;
834 }
3a2ebbf4 835 $seen{$n->id} = 1;
910a0a6d 836
837 my $next = $self->next_reading( $n, $witness, $backup );
44771cf2 838 unless( $next ) {
3a2ebbf4 839 warn "Did not find any path for $witness from reading " . $n->id;
44771cf2 840 last;
841 }
910a0a6d 842 push( @readings, $next );
843 $n = $next;
e2902068 844 }
845 # Check that the last reading is our end reading.
846 my $last = $readings[$#readings];
3a2ebbf4 847 warn "Last reading found from " . $start->text .
910a0a6d 848 " for witness $witness is not the end!"
3a2ebbf4 849 unless $last->id eq $end->id;
e2902068 850
851 return @readings;
852}
853
4a8828f0 854=item B<next_reading>
8e1394aa 855
4a8828f0 856my $next_reading = $graph->next_reading( $reading, $witpath );
8e1394aa 857
4a8828f0 858Returns the reading that follows the given reading along the given witness
930ff666 859path.
8e1394aa 860
861=cut
862
4a8828f0 863sub next_reading {
e2902068 864 # Return the successor via the corresponding path.
8e1394aa 865 my $self = shift;
3a2ebbf4 866 my $answer = $self->_find_linked_reading( 'next', @_ );
2c669bca 867 return undef unless $answer;
3a2ebbf4 868 return $self->reading( $answer );
8e1394aa 869}
870
4a8828f0 871=item B<prior_reading>
8e1394aa 872
4a8828f0 873my $prior_reading = $graph->prior_reading( $reading, $witpath );
8e1394aa 874
4a8828f0 875Returns the reading that precedes the given reading along the given witness
930ff666 876path.
8e1394aa 877
878=cut
879
4a8828f0 880sub prior_reading {
e2902068 881 # Return the predecessor via the corresponding path.
8e1394aa 882 my $self = shift;
3a2ebbf4 883 my $answer = $self->_find_linked_reading( 'prior', @_ );
884 return $self->reading( $answer );
8e1394aa 885}
886
4a8828f0 887sub _find_linked_reading {
e2902068 888 my( $self, $direction, $node, $path, $alt_path ) = @_;
889 my @linked_paths = $direction eq 'next'
3a2ebbf4 890 ? $self->sequence->edges_from( $node )
891 : $self->sequence->edges_to( $node );
e2902068 892 return undef unless scalar( @linked_paths );
8e1394aa 893
e2902068 894 # We have to find the linked path that contains all of the
895 # witnesses supplied in $path.
896 my( @path_wits, @alt_path_wits );
3a2ebbf4 897 @path_wits = sort( $self->witnesses_of_label( $path ) ) if $path;
898 @alt_path_wits = sort( $self->witnesses_of_label( $alt_path ) ) if $alt_path;
e2902068 899 my $base_le;
900 my $alt_le;
901 foreach my $le ( @linked_paths ) {
3a2ebbf4 902 if( $self->sequence->has_edge_attribute( @$le, $self->baselabel ) ) {
910a0a6d 903 $base_le = $le;
910a0a6d 904 }
3a2ebbf4 905 my @le_wits = $self->path_witnesses( $le );
906 if( _is_within( \@path_wits, \@le_wits ) ) {
907 # This is the right path.
908 return $direction eq 'next' ? $le->[1] : $le->[0];
909 } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) {
910 $alt_le = $le;
911 }
8e1394aa 912 }
e2902068 913 # Got this far? Return the alternate path if it exists.
3a2ebbf4 914 return $direction eq 'next' ? $alt_le->[1] : $alt_le->[0]
910a0a6d 915 if $alt_le;
e2902068 916
917 # Got this far? Return the base path if it exists.
3a2ebbf4 918 return $direction eq 'next' ? $base_le->[1] : $base_le->[0]
910a0a6d 919 if $base_le;
e2902068 920
921 # Got this far? We have no appropriate path.
2c669bca 922 warn "Could not find $direction node from " . $node->id
910a0a6d 923 . " along path $path";
8e1394aa 924 return undef;
925}
926
4a8828f0 927# Some set logic.
928sub _is_within {
929 my( $set1, $set2 ) = @_;
7854e12e 930 my $ret = @$set1; # will be 0, i.e. false, if set1 is empty
4a8828f0 931 foreach my $el ( @$set1 ) {
910a0a6d 932 $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
4a8828f0 933 }
934 return $ret;
935}
936
de51424a 937
938## INITIALIZATION METHODS - for use by parsers
930ff666 939
7e450e44 940# For use when a collation is constructed from a base text and an apparatus.
941# We have the sequences of readings and just need to add path edges.
1f7aa795 942# When we are done, clear out the witness path attributes, as they are no
943# longer needed.
944# TODO Find a way to replace the witness path attributes with encapsulated functions?
e2902068 945
6a222840 946sub make_witness_paths {
947 my( $self ) = @_;
910a0a6d 948 foreach my $wit ( $self->tradition->witnesses ) {
0068967c 949 # print STDERR "Making path for " . $wit->sigil . "\n";
910a0a6d 950 $self->make_witness_path( $wit );
7854e12e 951 }
7854e12e 952}
953
6a222840 954sub make_witness_path {
7854e12e 955 my( $self, $wit ) = @_;
956 my @chain = @{$wit->path};
15d2d3df 957 my $sig = $wit->sigil;
7854e12e 958 foreach my $idx ( 0 .. $#chain-1 ) {
910a0a6d 959 $self->add_path( $chain[$idx], $chain[$idx+1], $sig );
7854e12e 960 }
1f7aa795 961 if( $wit->is_layered ) {
d9e873d0 962 @chain = @{$wit->uncorrected_path};
963 foreach my $idx( 0 .. $#chain-1 ) {
964 my $source = $chain[$idx];
965 my $target = $chain[$idx+1];
966 $self->add_path( $source, $target, $sig.$self->ac_label )
967 unless $self->has_path( $source, $target, $sig );
968 }
15d2d3df 969 }
1f7aa795 970 $wit->clear_path;
971 $wit->clear_uncorrected_path;
e2902068 972}
973
910a0a6d 974sub calculate_ranks {
975 my $self = shift;
976 # Walk a version of the graph where every node linked by a relationship
977 # edge is fundamentally the same node, and do a topological ranking on
978 # the nodes in this graph.
c9bf3dbf 979 my $topo_graph = Graph->new();
910a0a6d 980 my %rel_containers;
981 my $rel_ctr = 0;
982 # Add the nodes
983 foreach my $r ( $self->readings ) {
3a2ebbf4 984 next if exists $rel_containers{$r->id};
910a0a6d 985 my @rels = $r->related_readings( 'colocated' );
986 if( @rels ) {
987 # Make a relationship container.
988 push( @rels, $r );
c9bf3dbf 989 my $rn = 'rel_container_' . $rel_ctr++;
990 $topo_graph->add_vertex( $rn );
910a0a6d 991 foreach( @rels ) {
3a2ebbf4 992 $rel_containers{$_->id} = $rn;
910a0a6d 993 }
994 } else {
995 # Add a new node to mirror the old node.
3a2ebbf4 996 $rel_containers{$r->id} = $r->id;
997 $topo_graph->add_vertex( $r->id );
910a0a6d 998 }
4a8828f0 999 }
3a1f2523 1000
3a2ebbf4 1001 # Add the edges.
910a0a6d 1002 foreach my $r ( $self->readings ) {
3a2ebbf4 1003 foreach my $n ( $self->sequence->successors( $r->id ) ) {
1004 my( $tfrom, $tto ) = ( $rel_containers{$r->id},
1005 $rel_containers{$n} );
49d4f2ac 1006 $DB::single = 1 unless $tfrom && $tto;
3a2ebbf4 1007 $topo_graph->add_edge( $tfrom, $tto );
910a0a6d 1008 }
1009 }
1010
1011 # Now do the rankings, starting with the start node.
3a2ebbf4 1012 my $topo_start = $rel_containers{$self->start->id};
c9bf3dbf 1013 my $node_ranks = { $topo_start => 0 };
910a0a6d 1014 my @curr_origin = ( $topo_start );
1015 # A little iterative function.
1016 while( @curr_origin ) {
c9bf3dbf 1017 @curr_origin = _assign_rank( $topo_graph, $node_ranks, @curr_origin );
910a0a6d 1018 }
1019 # Transfer our rankings from the topological graph to the real one.
1020 foreach my $r ( $self->readings ) {
3a2ebbf4 1021 if( defined $node_ranks->{$rel_containers{$r->id}} ) {
1022 $r->rank( $node_ranks->{$rel_containers{$r->id}} );
67da8d6c 1023 } else {
1024 $DB::single = 1;
3a2ebbf4 1025 die "No rank calculated for node " . $r->id
67da8d6c 1026 . " - do you have a cycle in the graph?";
1027 }
de51424a 1028 }
8e1394aa 1029}
3a1f2523 1030
910a0a6d 1031sub _assign_rank {
c9bf3dbf 1032 my( $graph, $node_ranks, @current_nodes ) = @_;
910a0a6d 1033 # Look at each of the children of @current_nodes. If all the child's
1034 # parents have a rank, assign it the highest rank + 1 and add it to
c9bf3dbf 1035 # @next_nodes. Otherwise skip it; we will return when the highest-ranked
1036 # parent gets a rank.
910a0a6d 1037 my @next_nodes;
1038 foreach my $c ( @current_nodes ) {
c9bf3dbf 1039 warn "Current reading $c has no rank!"
1040 unless exists $node_ranks->{$c};
1041 # print STDERR "Looking at child of node $c, rank "
1042 # . $node_ranks->{$c} . "\n";
1043 foreach my $child ( $graph->successors( $c ) ) {
1044 next if exists $node_ranks->{$child};
910a0a6d 1045 my $highest_rank = -1;
1046 my $skip = 0;
c9bf3dbf 1047 foreach my $parent ( $graph->predecessors( $child ) ) {
1048 if( exists $node_ranks->{$parent} ) {
1049 $highest_rank = $node_ranks->{$parent}
1050 if $highest_rank <= $node_ranks->{$parent};
910a0a6d 1051 } else {
1052 $skip = 1;
1053 last;
1054 }
1055 }
1056 next if $skip;
c9bf3dbf 1057 my $c_rank = $highest_rank + 1;
1058 # print STDERR "Assigning rank $c_rank to node $child \n";
1059 $node_ranks->{$child} = $c_rank;
910a0a6d 1060 push( @next_nodes, $child );
1061 }
1062 }
1063 return @next_nodes;
4cdd82f1 1064}
910a0a6d 1065
0e476982 1066# Another method to make up for rough collation methods. If the same reading
1067# appears multiple times at the same rank, collapse the nodes.
1068sub flatten_ranks {
1069 my $self = shift;
1070 my %unique_rank_rdg;
1071 foreach my $rdg ( $self->readings ) {
1072 next unless $rdg->has_rank;
1073 my $key = $rdg->rank . "||" . $rdg->text;
1074 if( exists $unique_rank_rdg{$key} ) {
1075 # Combine!
0068967c 1076 # print STDERR "Combining readings at same rank: $key\n";
0e476982 1077 $self->merge_readings( $unique_rank_rdg{$key}, $rdg );
1078 } else {
1079 $unique_rank_rdg{$key} = $rdg;
1080 }
1081 }
1082}
1083
1084
fa954f4c 1085## Utility functions
de51424a 1086
4a8828f0 1087# Return the string that joins together a list of witnesses for
1088# display on a single path.
4a8828f0 1089sub witnesses_of_label {
de51424a 1090 my( $self, $label ) = @_;
4a8828f0 1091 my $regex = $self->wit_list_separator;
de51424a 1092 my @answer = split( /\Q$regex\E/, $label );
1093 return @answer;
4a8828f0 1094}
8e1394aa 1095
dd3b58b0 1096no Moose;
1097__PACKAGE__->meta->make_immutable;
e867486f 1098
1099=head1 BUGS / TODO
1100
1101=over
1102
0068967c 1103=item * Think about making Relationship objects again
e867486f 1104
1105=back