make Analysis work with the new alignment table
[scpubgit/stemmatology.git] / lib / Text / Tradition / Collation.pm
CommitLineData
dd3b58b0 1package Text::Tradition::Collation;
d047cd52 2
910a0a6d 3use Encode qw( decode_utf8 );
4use File::Temp;
c9bf3dbf 5use Graph;
8e1394aa 6use IPC::Run qw( run binary );
910a0a6d 7use Text::CSV_XS;
b15511bf 8use Text::Tradition::Collation::Reading;
df6d9812 9use XML::LibXML;
dd3b58b0 10use Moose;
11
3a2ebbf4 12has 'sequence' => (
d047cd52 13 is => 'ro',
3a2ebbf4 14 isa => 'Graph',
15 default => sub { Graph->new() },
d047cd52 16 handles => {
3a2ebbf4 17 paths => 'edges',
d047cd52 18 },
d047cd52 19 );
3a2ebbf4 20
21has 'relations' => (
22 is => 'ro',
23 isa => 'Graph',
24 default => sub { Graph->new( undirected => 1 ) },
25 handles => {
26 relationships => 'edges',
27 },
28 );
dd3b58b0 29
3a2ebbf4 30has 'tradition' => (
31 is => 'ro',
d047cd52 32 isa => 'Text::Tradition',
8d9a1cd8 33 weak_ref => 1,
d047cd52 34 );
dd3b58b0 35
3a2ebbf4 36has 'readings' => (
37 isa => 'HashRef[Text::Tradition::Collation::Reading]',
38 traits => ['Hash'],
39 handles => {
40 reading => 'get',
41 _add_reading => 'set',
42 del_reading => 'delete',
43 has_reading => 'exists',
44 readings => 'values',
45 },
46 default => sub { {} },
47 );
910a0a6d 48
4a8828f0 49has 'wit_list_separator' => (
7854e12e 50 is => 'rw',
51 isa => 'Str',
52 default => ', ',
53 );
54
55has 'baselabel' => (
56 is => 'rw',
57 isa => 'Str',
58 default => 'base text',
59 );
4a8828f0 60
15d2d3df 61has 'linear' => (
62 is => 'rw',
63 isa => 'Bool',
64 default => 1,
65 );
1f563ac3 66
ef9d481f 67has 'ac_label' => (
68 is => 'rw',
69 isa => 'Str',
70 default => ' (a.c.)',
71 );
3a2ebbf4 72
73has 'start' => (
74 is => 'ro',
75 isa => 'Text::Tradition::Collation::Reading',
76 writer => '_set_start',
77 weak_ref => 1,
78 );
79
80has 'end' => (
81 is => 'ro',
82 isa => 'Text::Tradition::Collation::Reading',
83 writer => '_set_end',
84 weak_ref => 1,
85 );
1f563ac3 86
dd3b58b0 87# The collation can be created two ways:
88# 1. Collate a set of witnesses (with CollateX I guess) and process
89# the results as in 2.
90# 2. Read a pre-prepared collation in one of a variety of formats,
91# and make the graph from that.
92
93# The graph itself will (for now) be immutable, and the positions
94# within the graph will also be immutable. We need to calculate those
95# positions upon graph construction. The equivalences between graph
96# nodes will be mutable, entirely determined by the user (or possibly
97# by some semantic pre-processing provided by the user.) So the
98# constructor should just make an empty equivalences object. The
99# constructor will also need to make the witness objects, if we didn't
100# come through option 1.
101
d047cd52 102sub BUILD {
3a2ebbf4 103 my $self = shift;
104 $self->_set_start( $self->add_reading( { 'collation' => $self, 'is_start' => 1 } ) );
105 $self->_set_end( $self->add_reading( { 'collation' => $self, 'is_end' => 1 } ) );
d047cd52 106}
784877d9 107
3a2ebbf4 108### Reading construct/destruct functions
109
110sub add_reading {
111 my( $self, $reading ) = @_;
112 unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
113 my %args = %$reading;
114 $reading = Text::Tradition::Collation::Reading->new(
115 'collation' => $self,
116 %args );
117 }
118 # First check to see if a reading with this ID exists.
119 if( $self->reading( $reading->id ) ) {
120 warn "Collation already has a reading with id " . $reading->id;
121 return undef;
122 }
123 $self->_add_reading( $reading->id => $reading );
124 # Once the reading has been added, put it in both graphs.
125 $self->sequence->add_vertex( $reading->id );
126 $self->relations->add_vertex( $reading->id );
127 return $reading;
eca16057 128};
129
3a2ebbf4 130around del_reading => sub {
131 my $orig = shift;
132 my $self = shift;
133 my $arg = shift;
134
135 if( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
136 $arg = $arg->id;
137 }
3a2ebbf4 138 # Remove the reading from the graphs.
139 $self->sequence->delete_vertex( $arg );
140 $self->relations->delete_vertex( $arg );
141
142 # Carry on.
143 $self->$orig( $arg );
144};
7854e12e 145
3a2ebbf4 146# merge_readings( $main, $to_be_deleted );
7854e12e 147
3a2ebbf4 148sub merge_readings {
149 my $self = shift;
150
151 # We only need the IDs for adding paths to the graph, not the reading
152 # objects themselves.
49d4f2ac 153 my( $kept, $deleted, $combine_char ) = $self->_stringify_args( @_ );
3a2ebbf4 154
155 # The kept reading should inherit the paths and the relationships
156 # of the deleted reading.
157 foreach my $path ( $self->sequence->edges_at( $deleted ) ) {
158 my @vector = ( $kept );
159 push( @vector, $path->[1] ) if $path->[0] eq $deleted;
160 unshift( @vector, $path->[0] ) if $path->[1] eq $deleted;
49d4f2ac 161 next if $vector[0] eq $vector[1]; # Don't add a self loop
3a2ebbf4 162 my %wits = %{$self->sequence->get_edge_attributes( @$path )};
163 $self->sequence->add_edge( @vector );
164 my $fwits = $self->sequence->get_edge_attributes( @vector );
165 @wits{keys %$fwits} = values %$fwits;
166 $self->sequence->set_edge_attributes( @vector, \%wits );
167 }
168 foreach my $rel ( $self->relations->edges_at( $deleted ) ) {
169 my @vector = ( $kept );
170 push( @vector, $rel->[0] eq $deleted ? $rel->[1] : $rel->[0] );
49d4f2ac 171 next if $vector[0] eq $vector[1]; # Don't add a self loop
3a2ebbf4 172 # Is there a relationship here already? If so, keep it.
173 # TODO Warn about conflicting relationships
174 next if $self->relations->has_edge( @vector );
175 # If not, adopt the relationship that would be deleted.
176 $self->relations->add_edge( @vector );
177 my $attr = $self->relations->get_edge_attributes( @$rel );
178 $self->relations->set_edge_attributes( @vector, $attr );
179 }
180
181 # Do the deletion deed.
49d4f2ac 182 if( $combine_char ) {
183 my $kept_obj = $self->reading( $kept );
184 my $new_text = join( $combine_char, $kept_obj->text,
185 $self->reading( $deleted )->text );
186 $kept_obj->alter_text( $new_text );
187 }
3a2ebbf4 188 $self->del_reading( $deleted );
189}
7854e12e 190
3265b0ce 191
3a2ebbf4 192# Helper function for manipulating the graph.
193sub _stringify_args {
194 my( $self, $first, $second, $arg ) = @_;
195 $first = $first->id
196 if ref( $first ) eq 'Text::Tradition::Collation::Reading';
197 $second = $second->id
198 if ref( $second ) eq 'Text::Tradition::Collation::Reading';
199 return( $first, $second, $arg );
200}
df6d9812 201
3a2ebbf4 202### Path logic
203
204sub add_path {
205 my $self = shift;
206
207 # We only need the IDs for adding paths to the graph, not the reading
208 # objects themselves.
209 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
210
211 # Connect the readings
212 $self->sequence->add_edge( $source, $target );
213 # Note the witness in question
214 $self->sequence->set_edge_attribute( $source, $target, $wit, 1 );
b15511bf 215};
216
3a2ebbf4 217sub del_path {
218 my $self = shift;
49d4f2ac 219 my @args;
220 if( ref( $_[0] ) eq 'ARRAY' ) {
221 my $e = shift @_;
222 @args = ( @$e, @_ );
223 } else {
224 @args = @_;
225 }
3a2ebbf4 226
227 # We only need the IDs for adding paths to the graph, not the reading
228 # objects themselves.
49d4f2ac 229 my( $source, $target, $wit ) = $self->_stringify_args( @args );
3a2ebbf4 230
231 if( $self->sequence->has_edge_attribute( $source, $target, $wit ) ) {
49d4f2ac 232 $self->sequence->delete_edge_attribute( $source, $target, $wit );
3a2ebbf4 233 }
234 unless( keys %{$self->sequence->get_edge_attributes( $source, $target )} ) {
235 $self->sequence->delete_edge( $source, $target );
236 }
784877d9 237}
238
3a2ebbf4 239
15d2d3df 240# Extra graph-alike utility
241sub has_path {
3a2ebbf4 242 my $self = shift;
243 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
244 return undef unless $self->sequence->has_edge( $source, $target );
245 return $self->sequence->has_edge_attribute( $source, $target, $wit );
b15511bf 246}
247
3a2ebbf4 248### Relationship logic
3265b0ce 249
3a2ebbf4 250=head2 add_relationship( $reading1, $reading2, $definition )
251
252Adds the specified relationship between the two readings. A relationship
253is transitive (i.e. undirected), and must have the following attributes
254specified in the hashref $definition:
255
256=over 4
257
ad1291ee 258=item * type - Can be one of spelling, orthographic, grammatical, meaning, lexical, collated, repetition, transposition. All but the last two are only valid relationships between readings that occur at the same point in the text.
3a2ebbf4 259
260=item * non_correctable - (Optional) True if the reading would not have been corrected independently.
ef9d481f 261
3a2ebbf4 262=item * non_independent - (Optional) True if the variant is unlikely to have occurred independently in unrelated witnesses.
263
264=item * global - (Optional) A meta-attribute, to set the same relationship between readings with the same text whenever they occur in the same place.
265
266=back
267
268=cut
269
270# Wouldn't it be lovely if edges could be objects, and all this type checking
271# and attribute management could be done via Moose?
272
273sub add_relationship {
274 my $self = shift;
275 my( $source, $target, $options ) = $self->_stringify_args( @_ );
276
277 # Check the options
278 if( !defined $options->{'type'} ||
131fa659 279 $options->{'type'} !~ /^(spelling|orthographic|grammatical|meaning|lexical|collated|repetition|transposition)$/i ) {
3a2ebbf4 280 my $t = $options->{'type'} ? $options->{'type'} : '';
49d4f2ac 281 return( undef, "Invalid or missing type " . $options->{'type'} );
3a2ebbf4 282 }
49d4f2ac 283 unless ( $options->{'type'} =~ /^(repetition|transposition)$/ ) {
3a2ebbf4 284 $options->{'colocated'} = 1;
285 }
286
ef9d481f 287 # Make sure there is not another relationship between these two
94c00c71 288 # readings already
3a2ebbf4 289 if( $self->relations->has_edge( $source, $target ) ) {
290 return ( undef, "Relationship already exists between these readings" );
4cdd82f1 291 }
1d310495 292 if( !$self->relationship_valid( $source, $target, $options->{'type'} ) ) {
910a0a6d 293 return ( undef, 'Relationship creates witness loop' );
ef9d481f 294 }
295
3a2ebbf4 296 my @vector = ( $source, $target );
297 $self->relations->add_edge( @vector );
298 $self->relations->set_edge_attributes( @vector, $options );
910a0a6d 299
300 # TODO Handle global relationship setting
301
3a2ebbf4 302 return( 1, @vector );
3265b0ce 303}
304
910a0a6d 305sub relationship_valid {
1d310495 306 my( $self, $source, $target, $rel ) = @_;
307 if( $rel eq 'repetition' ) {
308 return 1;
309 } elsif ( $rel eq 'transposition' ) {
310 # Check that the two readings do not appear in the same witness.
311 my %seen_wits;
312 map { $seen_wits{$_} = 1 } $self->reading_witnesses( $source );
313 foreach my $w ( $self->reading_witnesses( $target ) ) {
314 return 0 if $seen_wits{$w};
315 }
316 return 1;
317 } else {
318 # Check that linking the source and target in a relationship won't lead
319 # to a path loop for any witness. First make a lookup table of all the
320 # readings related to either the source or the target.
321 my @proposed_related = ( $source, $target );
49d4f2ac 322 push( @proposed_related, $self->related_readings( $source, 'colocated' ) );
323 push( @proposed_related, $self->related_readings( $target, 'colocated' ) );
1d310495 324 my %pr_ids;
49d4f2ac 325 map { $pr_ids{ $_ } = 1 } @proposed_related;
1d310495 326
327 # None of these proposed related readings should have a neighbor that
328 # is also in proposed_related.
329 foreach my $pr ( keys %pr_ids ) {
330 foreach my $neighbor( $self->sequence->neighbors( $pr ) ) {
331 return 0 if exists $pr_ids{$neighbor};
332 }
333 }
334 return 1;
335 }
336}
337
338# Return a list of the witnesses in which the reading appears.
339sub reading_witnesses {
340 my( $self, $reading ) = @_;
341 # We need only check either the incoming or the outgoing edges; I have
342 # arbitrarily chosen "incoming".
343 my %all_witnesses;
344 foreach my $e ( $self->sequence->edges_to( $reading ) ) {
345 my $wits = $self->sequence->get_edge_attributes( @$e );
346 @all_witnesses{ keys %$wits } = 1;
347 }
348 return keys %all_witnesses;
910a0a6d 349}
350
3a2ebbf4 351sub related_readings {
352 my( $self, $reading, $colocated ) = @_;
a753cc84 353 my $return_object;
354 if( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
355 $reading = $reading->id;
356 $return_object = 1;
7035e3a6 357# print STDERR "Returning related objects\n";
358# } else {
359# print STDERR "Returning related object names\n";
a753cc84 360 }
3a2ebbf4 361 my @related = $self->relations->all_reachable( $reading );
362 if( $colocated ) {
363 my @colo = grep { $self->relations->has_edge_attribute( $reading, $_, 'colocated' ) } @related;
a753cc84 364 @related = @colo;
365 }
366 return $return_object ? map { $self->reading( $_ ) } @related : @related;
3a2ebbf4 367}
368
8e1394aa 369=head2 Output method(s)
370
371=over
372
373=item B<as_svg>
374
375print $graph->as_svg( $recalculate );
376
0068967c 377Returns an SVG string that represents the graph, via as_dot and graphviz.
8e1394aa 378
379=cut
380
381sub as_svg {
3a2ebbf4 382 my( $self ) = @_;
383
8e1394aa 384 my @cmd = qw/dot -Tsvg/;
385 my( $svg, $err );
910a0a6d 386 my $dotfile = File::Temp->new();
d9e873d0 387 ## TODO REMOVE
eca16057 388 # $dotfile->unlink_on_destroy(0);
910a0a6d 389 binmode $dotfile, ':utf8';
390 print $dotfile $self->as_dot();
391 push( @cmd, $dotfile->filename );
392 run( \@cmd, ">", binary(), \$svg );
393 $svg = decode_utf8( $svg );
8e1394aa 394 return $svg;
395}
396
df6d9812 397=item B<as_dot>
398
399print $graph->as_dot( $view, $recalculate );
400
401Returns a string that is the collation graph expressed in dot
402(i.e. GraphViz) format. The 'view' argument determines what kind of
403graph is produced.
404 * 'path': a graph of witness paths through the collation (DEFAULT)
405 * 'relationship': a graph of how collation readings relate to
406 each other
407
408=cut
409
410sub as_dot {
411 my( $self, $view ) = @_;
3a2ebbf4 412 $view = 'sequence' unless $view;
df6d9812 413 # TODO consider making some of these things configurable
67da8d6c 414 my $graph_name = $self->tradition->name;
415 $graph_name =~ s/[^\w\s]//g;
416 $graph_name = join( '_', split( /\s+/, $graph_name ) );
417 my $dot = sprintf( "digraph %s {\n", $graph_name );
df6d9812 418 $dot .= "\tedge [ arrowhead=open ];\n";
419 $dot .= "\tgraph [ rankdir=LR ];\n";
420 $dot .= sprintf( "\tnode [ fontsize=%d, fillcolor=%s, style=%s, shape=%s ];\n",
3a2ebbf4 421 11, "white", "filled", "ellipse" );
df6d9812 422
423 foreach my $reading ( $self->readings ) {
910a0a6d 424 # Need not output nodes without separate labels
3a2ebbf4 425 next if $reading->id eq $reading->text;
8f9cab7b 426 my $label = $reading->text;
427 $label =~ s/\"/\\\"/g;
428 $dot .= sprintf( "\t\"%s\" [ label=\"%s\" ];\n", $reading->id, $label );
df6d9812 429 }
3a2ebbf4 430
431 # TODO do something sensible for relationships
df6d9812 432
3a2ebbf4 433 my @edges = $self->paths;
df6d9812 434 foreach my $edge ( @edges ) {
910a0a6d 435 my %variables = ( 'color' => '#000000',
436 'fontcolor' => '#000000',
8f9cab7b 437 'label' => join( ', ', $self->path_display_label( $edge ) ),
910a0a6d 438 );
439 my $varopts = join( ', ', map { $_.'="'.$variables{$_}.'"' } sort keys %variables );
81a4c6a0 440 # Account for the rank gap if necessary
441 my $rankgap = $self->reading( $edge->[1] )->rank
442 - $self->reading( $edge->[0] )->rank;
443 $varopts .= ", minlen=$rankgap" if $rankgap > 1;
910a0a6d 444 $dot .= sprintf( "\t\"%s\" -> \"%s\" [ %s ];\n",
3a2ebbf4 445 $edge->[0], $edge->[1], $varopts );
df6d9812 446 }
df6d9812 447 $dot .= "}\n";
448 return $dot;
449}
450
3a2ebbf4 451sub path_witnesses {
452 my( $self, @edge ) = @_;
453 # If edge is an arrayref, cope.
454 if( @edge == 1 && ref( $edge[0] ) eq 'ARRAY' ) {
455 my $e = shift @edge;
456 @edge = @$e;
457 }
458 my @wits = keys %{$self->sequence->get_edge_attributes( @edge )};
459 return sort @wits;
460}
461
8f9cab7b 462sub path_display_label {
463 my( $self, $edge ) = @_;
464 my @wits = $self->path_witnesses( $edge );
465 my $maj = scalar( $self->tradition->witnesses ) * 0.6;
466 if( scalar @wits > $maj ) {
467 return 'majority';
468 } else {
469 return join( ', ', @wits );
470 }
471}
472
473
8e1394aa 474=item B<as_graphml>
475
476print $graph->as_graphml( $recalculate )
477
478Returns a GraphML representation of the collation graph, with
479transposition information and position information. Unless
480$recalculate is passed (and is a true value), the method will return a
481cached copy of the SVG after the first call to the method.
482
483=cut
484
485sub as_graphml {
3a2ebbf4 486 my( $self ) = @_;
8e1394aa 487
488 # Some namespaces
489 my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns';
490 my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance';
491 my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' .
910a0a6d 492 'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
8e1394aa 493
494 # Create the document and root node
495 my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
496 my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
497 $graphml->setDocumentElement( $root );
498 $root->setNamespace( $xsi_ns, 'xsi', 0 );
499 $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
500
e309421a 501 # Add the data keys for the graph
502 my %graph_data_keys;
503 my $gdi = 0;
1d310495 504 my @graph_attributes = qw/ version wit_list_separator baselabel linear ac_label /;
e309421a 505 foreach my $datum ( @graph_attributes ) {
506 $graph_data_keys{$datum} = 'dg'.$gdi++;
507 my $key = $root->addNewChild( $graphml_ns, 'key' );
508 $key->setAttribute( 'attr.name', $datum );
509 $key->setAttribute( 'attr.type', $key eq 'linear' ? 'boolean' : 'string' );
510 $key->setAttribute( 'for', 'graph' );
511 $key->setAttribute( 'id', $graph_data_keys{$datum} );
512 }
f6066bac 513
8e1394aa 514 # Add the data keys for nodes
ef9d481f 515 my %node_data_keys;
516 my $ndi = 0;
3a2ebbf4 517 my %node_data = (
518 id => 'string',
255875b8 519 text => 'string',
3a2ebbf4 520 rank => 'string',
521 is_start => 'boolean',
522 is_end => 'boolean',
523 is_lacuna => 'boolean',
524 );
525 foreach my $datum ( keys %node_data ) {
910a0a6d 526 $node_data_keys{$datum} = 'dn'.$ndi++;
527 my $key = $root->addNewChild( $graphml_ns, 'key' );
528 $key->setAttribute( 'attr.name', $datum );
3a2ebbf4 529 $key->setAttribute( 'attr.type', $node_data{$datum} );
910a0a6d 530 $key->setAttribute( 'for', 'node' );
531 $key->setAttribute( 'id', $node_data_keys{$datum} );
8e1394aa 532 }
533
df6d9812 534 # Add the data keys for edges, i.e. witnesses
ef9d481f 535 my $edi = 0;
536 my %edge_data_keys;
3a2ebbf4 537 my %edge_data = (
3a2ebbf4 538 witness => 'string', # ID/label for a path
539 relationship => 'string', # ID/label for a relationship
540 extra => 'boolean', # Path key
541 colocated => 'boolean', # Relationship key
542 non_correctable => 'boolean', # Relationship key
543 non_independent => 'boolean', # Relationship key
544 );
545 foreach my $datum ( keys %edge_data ) {
546 $edge_data_keys{$datum} = 'de'.$edi++;
910a0a6d 547 my $key = $root->addNewChild( $graphml_ns, 'key' );
3a2ebbf4 548 $key->setAttribute( 'attr.name', $datum );
549 $key->setAttribute( 'attr.type', $edge_data{$datum} );
910a0a6d 550 $key->setAttribute( 'for', 'edge' );
3a2ebbf4 551 $key->setAttribute( 'id', $edge_data_keys{$datum} );
8e1394aa 552 }
3a2ebbf4 553
2c669bca 554 # Add the collation graphs themselves
555 my $sgraph = $root->addNewChild( $graphml_ns, 'graph' );
556 $sgraph->setAttribute( 'edgedefault', 'directed' );
557 $sgraph->setAttribute( 'id', $self->tradition->name );
558 $sgraph->setAttribute( 'parse.edgeids', 'canonical' );
559 $sgraph->setAttribute( 'parse.edges', scalar($self->paths) );
560 $sgraph->setAttribute( 'parse.nodeids', 'canonical' );
561 $sgraph->setAttribute( 'parse.nodes', scalar($self->readings) );
562 $sgraph->setAttribute( 'parse.order', 'nodesfirst' );
563
564 my $rgraph = $root->addNewChild( $graphml_ns, 'graph' );
565 $rgraph->setAttribute( 'edgedefault', 'undirected' );
566 $rgraph->setAttribute( 'id', 'relationships' );
567 $rgraph->setAttribute( 'parse.edgeids', 'canonical' );
568 $rgraph->setAttribute( 'parse.edges', scalar($self->relationships) );
569 $rgraph->setAttribute( 'parse.nodeids', 'canonical' );
570 $rgraph->setAttribute( 'parse.nodes', scalar($self->readings) );
571 $rgraph->setAttribute( 'parse.order', 'nodesfirst' );
e309421a 572
573 # Collation attribute data
574 foreach my $datum ( @graph_attributes ) {
2c669bca 575 my $value = $datum eq 'version' ? '3.0' : $self->$datum;
576 _add_graphml_data( $sgraph, $graph_data_keys{$datum}, $value );
e309421a 577 }
8e1394aa 578
579 my $node_ctr = 0;
580 my %node_hash;
2c669bca 581 # Add our readings to the graphs
3a2ebbf4 582 foreach my $n ( sort { $a->id cmp $b->id } $self->readings ) {
2c669bca 583 # Add to the main graph
584 my $node_el = $sgraph->addNewChild( $graphml_ns, 'node' );
910a0a6d 585 my $node_xmlid = 'n' . $node_ctr++;
3a2ebbf4 586 $node_hash{ $n->id } = $node_xmlid;
910a0a6d 587 $node_el->setAttribute( 'id', $node_xmlid );
255875b8 588 foreach my $d ( keys %node_data ) {
589 my $nval = $n->$d;
590 _add_graphml_data( $node_el, $node_data_keys{$d}, $nval )
591 if defined $nval;
592 }
2c669bca 593 # Add to the relationships graph
594 my $rnode_el = $rgraph->addNewChild( $graphml_ns, 'node' );
595 $rnode_el->setAttribute( 'id', $node_xmlid );
b15511bf 596 }
597
2c669bca 598 # Add the path edges to the sequence graph
df6d9812 599 my $edge_ctr = 0;
3a2ebbf4 600 foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->sequence->edges() ) {
601 # We add an edge in the graphml for every witness in $e.
602 foreach my $wit ( $self->path_witnesses( $e ) ) {
603 my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
604 $node_hash{ $e->[0] },
605 $node_hash{ $e->[1] } );
2c669bca 606 my $edge_el = $sgraph->addNewChild( $graphml_ns, 'edge' );
3a2ebbf4 607 $edge_el->setAttribute( 'source', $from );
608 $edge_el->setAttribute( 'target', $to );
609 $edge_el->setAttribute( 'id', $id );
3a2ebbf4 610
611 # It's a witness path, so add the witness
612 my $base = $wit;
613 my $key = $edge_data_keys{'witness'};
614 # Is this an ante-corr witness?
615 my $aclabel = $self->ac_label;
616 if( $wit =~ /^(.*)\Q$aclabel\E$/ ) {
617 # Keep the base witness
618 $base = $1;
619 # ...and record that this is an 'extra' reading path
620 _add_graphml_data( $edge_el, $edge_data_keys{'extra'}, $aclabel );
621 }
622 _add_graphml_data( $edge_el, $edge_data_keys{'witness'}, $base );
623 }
624 }
625
2c669bca 626 # Add the relationship edges to the relationships graph
3a2ebbf4 627 foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->relationships ) {
628 my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
629 $node_hash{ $e->[0] },
630 $node_hash{ $e->[1] } );
2c669bca 631 my $edge_el = $rgraph->addNewChild( $graphml_ns, 'edge' );
3a2ebbf4 632 $edge_el->setAttribute( 'source', $from );
633 $edge_el->setAttribute( 'target', $to );
634 $edge_el->setAttribute( 'id', $id );
3a2ebbf4 635
636 my $data = $self->relations->get_edge_attributes( @$e );
637 # It's a relationship, so save the relationship data
638 _add_graphml_data( $edge_el, $edge_data_keys{'relationship'}, $data->{type} );
639 _add_graphml_data( $edge_el, $edge_data_keys{'colocated'}, $data->{colocated} );
640 if( exists $data->{non_correctable} ) {
641 _add_graphml_data( $edge_el, $edge_data_keys{'non_correctable'},
642 $data->{non_correctable} );
643 }
644 if( exists $data->{non_independent} ) {
645 _add_graphml_data( $edge_el, $edge_data_keys{'non_independent'},
646 $data->{non_independent} );
647 }
8e1394aa 648 }
649
94c00c71 650 # Save and return the thing
651 my $result = decode_utf8( $graphml->toString(1) );
94c00c71 652 return $result;
df6d9812 653}
654
b15511bf 655sub _add_graphml_data {
656 my( $el, $key, $value ) = @_;
b15511bf 657 return unless defined $value;
c9bf3dbf 658 my $data_el = $el->addNewChild( $el->namespaceURI, 'data' );
b15511bf 659 $data_el->setAttribute( 'key', $key );
660 $data_el->appendText( $value );
8e1394aa 661}
662
910a0a6d 663=item B<as_csv>
664
665print $graph->as_csv( $recalculate )
666
667Returns a CSV alignment table representation of the collation graph, one
2c669bca 668row per witness (or witness uncorrected.)
910a0a6d 669
670=cut
671
672sub as_csv {
3a2ebbf4 673 my( $self ) = @_;
910a0a6d 674 my $table = $self->make_alignment_table;
675 my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } );
676 my @result;
2c669bca 677 # Make the header row
678 $csv->combine( map { $_->{'witness'} } @{$table->{'alignment'}} );
679 push( @result, decode_utf8( $csv->string ) );
680 # Make the rest of the rows
681 foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
682 my @rowobjs = map { $_->[$idx] } @{$table->{'alignment'}};
683 my @row = map { $_ ? $_->{'t'} : $_ } @rowobjs;
684 $csv->combine( @row );
910a0a6d 685 push( @result, decode_utf8( $csv->string ) );
686 }
3a2ebbf4 687 return join( "\n", @result );
910a0a6d 688}
689
2c669bca 690=item B<make_alignment_table>
691
692my $table = $graph->make_alignment_table( $use_refs, \@wits_to_include )
693
694Return a reference to an alignment table, in the format described at
695L<http://gregor.middell.net/collatex>. If $use_refs is set to 1, the reading
696object is returned in the table; if not, the text of the reading is returned.
697If $wits_to_include is set to an arrayref, only the witnesses listed will be
698included in the table.
699
700=cut
9f3ba6f7 701
910a0a6d 702sub make_alignment_table {
08e0fb85 703 my( $self, $noderefs, $include ) = @_;
910a0a6d 704 unless( $self->linear ) {
705 warn "Need a linear graph in order to make an alignment table";
706 return;
707 }
2c669bca 708 my $table = { 'alignment' => [], 'length' => $self->end->rank - 1 };
3a2ebbf4 709 my @all_pos = ( 1 .. $self->end->rank - 1 );
910a0a6d 710 foreach my $wit ( $self->tradition->witnesses ) {
2c669bca 711 if( $include ) {
712 next unless grep { $_ eq $wit->sigil } @$include;
713 }
714 $DB::single = 1 if $wit->sigil eq 'U';
eca16057 715 # print STDERR "Making witness row(s) for " . $wit->sigil . "\n";
1f7aa795 716 my @wit_path = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
717 my @row = _make_witness_row( \@wit_path, \@all_pos, $noderefs );
2c669bca 718 push( @{$table->{'alignment'}},
719 { 'witness' => $wit->sigil, 'tokens' => \@row } );
1f7aa795 720 if( $wit->is_layered ) {
721 my @wit_ac_path = $self->reading_sequence( $self->start, $self->end,
722 $wit->sigil.$self->ac_label, $wit->sigil );
723 my @ac_row = _make_witness_row( \@wit_ac_path, \@all_pos, $noderefs );
2c669bca 724 push( @{$table->{'alignment'}},
725 { 'witness' => $wit->sigil.$self->ac_label, 'tokens' => \@ac_row } );
910a0a6d 726 }
727 }
2c669bca 728 return $table;
910a0a6d 729}
730
731sub _make_witness_row {
0e476982 732 my( $path, $positions, $noderefs ) = @_;
910a0a6d 733 my %char_hash;
734 map { $char_hash{$_} = undef } @$positions;
2c669bca 735 my $debug = 0;
910a0a6d 736 foreach my $rdg ( @$path ) {
eca16057 737 my $rtext = $rdg->text;
738 $rtext = '#LACUNA#' if $rdg->is_lacuna;
2c669bca 739 print STDERR "rank " . $rdg->rank . "\n" if $debug;
3a2ebbf4 740 # print STDERR "No rank for " . $rdg->id . "\n" unless defined $rdg->rank;
2c669bca 741 $char_hash{$rdg->rank} = $noderefs ? { 't' => $rdg }
742 : { 't' => $rtext };
910a0a6d 743 }
744 my @row = map { $char_hash{$_} } @$positions;
eca16057 745 # Fill in lacuna markers for undef spots in the row
746 my $last_el = shift @row;
747 my @filled_row = ( $last_el );
748 foreach my $el ( @row ) {
0e476982 749 # If we are using node reference, make the lacuna node appear many times
750 # in the table. If not, use the lacuna tag.
751 if( $last_el && _el_is_lacuna( $last_el ) && !defined $el ) {
2c669bca 752 $el = $noderefs ? { 't' => $last_el } : { 't' => '#LACUNA#' };
eca16057 753 }
754 push( @filled_row, $el );
755 $last_el = $el;
756 }
757 return @filled_row;
910a0a6d 758}
759
0e476982 760# Tiny utility function to say if a table element is a lacuna
761sub _el_is_lacuna {
762 my $el = shift;
2c669bca 763 return 1 if $el->{'t'} eq '#LACUNA#';
764 return 1 if ref( $el->{'t'} ) eq 'Text::Tradition::Collation::Reading'
765 && $el->{'t'}->is_lacuna;
0e476982 766 return 0;
767}
768
910a0a6d 769# Helper to turn the witnesses along columns rather than rows. Assumes
770# equal-sized rows.
771sub _turn_table {
772 my( $table ) = @_;
773 my $result = [];
774 return $result unless scalar @$table;
775 my $nrows = scalar @{$table->[0]};
776 foreach my $idx ( 0 .. $nrows - 1 ) {
777 foreach my $wit ( 0 .. $#{$table} ) {
778 $result->[$idx]->[$wit] = $table->[$wit]->[$idx];
779 }
780 }
781 return $result;
782}
783
8e1394aa 784=back
785
de51424a 786=head2 Navigation methods
787
788=over
789
8e1394aa 790=item B<start>
791
792my $beginning = $collation->start();
793
794Returns the beginning of the collation, a meta-reading with label '#START#'.
795
910a0a6d 796=item B<end>
797
798my $end = $collation->end();
799
800Returns the end of the collation, a meta-reading with label '#END#'.
801
910a0a6d 802
e2902068 803=item B<reading_sequence>
804
805my @readings = $graph->reading_sequence( $first, $last, $path[, $alt_path] );
806
807Returns the ordered list of readings, starting with $first and ending
808with $last, along the given witness path. If no path is specified,
809assume that the path is that of the base text (if any.)
810
811=cut
812
910a0a6d 813# TODO Think about returning some lazy-eval iterator.
814
e2902068 815sub reading_sequence {
816 my( $self, $start, $end, $witness, $backup ) = @_;
817
930ff666 818 $witness = $self->baselabel unless $witness;
e2902068 819 my @readings = ( $start );
820 my %seen;
821 my $n = $start;
3a2ebbf4 822 while( $n && $n->id ne $end->id ) {
823 if( exists( $seen{$n->id} ) ) {
824 warn "Detected loop at " . $n->id;
910a0a6d 825 last;
826 }
3a2ebbf4 827 $seen{$n->id} = 1;
910a0a6d 828
829 my $next = $self->next_reading( $n, $witness, $backup );
44771cf2 830 unless( $next ) {
3a2ebbf4 831 warn "Did not find any path for $witness from reading " . $n->id;
44771cf2 832 last;
833 }
910a0a6d 834 push( @readings, $next );
835 $n = $next;
e2902068 836 }
837 # Check that the last reading is our end reading.
838 my $last = $readings[$#readings];
3a2ebbf4 839 warn "Last reading found from " . $start->text .
910a0a6d 840 " for witness $witness is not the end!"
3a2ebbf4 841 unless $last->id eq $end->id;
e2902068 842
843 return @readings;
844}
845
4a8828f0 846=item B<next_reading>
8e1394aa 847
4a8828f0 848my $next_reading = $graph->next_reading( $reading, $witpath );
8e1394aa 849
4a8828f0 850Returns the reading that follows the given reading along the given witness
930ff666 851path.
8e1394aa 852
853=cut
854
4a8828f0 855sub next_reading {
e2902068 856 # Return the successor via the corresponding path.
8e1394aa 857 my $self = shift;
3a2ebbf4 858 my $answer = $self->_find_linked_reading( 'next', @_ );
2c669bca 859 return undef unless $answer;
3a2ebbf4 860 return $self->reading( $answer );
8e1394aa 861}
862
4a8828f0 863=item B<prior_reading>
8e1394aa 864
4a8828f0 865my $prior_reading = $graph->prior_reading( $reading, $witpath );
8e1394aa 866
4a8828f0 867Returns the reading that precedes the given reading along the given witness
930ff666 868path.
8e1394aa 869
870=cut
871
4a8828f0 872sub prior_reading {
e2902068 873 # Return the predecessor via the corresponding path.
8e1394aa 874 my $self = shift;
3a2ebbf4 875 my $answer = $self->_find_linked_reading( 'prior', @_ );
876 return $self->reading( $answer );
8e1394aa 877}
878
4a8828f0 879sub _find_linked_reading {
e2902068 880 my( $self, $direction, $node, $path, $alt_path ) = @_;
881 my @linked_paths = $direction eq 'next'
3a2ebbf4 882 ? $self->sequence->edges_from( $node )
883 : $self->sequence->edges_to( $node );
e2902068 884 return undef unless scalar( @linked_paths );
8e1394aa 885
e2902068 886 # We have to find the linked path that contains all of the
887 # witnesses supplied in $path.
888 my( @path_wits, @alt_path_wits );
3a2ebbf4 889 @path_wits = sort( $self->witnesses_of_label( $path ) ) if $path;
890 @alt_path_wits = sort( $self->witnesses_of_label( $alt_path ) ) if $alt_path;
e2902068 891 my $base_le;
892 my $alt_le;
893 foreach my $le ( @linked_paths ) {
3a2ebbf4 894 if( $self->sequence->has_edge_attribute( @$le, $self->baselabel ) ) {
910a0a6d 895 $base_le = $le;
910a0a6d 896 }
3a2ebbf4 897 my @le_wits = $self->path_witnesses( $le );
898 if( _is_within( \@path_wits, \@le_wits ) ) {
899 # This is the right path.
900 return $direction eq 'next' ? $le->[1] : $le->[0];
901 } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) {
902 $alt_le = $le;
903 }
8e1394aa 904 }
e2902068 905 # Got this far? Return the alternate path if it exists.
3a2ebbf4 906 return $direction eq 'next' ? $alt_le->[1] : $alt_le->[0]
910a0a6d 907 if $alt_le;
e2902068 908
909 # Got this far? Return the base path if it exists.
3a2ebbf4 910 return $direction eq 'next' ? $base_le->[1] : $base_le->[0]
910a0a6d 911 if $base_le;
e2902068 912
913 # Got this far? We have no appropriate path.
2c669bca 914 warn "Could not find $direction node from " . $node->id
910a0a6d 915 . " along path $path";
8e1394aa 916 return undef;
917}
918
4a8828f0 919# Some set logic.
920sub _is_within {
921 my( $set1, $set2 ) = @_;
7854e12e 922 my $ret = @$set1; # will be 0, i.e. false, if set1 is empty
4a8828f0 923 foreach my $el ( @$set1 ) {
910a0a6d 924 $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
4a8828f0 925 }
926 return $ret;
927}
928
de51424a 929
930## INITIALIZATION METHODS - for use by parsers
930ff666 931
7e450e44 932# For use when a collation is constructed from a base text and an apparatus.
933# We have the sequences of readings and just need to add path edges.
1f7aa795 934# When we are done, clear out the witness path attributes, as they are no
935# longer needed.
936# TODO Find a way to replace the witness path attributes with encapsulated functions?
e2902068 937
6a222840 938sub make_witness_paths {
939 my( $self ) = @_;
910a0a6d 940 foreach my $wit ( $self->tradition->witnesses ) {
0068967c 941 # print STDERR "Making path for " . $wit->sigil . "\n";
910a0a6d 942 $self->make_witness_path( $wit );
7854e12e 943 }
7854e12e 944}
945
6a222840 946sub make_witness_path {
7854e12e 947 my( $self, $wit ) = @_;
948 my @chain = @{$wit->path};
15d2d3df 949 my $sig = $wit->sigil;
7854e12e 950 foreach my $idx ( 0 .. $#chain-1 ) {
910a0a6d 951 $self->add_path( $chain[$idx], $chain[$idx+1], $sig );
7854e12e 952 }
1f7aa795 953 if( $wit->is_layered ) {
d9e873d0 954 @chain = @{$wit->uncorrected_path};
955 foreach my $idx( 0 .. $#chain-1 ) {
956 my $source = $chain[$idx];
957 my $target = $chain[$idx+1];
958 $self->add_path( $source, $target, $sig.$self->ac_label )
959 unless $self->has_path( $source, $target, $sig );
960 }
15d2d3df 961 }
1f7aa795 962 $wit->clear_path;
963 $wit->clear_uncorrected_path;
e2902068 964}
965
910a0a6d 966sub calculate_ranks {
967 my $self = shift;
968 # Walk a version of the graph where every node linked by a relationship
969 # edge is fundamentally the same node, and do a topological ranking on
970 # the nodes in this graph.
c9bf3dbf 971 my $topo_graph = Graph->new();
910a0a6d 972 my %rel_containers;
973 my $rel_ctr = 0;
974 # Add the nodes
975 foreach my $r ( $self->readings ) {
3a2ebbf4 976 next if exists $rel_containers{$r->id};
910a0a6d 977 my @rels = $r->related_readings( 'colocated' );
978 if( @rels ) {
979 # Make a relationship container.
980 push( @rels, $r );
c9bf3dbf 981 my $rn = 'rel_container_' . $rel_ctr++;
982 $topo_graph->add_vertex( $rn );
910a0a6d 983 foreach( @rels ) {
3a2ebbf4 984 $rel_containers{$_->id} = $rn;
910a0a6d 985 }
986 } else {
987 # Add a new node to mirror the old node.
3a2ebbf4 988 $rel_containers{$r->id} = $r->id;
989 $topo_graph->add_vertex( $r->id );
910a0a6d 990 }
4a8828f0 991 }
3a1f2523 992
3a2ebbf4 993 # Add the edges.
910a0a6d 994 foreach my $r ( $self->readings ) {
3a2ebbf4 995 foreach my $n ( $self->sequence->successors( $r->id ) ) {
996 my( $tfrom, $tto ) = ( $rel_containers{$r->id},
997 $rel_containers{$n} );
49d4f2ac 998 $DB::single = 1 unless $tfrom && $tto;
3a2ebbf4 999 $topo_graph->add_edge( $tfrom, $tto );
910a0a6d 1000 }
1001 }
1002
1003 # Now do the rankings, starting with the start node.
3a2ebbf4 1004 my $topo_start = $rel_containers{$self->start->id};
c9bf3dbf 1005 my $node_ranks = { $topo_start => 0 };
910a0a6d 1006 my @curr_origin = ( $topo_start );
1007 # A little iterative function.
1008 while( @curr_origin ) {
c9bf3dbf 1009 @curr_origin = _assign_rank( $topo_graph, $node_ranks, @curr_origin );
910a0a6d 1010 }
1011 # Transfer our rankings from the topological graph to the real one.
1012 foreach my $r ( $self->readings ) {
3a2ebbf4 1013 if( defined $node_ranks->{$rel_containers{$r->id}} ) {
1014 $r->rank( $node_ranks->{$rel_containers{$r->id}} );
67da8d6c 1015 } else {
1016 $DB::single = 1;
3a2ebbf4 1017 die "No rank calculated for node " . $r->id
67da8d6c 1018 . " - do you have a cycle in the graph?";
1019 }
de51424a 1020 }
8e1394aa 1021}
3a1f2523 1022
910a0a6d 1023sub _assign_rank {
c9bf3dbf 1024 my( $graph, $node_ranks, @current_nodes ) = @_;
910a0a6d 1025 # Look at each of the children of @current_nodes. If all the child's
1026 # parents have a rank, assign it the highest rank + 1 and add it to
c9bf3dbf 1027 # @next_nodes. Otherwise skip it; we will return when the highest-ranked
1028 # parent gets a rank.
910a0a6d 1029 my @next_nodes;
1030 foreach my $c ( @current_nodes ) {
c9bf3dbf 1031 warn "Current reading $c has no rank!"
1032 unless exists $node_ranks->{$c};
1033 # print STDERR "Looking at child of node $c, rank "
1034 # . $node_ranks->{$c} . "\n";
1035 foreach my $child ( $graph->successors( $c ) ) {
1036 next if exists $node_ranks->{$child};
910a0a6d 1037 my $highest_rank = -1;
1038 my $skip = 0;
c9bf3dbf 1039 foreach my $parent ( $graph->predecessors( $child ) ) {
1040 if( exists $node_ranks->{$parent} ) {
1041 $highest_rank = $node_ranks->{$parent}
1042 if $highest_rank <= $node_ranks->{$parent};
910a0a6d 1043 } else {
1044 $skip = 1;
1045 last;
1046 }
1047 }
1048 next if $skip;
c9bf3dbf 1049 my $c_rank = $highest_rank + 1;
1050 # print STDERR "Assigning rank $c_rank to node $child \n";
1051 $node_ranks->{$child} = $c_rank;
910a0a6d 1052 push( @next_nodes, $child );
1053 }
1054 }
1055 return @next_nodes;
4cdd82f1 1056}
910a0a6d 1057
0e476982 1058# Another method to make up for rough collation methods. If the same reading
1059# appears multiple times at the same rank, collapse the nodes.
1060sub flatten_ranks {
1061 my $self = shift;
1062 my %unique_rank_rdg;
1063 foreach my $rdg ( $self->readings ) {
1064 next unless $rdg->has_rank;
1065 my $key = $rdg->rank . "||" . $rdg->text;
1066 if( exists $unique_rank_rdg{$key} ) {
1067 # Combine!
0068967c 1068 # print STDERR "Combining readings at same rank: $key\n";
0e476982 1069 $self->merge_readings( $unique_rank_rdg{$key}, $rdg );
1070 } else {
1071 $unique_rank_rdg{$key} = $rdg;
1072 }
1073 }
1074}
1075
1076
fa954f4c 1077## Utility functions
de51424a 1078
4a8828f0 1079# Return the string that joins together a list of witnesses for
1080# display on a single path.
4a8828f0 1081sub witnesses_of_label {
de51424a 1082 my( $self, $label ) = @_;
4a8828f0 1083 my $regex = $self->wit_list_separator;
de51424a 1084 my @answer = split( /\Q$regex\E/, $label );
1085 return @answer;
4a8828f0 1086}
8e1394aa 1087
dd3b58b0 1088no Moose;
1089__PACKAGE__->meta->make_immutable;
e867486f 1090
1091=head1 BUGS / TODO
1092
1093=over
1094
0068967c 1095=item * Think about making Relationship objects again
e867486f 1096
1097=back