Merge branch 'master' into relationships
[scpubgit/stemmatology.git] / lib / Text / Tradition / Collation.pm
CommitLineData
dd3b58b0 1package Text::Tradition::Collation;
d047cd52 2
910a0a6d 3use Encode qw( decode_utf8 );
4use File::Temp;
c9bf3dbf 5use Graph;
8e1394aa 6use IPC::Run qw( run binary );
910a0a6d 7use Text::CSV_XS;
b15511bf 8use Text::Tradition::Collation::Reading;
df6d9812 9use XML::LibXML;
dd3b58b0 10use Moose;
11
3a2ebbf4 12has 'sequence' => (
d047cd52 13 is => 'ro',
3a2ebbf4 14 isa => 'Graph',
15 default => sub { Graph->new() },
d047cd52 16 handles => {
3a2ebbf4 17 paths => 'edges',
d047cd52 18 },
d047cd52 19 );
3a2ebbf4 20
21has 'relations' => (
22 is => 'ro',
23 isa => 'Graph',
24 default => sub { Graph->new( undirected => 1 ) },
25 handles => {
26 relationships => 'edges',
27 },
28 );
dd3b58b0 29
3a2ebbf4 30has 'tradition' => (
31 is => 'ro',
d047cd52 32 isa => 'Text::Tradition',
8d9a1cd8 33 weak_ref => 1,
d047cd52 34 );
dd3b58b0 35
3a2ebbf4 36has 'readings' => (
37 isa => 'HashRef[Text::Tradition::Collation::Reading]',
38 traits => ['Hash'],
39 handles => {
40 reading => 'get',
41 _add_reading => 'set',
42 del_reading => 'delete',
43 has_reading => 'exists',
44 readings => 'values',
45 },
46 default => sub { {} },
47 );
910a0a6d 48
4a8828f0 49has 'wit_list_separator' => (
7854e12e 50 is => 'rw',
51 isa => 'Str',
52 default => ', ',
53 );
54
55has 'baselabel' => (
56 is => 'rw',
57 isa => 'Str',
58 default => 'base text',
59 );
4a8828f0 60
15d2d3df 61has 'linear' => (
62 is => 'rw',
63 isa => 'Bool',
64 default => 1,
65 );
1f563ac3 66
ef9d481f 67has 'ac_label' => (
68 is => 'rw',
69 isa => 'Str',
70 default => ' (a.c.)',
71 );
3a2ebbf4 72
73has 'start' => (
74 is => 'ro',
75 isa => 'Text::Tradition::Collation::Reading',
76 writer => '_set_start',
77 weak_ref => 1,
78 );
79
80has 'end' => (
81 is => 'ro',
82 isa => 'Text::Tradition::Collation::Reading',
83 writer => '_set_end',
84 weak_ref => 1,
85 );
1f563ac3 86
dd3b58b0 87# The collation can be created two ways:
88# 1. Collate a set of witnesses (with CollateX I guess) and process
89# the results as in 2.
90# 2. Read a pre-prepared collation in one of a variety of formats,
91# and make the graph from that.
92
93# The graph itself will (for now) be immutable, and the positions
94# within the graph will also be immutable. We need to calculate those
95# positions upon graph construction. The equivalences between graph
96# nodes will be mutable, entirely determined by the user (or possibly
97# by some semantic pre-processing provided by the user.) So the
98# constructor should just make an empty equivalences object. The
99# constructor will also need to make the witness objects, if we didn't
100# come through option 1.
101
d047cd52 102sub BUILD {
3a2ebbf4 103 my $self = shift;
104 $self->_set_start( $self->add_reading( { 'collation' => $self, 'is_start' => 1 } ) );
105 $self->_set_end( $self->add_reading( { 'collation' => $self, 'is_end' => 1 } ) );
d047cd52 106}
784877d9 107
3a2ebbf4 108### Reading construct/destruct functions
109
110sub add_reading {
111 my( $self, $reading ) = @_;
112 unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
113 my %args = %$reading;
114 $reading = Text::Tradition::Collation::Reading->new(
115 'collation' => $self,
116 %args );
117 }
118 # First check to see if a reading with this ID exists.
119 if( $self->reading( $reading->id ) ) {
120 warn "Collation already has a reading with id " . $reading->id;
121 return undef;
122 }
123 $self->_add_reading( $reading->id => $reading );
124 # Once the reading has been added, put it in both graphs.
125 $self->sequence->add_vertex( $reading->id );
126 $self->relations->add_vertex( $reading->id );
127 return $reading;
eca16057 128};
129
3a2ebbf4 130around del_reading => sub {
131 my $orig = shift;
132 my $self = shift;
133 my $arg = shift;
134
135 if( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
136 $arg = $arg->id;
137 }
3a2ebbf4 138 # Remove the reading from the graphs.
139 $self->sequence->delete_vertex( $arg );
140 $self->relations->delete_vertex( $arg );
141
142 # Carry on.
143 $self->$orig( $arg );
144};
7854e12e 145
3a2ebbf4 146# merge_readings( $main, $to_be_deleted );
7854e12e 147
3a2ebbf4 148sub merge_readings {
149 my $self = shift;
150
151 # We only need the IDs for adding paths to the graph, not the reading
152 # objects themselves.
49d4f2ac 153 my( $kept, $deleted, $combine_char ) = $self->_stringify_args( @_ );
3a2ebbf4 154
155 # The kept reading should inherit the paths and the relationships
156 # of the deleted reading.
157 foreach my $path ( $self->sequence->edges_at( $deleted ) ) {
158 my @vector = ( $kept );
159 push( @vector, $path->[1] ) if $path->[0] eq $deleted;
160 unshift( @vector, $path->[0] ) if $path->[1] eq $deleted;
49d4f2ac 161 next if $vector[0] eq $vector[1]; # Don't add a self loop
3a2ebbf4 162 my %wits = %{$self->sequence->get_edge_attributes( @$path )};
163 $self->sequence->add_edge( @vector );
164 my $fwits = $self->sequence->get_edge_attributes( @vector );
165 @wits{keys %$fwits} = values %$fwits;
166 $self->sequence->set_edge_attributes( @vector, \%wits );
167 }
168 foreach my $rel ( $self->relations->edges_at( $deleted ) ) {
169 my @vector = ( $kept );
170 push( @vector, $rel->[0] eq $deleted ? $rel->[1] : $rel->[0] );
49d4f2ac 171 next if $vector[0] eq $vector[1]; # Don't add a self loop
3a2ebbf4 172 # Is there a relationship here already? If so, keep it.
173 # TODO Warn about conflicting relationships
174 next if $self->relations->has_edge( @vector );
175 # If not, adopt the relationship that would be deleted.
176 $self->relations->add_edge( @vector );
177 my $attr = $self->relations->get_edge_attributes( @$rel );
178 $self->relations->set_edge_attributes( @vector, $attr );
179 }
180
181 # Do the deletion deed.
49d4f2ac 182 if( $combine_char ) {
183 my $kept_obj = $self->reading( $kept );
184 my $new_text = join( $combine_char, $kept_obj->text,
185 $self->reading( $deleted )->text );
186 $kept_obj->alter_text( $new_text );
187 }
3a2ebbf4 188 $self->del_reading( $deleted );
189}
7854e12e 190
3265b0ce 191
3a2ebbf4 192# Helper function for manipulating the graph.
193sub _stringify_args {
194 my( $self, $first, $second, $arg ) = @_;
195 $first = $first->id
196 if ref( $first ) eq 'Text::Tradition::Collation::Reading';
197 $second = $second->id
198 if ref( $second ) eq 'Text::Tradition::Collation::Reading';
199 return( $first, $second, $arg );
200}
df6d9812 201
3a2ebbf4 202### Path logic
203
204sub add_path {
205 my $self = shift;
206
207 # We only need the IDs for adding paths to the graph, not the reading
208 # objects themselves.
209 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
210
211 # Connect the readings
212 $self->sequence->add_edge( $source, $target );
213 # Note the witness in question
214 $self->sequence->set_edge_attribute( $source, $target, $wit, 1 );
b15511bf 215};
216
3a2ebbf4 217sub del_path {
218 my $self = shift;
49d4f2ac 219 my @args;
220 if( ref( $_[0] ) eq 'ARRAY' ) {
221 my $e = shift @_;
222 @args = ( @$e, @_ );
223 } else {
224 @args = @_;
225 }
3a2ebbf4 226
227 # We only need the IDs for adding paths to the graph, not the reading
228 # objects themselves.
49d4f2ac 229 my( $source, $target, $wit ) = $self->_stringify_args( @args );
3a2ebbf4 230
231 if( $self->sequence->has_edge_attribute( $source, $target, $wit ) ) {
49d4f2ac 232 $self->sequence->delete_edge_attribute( $source, $target, $wit );
3a2ebbf4 233 }
234 unless( keys %{$self->sequence->get_edge_attributes( $source, $target )} ) {
235 $self->sequence->delete_edge( $source, $target );
236 }
784877d9 237}
238
3a2ebbf4 239
15d2d3df 240# Extra graph-alike utility
241sub has_path {
3a2ebbf4 242 my $self = shift;
243 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
244 return undef unless $self->sequence->has_edge( $source, $target );
245 return $self->sequence->has_edge_attribute( $source, $target, $wit );
b15511bf 246}
247
3a2ebbf4 248### Relationship logic
3265b0ce 249
3a2ebbf4 250=head2 add_relationship( $reading1, $reading2, $definition )
251
252Adds the specified relationship between the two readings. A relationship
253is transitive (i.e. undirected), and must have the following attributes
254specified in the hashref $definition:
255
256=over 4
257
ad1291ee 258=item * type - Can be one of spelling, orthographic, grammatical, meaning, lexical, collated, repetition, transposition. All but the last two are only valid relationships between readings that occur at the same point in the text.
3a2ebbf4 259
260=item * non_correctable - (Optional) True if the reading would not have been corrected independently.
ef9d481f 261
3a2ebbf4 262=item * non_independent - (Optional) True if the variant is unlikely to have occurred independently in unrelated witnesses.
263
264=item * global - (Optional) A meta-attribute, to set the same relationship between readings with the same text whenever they occur in the same place.
265
266=back
267
268=cut
269
270# Wouldn't it be lovely if edges could be objects, and all this type checking
271# and attribute management could be done via Moose?
272
273sub add_relationship {
274 my $self = shift;
275 my( $source, $target, $options ) = $self->_stringify_args( @_ );
276
277 # Check the options
278 if( !defined $options->{'type'} ||
131fa659 279 $options->{'type'} !~ /^(spelling|orthographic|grammatical|meaning|lexical|collated|repetition|transposition)$/i ) {
3a2ebbf4 280 my $t = $options->{'type'} ? $options->{'type'} : '';
49d4f2ac 281 return( undef, "Invalid or missing type " . $options->{'type'} );
3a2ebbf4 282 }
49d4f2ac 283 unless ( $options->{'type'} =~ /^(repetition|transposition)$/ ) {
3a2ebbf4 284 $options->{'colocated'} = 1;
285 }
286
ef9d481f 287 # Make sure there is not another relationship between these two
94c00c71 288 # readings already
3a2ebbf4 289 if( $self->relations->has_edge( $source, $target ) ) {
290 return ( undef, "Relationship already exists between these readings" );
4cdd82f1 291 }
1d310495 292 if( !$self->relationship_valid( $source, $target, $options->{'type'} ) ) {
910a0a6d 293 return ( undef, 'Relationship creates witness loop' );
ef9d481f 294 }
295
3a2ebbf4 296 my @vector = ( $source, $target );
297 $self->relations->add_edge( @vector );
298 $self->relations->set_edge_attributes( @vector, $options );
910a0a6d 299
300 # TODO Handle global relationship setting
301
3a2ebbf4 302 return( 1, @vector );
3265b0ce 303}
304
910a0a6d 305sub relationship_valid {
1d310495 306 my( $self, $source, $target, $rel ) = @_;
307 if( $rel eq 'repetition' ) {
308 return 1;
309 } elsif ( $rel eq 'transposition' ) {
310 # Check that the two readings do not appear in the same witness.
311 my %seen_wits;
312 map { $seen_wits{$_} = 1 } $self->reading_witnesses( $source );
313 foreach my $w ( $self->reading_witnesses( $target ) ) {
314 return 0 if $seen_wits{$w};
315 }
316 return 1;
317 } else {
318 # Check that linking the source and target in a relationship won't lead
319 # to a path loop for any witness. First make a lookup table of all the
320 # readings related to either the source or the target.
321 my @proposed_related = ( $source, $target );
49d4f2ac 322 push( @proposed_related, $self->related_readings( $source, 'colocated' ) );
323 push( @proposed_related, $self->related_readings( $target, 'colocated' ) );
1d310495 324 my %pr_ids;
49d4f2ac 325 map { $pr_ids{ $_ } = 1 } @proposed_related;
1d310495 326
327 # None of these proposed related readings should have a neighbor that
328 # is also in proposed_related.
329 foreach my $pr ( keys %pr_ids ) {
330 foreach my $neighbor( $self->sequence->neighbors( $pr ) ) {
331 return 0 if exists $pr_ids{$neighbor};
332 }
333 }
334 return 1;
335 }
336}
337
338# Return a list of the witnesses in which the reading appears.
339sub reading_witnesses {
340 my( $self, $reading ) = @_;
341 # We need only check either the incoming or the outgoing edges; I have
96dc90ec 342 # arbitrarily chosen "incoming". Thus, special-case the start node.
343 if( $reading eq $self->start ) {
344 return map { $_->sigil } $self->tradition->witnesses;
345 }
1d310495 346 my %all_witnesses;
347 foreach my $e ( $self->sequence->edges_to( $reading ) ) {
348 my $wits = $self->sequence->get_edge_attributes( @$e );
349 @all_witnesses{ keys %$wits } = 1;
350 }
351 return keys %all_witnesses;
910a0a6d 352}
353
3a2ebbf4 354sub related_readings {
355 my( $self, $reading, $colocated ) = @_;
a753cc84 356 my $return_object;
357 if( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
358 $reading = $reading->id;
359 $return_object = 1;
7035e3a6 360# print STDERR "Returning related objects\n";
361# } else {
362# print STDERR "Returning related object names\n";
a753cc84 363 }
3a2ebbf4 364 my @related = $self->relations->all_reachable( $reading );
365 if( $colocated ) {
366 my @colo = grep { $self->relations->has_edge_attribute( $reading, $_, 'colocated' ) } @related;
a753cc84 367 @related = @colo;
368 }
369 return $return_object ? map { $self->reading( $_ ) } @related : @related;
3a2ebbf4 370}
371
8e1394aa 372=head2 Output method(s)
373
374=over
375
376=item B<as_svg>
377
378print $graph->as_svg( $recalculate );
379
0068967c 380Returns an SVG string that represents the graph, via as_dot and graphviz.
8e1394aa 381
382=cut
383
384sub as_svg {
3a2ebbf4 385 my( $self ) = @_;
386
8e1394aa 387 my @cmd = qw/dot -Tsvg/;
388 my( $svg, $err );
910a0a6d 389 my $dotfile = File::Temp->new();
d9e873d0 390 ## TODO REMOVE
eca16057 391 # $dotfile->unlink_on_destroy(0);
910a0a6d 392 binmode $dotfile, ':utf8';
393 print $dotfile $self->as_dot();
394 push( @cmd, $dotfile->filename );
395 run( \@cmd, ">", binary(), \$svg );
396 $svg = decode_utf8( $svg );
8e1394aa 397 return $svg;
398}
399
df6d9812 400=item B<as_dot>
401
402print $graph->as_dot( $view, $recalculate );
403
404Returns a string that is the collation graph expressed in dot
405(i.e. GraphViz) format. The 'view' argument determines what kind of
406graph is produced.
407 * 'path': a graph of witness paths through the collation (DEFAULT)
408 * 'relationship': a graph of how collation readings relate to
409 each other
410
411=cut
412
413sub as_dot {
414 my( $self, $view ) = @_;
3a2ebbf4 415 $view = 'sequence' unless $view;
df6d9812 416 # TODO consider making some of these things configurable
67da8d6c 417 my $graph_name = $self->tradition->name;
418 $graph_name =~ s/[^\w\s]//g;
419 $graph_name = join( '_', split( /\s+/, $graph_name ) );
420 my $dot = sprintf( "digraph %s {\n", $graph_name );
df6d9812 421 $dot .= "\tedge [ arrowhead=open ];\n";
5f52ddca 422 $dot .= "\tgraph [ rankdir=LR,bgcolor=none ];\n";
df6d9812 423 $dot .= sprintf( "\tnode [ fontsize=%d, fillcolor=%s, style=%s, shape=%s ];\n",
3a2ebbf4 424 11, "white", "filled", "ellipse" );
df6d9812 425
426 foreach my $reading ( $self->readings ) {
910a0a6d 427 # Need not output nodes without separate labels
3a2ebbf4 428 next if $reading->id eq $reading->text;
8f9cab7b 429 my $label = $reading->text;
430 $label =~ s/\"/\\\"/g;
431 $dot .= sprintf( "\t\"%s\" [ label=\"%s\" ];\n", $reading->id, $label );
df6d9812 432 }
3a2ebbf4 433
434 # TODO do something sensible for relationships
df6d9812 435
3a2ebbf4 436 my @edges = $self->paths;
df6d9812 437 foreach my $edge ( @edges ) {
910a0a6d 438 my %variables = ( 'color' => '#000000',
439 'fontcolor' => '#000000',
8f9cab7b 440 'label' => join( ', ', $self->path_display_label( $edge ) ),
910a0a6d 441 );
442 my $varopts = join( ', ', map { $_.'="'.$variables{$_}.'"' } sort keys %variables );
81a4c6a0 443 # Account for the rank gap if necessary
444 my $rankgap = $self->reading( $edge->[1] )->rank
445 - $self->reading( $edge->[0] )->rank;
446 $varopts .= ", minlen=$rankgap" if $rankgap > 1;
910a0a6d 447 $dot .= sprintf( "\t\"%s\" -> \"%s\" [ %s ];\n",
3a2ebbf4 448 $edge->[0], $edge->[1], $varopts );
df6d9812 449 }
df6d9812 450 $dot .= "}\n";
451 return $dot;
452}
453
3a2ebbf4 454sub path_witnesses {
455 my( $self, @edge ) = @_;
456 # If edge is an arrayref, cope.
457 if( @edge == 1 && ref( $edge[0] ) eq 'ARRAY' ) {
458 my $e = shift @edge;
459 @edge = @$e;
460 }
461 my @wits = keys %{$self->sequence->get_edge_attributes( @edge )};
462 return sort @wits;
463}
464
8f9cab7b 465sub path_display_label {
466 my( $self, $edge ) = @_;
467 my @wits = $self->path_witnesses( $edge );
468 my $maj = scalar( $self->tradition->witnesses ) * 0.6;
469 if( scalar @wits > $maj ) {
470 return 'majority';
471 } else {
472 return join( ', ', @wits );
473 }
474}
475
476
8e1394aa 477=item B<as_graphml>
478
479print $graph->as_graphml( $recalculate )
480
481Returns a GraphML representation of the collation graph, with
482transposition information and position information. Unless
483$recalculate is passed (and is a true value), the method will return a
484cached copy of the SVG after the first call to the method.
485
486=cut
487
488sub as_graphml {
3a2ebbf4 489 my( $self ) = @_;
8e1394aa 490
491 # Some namespaces
492 my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns';
493 my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance';
494 my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' .
910a0a6d 495 'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
8e1394aa 496
497 # Create the document and root node
498 my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
499 my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
500 $graphml->setDocumentElement( $root );
501 $root->setNamespace( $xsi_ns, 'xsi', 0 );
502 $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
503
e309421a 504 # Add the data keys for the graph
505 my %graph_data_keys;
506 my $gdi = 0;
1d310495 507 my @graph_attributes = qw/ version wit_list_separator baselabel linear ac_label /;
e309421a 508 foreach my $datum ( @graph_attributes ) {
509 $graph_data_keys{$datum} = 'dg'.$gdi++;
510 my $key = $root->addNewChild( $graphml_ns, 'key' );
511 $key->setAttribute( 'attr.name', $datum );
512 $key->setAttribute( 'attr.type', $key eq 'linear' ? 'boolean' : 'string' );
513 $key->setAttribute( 'for', 'graph' );
514 $key->setAttribute( 'id', $graph_data_keys{$datum} );
515 }
f6066bac 516
8e1394aa 517 # Add the data keys for nodes
ef9d481f 518 my %node_data_keys;
519 my $ndi = 0;
3a2ebbf4 520 my %node_data = (
521 id => 'string',
255875b8 522 text => 'string',
3a2ebbf4 523 rank => 'string',
524 is_start => 'boolean',
525 is_end => 'boolean',
526 is_lacuna => 'boolean',
527 );
528 foreach my $datum ( keys %node_data ) {
910a0a6d 529 $node_data_keys{$datum} = 'dn'.$ndi++;
530 my $key = $root->addNewChild( $graphml_ns, 'key' );
531 $key->setAttribute( 'attr.name', $datum );
3a2ebbf4 532 $key->setAttribute( 'attr.type', $node_data{$datum} );
910a0a6d 533 $key->setAttribute( 'for', 'node' );
534 $key->setAttribute( 'id', $node_data_keys{$datum} );
8e1394aa 535 }
536
df6d9812 537 # Add the data keys for edges, i.e. witnesses
ef9d481f 538 my $edi = 0;
539 my %edge_data_keys;
3a2ebbf4 540 my %edge_data = (
3a2ebbf4 541 witness => 'string', # ID/label for a path
542 relationship => 'string', # ID/label for a relationship
543 extra => 'boolean', # Path key
544 colocated => 'boolean', # Relationship key
545 non_correctable => 'boolean', # Relationship key
546 non_independent => 'boolean', # Relationship key
547 );
548 foreach my $datum ( keys %edge_data ) {
549 $edge_data_keys{$datum} = 'de'.$edi++;
910a0a6d 550 my $key = $root->addNewChild( $graphml_ns, 'key' );
3a2ebbf4 551 $key->setAttribute( 'attr.name', $datum );
552 $key->setAttribute( 'attr.type', $edge_data{$datum} );
910a0a6d 553 $key->setAttribute( 'for', 'edge' );
3a2ebbf4 554 $key->setAttribute( 'id', $edge_data_keys{$datum} );
8e1394aa 555 }
3a2ebbf4 556
2c669bca 557 # Add the collation graphs themselves
558 my $sgraph = $root->addNewChild( $graphml_ns, 'graph' );
559 $sgraph->setAttribute( 'edgedefault', 'directed' );
560 $sgraph->setAttribute( 'id', $self->tradition->name );
561 $sgraph->setAttribute( 'parse.edgeids', 'canonical' );
562 $sgraph->setAttribute( 'parse.edges', scalar($self->paths) );
563 $sgraph->setAttribute( 'parse.nodeids', 'canonical' );
564 $sgraph->setAttribute( 'parse.nodes', scalar($self->readings) );
565 $sgraph->setAttribute( 'parse.order', 'nodesfirst' );
566
567 my $rgraph = $root->addNewChild( $graphml_ns, 'graph' );
568 $rgraph->setAttribute( 'edgedefault', 'undirected' );
569 $rgraph->setAttribute( 'id', 'relationships' );
570 $rgraph->setAttribute( 'parse.edgeids', 'canonical' );
571 $rgraph->setAttribute( 'parse.edges', scalar($self->relationships) );
572 $rgraph->setAttribute( 'parse.nodeids', 'canonical' );
573 $rgraph->setAttribute( 'parse.nodes', scalar($self->readings) );
574 $rgraph->setAttribute( 'parse.order', 'nodesfirst' );
e309421a 575
576 # Collation attribute data
577 foreach my $datum ( @graph_attributes ) {
2c669bca 578 my $value = $datum eq 'version' ? '3.0' : $self->$datum;
579 _add_graphml_data( $sgraph, $graph_data_keys{$datum}, $value );
e309421a 580 }
8e1394aa 581
582 my $node_ctr = 0;
583 my %node_hash;
2c669bca 584 # Add our readings to the graphs
3a2ebbf4 585 foreach my $n ( sort { $a->id cmp $b->id } $self->readings ) {
2c669bca 586 # Add to the main graph
587 my $node_el = $sgraph->addNewChild( $graphml_ns, 'node' );
910a0a6d 588 my $node_xmlid = 'n' . $node_ctr++;
3a2ebbf4 589 $node_hash{ $n->id } = $node_xmlid;
910a0a6d 590 $node_el->setAttribute( 'id', $node_xmlid );
255875b8 591 foreach my $d ( keys %node_data ) {
592 my $nval = $n->$d;
593 _add_graphml_data( $node_el, $node_data_keys{$d}, $nval )
594 if defined $nval;
595 }
2c669bca 596 # Add to the relationships graph
597 my $rnode_el = $rgraph->addNewChild( $graphml_ns, 'node' );
598 $rnode_el->setAttribute( 'id', $node_xmlid );
b15511bf 599 }
600
2c669bca 601 # Add the path edges to the sequence graph
df6d9812 602 my $edge_ctr = 0;
3a2ebbf4 603 foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->sequence->edges() ) {
604 # We add an edge in the graphml for every witness in $e.
605 foreach my $wit ( $self->path_witnesses( $e ) ) {
606 my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
607 $node_hash{ $e->[0] },
608 $node_hash{ $e->[1] } );
2c669bca 609 my $edge_el = $sgraph->addNewChild( $graphml_ns, 'edge' );
3a2ebbf4 610 $edge_el->setAttribute( 'source', $from );
611 $edge_el->setAttribute( 'target', $to );
612 $edge_el->setAttribute( 'id', $id );
3a2ebbf4 613
614 # It's a witness path, so add the witness
615 my $base = $wit;
616 my $key = $edge_data_keys{'witness'};
617 # Is this an ante-corr witness?
618 my $aclabel = $self->ac_label;
619 if( $wit =~ /^(.*)\Q$aclabel\E$/ ) {
620 # Keep the base witness
621 $base = $1;
622 # ...and record that this is an 'extra' reading path
623 _add_graphml_data( $edge_el, $edge_data_keys{'extra'}, $aclabel );
624 }
625 _add_graphml_data( $edge_el, $edge_data_keys{'witness'}, $base );
626 }
627 }
628
2c669bca 629 # Add the relationship edges to the relationships graph
3a2ebbf4 630 foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->relationships ) {
631 my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
632 $node_hash{ $e->[0] },
633 $node_hash{ $e->[1] } );
2c669bca 634 my $edge_el = $rgraph->addNewChild( $graphml_ns, 'edge' );
3a2ebbf4 635 $edge_el->setAttribute( 'source', $from );
636 $edge_el->setAttribute( 'target', $to );
637 $edge_el->setAttribute( 'id', $id );
3a2ebbf4 638
639 my $data = $self->relations->get_edge_attributes( @$e );
640 # It's a relationship, so save the relationship data
641 _add_graphml_data( $edge_el, $edge_data_keys{'relationship'}, $data->{type} );
642 _add_graphml_data( $edge_el, $edge_data_keys{'colocated'}, $data->{colocated} );
643 if( exists $data->{non_correctable} ) {
644 _add_graphml_data( $edge_el, $edge_data_keys{'non_correctable'},
645 $data->{non_correctable} );
646 }
647 if( exists $data->{non_independent} ) {
648 _add_graphml_data( $edge_el, $edge_data_keys{'non_independent'},
649 $data->{non_independent} );
650 }
8e1394aa 651 }
652
94c00c71 653 # Save and return the thing
654 my $result = decode_utf8( $graphml->toString(1) );
94c00c71 655 return $result;
df6d9812 656}
657
b15511bf 658sub _add_graphml_data {
659 my( $el, $key, $value ) = @_;
b15511bf 660 return unless defined $value;
c9bf3dbf 661 my $data_el = $el->addNewChild( $el->namespaceURI, 'data' );
b15511bf 662 $data_el->setAttribute( 'key', $key );
663 $data_el->appendText( $value );
8e1394aa 664}
665
910a0a6d 666=item B<as_csv>
667
668print $graph->as_csv( $recalculate )
669
670Returns a CSV alignment table representation of the collation graph, one
2c669bca 671row per witness (or witness uncorrected.)
910a0a6d 672
673=cut
674
675sub as_csv {
3a2ebbf4 676 my( $self ) = @_;
910a0a6d 677 my $table = $self->make_alignment_table;
678 my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } );
679 my @result;
2c669bca 680 # Make the header row
681 $csv->combine( map { $_->{'witness'} } @{$table->{'alignment'}} );
682 push( @result, decode_utf8( $csv->string ) );
683 # Make the rest of the rows
684 foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
566f4595 685 my @rowobjs = map { $_->{'tokens'}->[$idx] } @{$table->{'alignment'}};
2c669bca 686 my @row = map { $_ ? $_->{'t'} : $_ } @rowobjs;
687 $csv->combine( @row );
910a0a6d 688 push( @result, decode_utf8( $csv->string ) );
689 }
3a2ebbf4 690 return join( "\n", @result );
910a0a6d 691}
692
2c669bca 693=item B<make_alignment_table>
694
695my $table = $graph->make_alignment_table( $use_refs, \@wits_to_include )
696
566f4595 697Return a reference to an alignment table, in a slightly enhanced CollateX
698format which looks like this:
699
700 $table = { alignment => [ { witness => "SIGIL",
701 tokens => [ { t => "READINGTEXT" }, ... ] },
702 { witness => "SIG2",
703 tokens => [ { t => "READINGTEXT" }, ... ] },
704 ... ],
705 length => TEXTLEN };
706
707If $use_refs is set to 1, the reading object is returned in the table
708instead of READINGTEXT; if not, the text of the reading is returned.
709If $wits_to_include is set to a hashref, only the witnesses whose sigil
710keys have a true hash value will be included.
2c669bca 711
712=cut
9f3ba6f7 713
910a0a6d 714sub make_alignment_table {
08e0fb85 715 my( $self, $noderefs, $include ) = @_;
910a0a6d 716 unless( $self->linear ) {
717 warn "Need a linear graph in order to make an alignment table";
718 return;
719 }
2c669bca 720 my $table = { 'alignment' => [], 'length' => $self->end->rank - 1 };
3a2ebbf4 721 my @all_pos = ( 1 .. $self->end->rank - 1 );
68454b71 722 foreach my $wit ( sort { $a->sigil cmp $b->sigil } $self->tradition->witnesses ) {
2c669bca 723 if( $include ) {
566f4595 724 next unless $include->{$wit->sigil};
2c669bca 725 }
eca16057 726 # print STDERR "Making witness row(s) for " . $wit->sigil . "\n";
1f7aa795 727 my @wit_path = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
728 my @row = _make_witness_row( \@wit_path, \@all_pos, $noderefs );
2c669bca 729 push( @{$table->{'alignment'}},
730 { 'witness' => $wit->sigil, 'tokens' => \@row } );
1f7aa795 731 if( $wit->is_layered ) {
732 my @wit_ac_path = $self->reading_sequence( $self->start, $self->end,
733 $wit->sigil.$self->ac_label, $wit->sigil );
734 my @ac_row = _make_witness_row( \@wit_ac_path, \@all_pos, $noderefs );
2c669bca 735 push( @{$table->{'alignment'}},
736 { 'witness' => $wit->sigil.$self->ac_label, 'tokens' => \@ac_row } );
910a0a6d 737 }
738 }
2c669bca 739 return $table;
910a0a6d 740}
741
742sub _make_witness_row {
0e476982 743 my( $path, $positions, $noderefs ) = @_;
910a0a6d 744 my %char_hash;
745 map { $char_hash{$_} = undef } @$positions;
2c669bca 746 my $debug = 0;
910a0a6d 747 foreach my $rdg ( @$path ) {
eca16057 748 my $rtext = $rdg->text;
749 $rtext = '#LACUNA#' if $rdg->is_lacuna;
2c669bca 750 print STDERR "rank " . $rdg->rank . "\n" if $debug;
3a2ebbf4 751 # print STDERR "No rank for " . $rdg->id . "\n" unless defined $rdg->rank;
2c669bca 752 $char_hash{$rdg->rank} = $noderefs ? { 't' => $rdg }
753 : { 't' => $rtext };
910a0a6d 754 }
755 my @row = map { $char_hash{$_} } @$positions;
eca16057 756 # Fill in lacuna markers for undef spots in the row
757 my $last_el = shift @row;
758 my @filled_row = ( $last_el );
759 foreach my $el ( @row ) {
0e476982 760 # If we are using node reference, make the lacuna node appear many times
761 # in the table. If not, use the lacuna tag.
762 if( $last_el && _el_is_lacuna( $last_el ) && !defined $el ) {
566f4595 763 $el = $noderefs ? $last_el : { 't' => '#LACUNA#' };
eca16057 764 }
765 push( @filled_row, $el );
766 $last_el = $el;
767 }
768 return @filled_row;
910a0a6d 769}
770
0e476982 771# Tiny utility function to say if a table element is a lacuna
772sub _el_is_lacuna {
773 my $el = shift;
2c669bca 774 return 1 if $el->{'t'} eq '#LACUNA#';
775 return 1 if ref( $el->{'t'} ) eq 'Text::Tradition::Collation::Reading'
776 && $el->{'t'}->is_lacuna;
0e476982 777 return 0;
778}
779
910a0a6d 780# Helper to turn the witnesses along columns rather than rows. Assumes
781# equal-sized rows.
782sub _turn_table {
783 my( $table ) = @_;
784 my $result = [];
785 return $result unless scalar @$table;
786 my $nrows = scalar @{$table->[0]};
787 foreach my $idx ( 0 .. $nrows - 1 ) {
788 foreach my $wit ( 0 .. $#{$table} ) {
789 $result->[$idx]->[$wit] = $table->[$wit]->[$idx];
790 }
791 }
792 return $result;
793}
794
8e1394aa 795=back
796
de51424a 797=head2 Navigation methods
798
799=over
800
8e1394aa 801=item B<start>
802
803my $beginning = $collation->start();
804
805Returns the beginning of the collation, a meta-reading with label '#START#'.
806
910a0a6d 807=item B<end>
808
809my $end = $collation->end();
810
811Returns the end of the collation, a meta-reading with label '#END#'.
812
910a0a6d 813
e2902068 814=item B<reading_sequence>
815
816my @readings = $graph->reading_sequence( $first, $last, $path[, $alt_path] );
817
818Returns the ordered list of readings, starting with $first and ending
819with $last, along the given witness path. If no path is specified,
820assume that the path is that of the base text (if any.)
821
822=cut
823
910a0a6d 824# TODO Think about returning some lazy-eval iterator.
825
e2902068 826sub reading_sequence {
827 my( $self, $start, $end, $witness, $backup ) = @_;
828
930ff666 829 $witness = $self->baselabel unless $witness;
e2902068 830 my @readings = ( $start );
831 my %seen;
832 my $n = $start;
3a2ebbf4 833 while( $n && $n->id ne $end->id ) {
834 if( exists( $seen{$n->id} ) ) {
835 warn "Detected loop at " . $n->id;
910a0a6d 836 last;
837 }
3a2ebbf4 838 $seen{$n->id} = 1;
910a0a6d 839
840 my $next = $self->next_reading( $n, $witness, $backup );
44771cf2 841 unless( $next ) {
3a2ebbf4 842 warn "Did not find any path for $witness from reading " . $n->id;
44771cf2 843 last;
844 }
910a0a6d 845 push( @readings, $next );
846 $n = $next;
e2902068 847 }
848 # Check that the last reading is our end reading.
849 my $last = $readings[$#readings];
3a2ebbf4 850 warn "Last reading found from " . $start->text .
910a0a6d 851 " for witness $witness is not the end!"
3a2ebbf4 852 unless $last->id eq $end->id;
e2902068 853
854 return @readings;
855}
856
4a8828f0 857=item B<next_reading>
8e1394aa 858
4a8828f0 859my $next_reading = $graph->next_reading( $reading, $witpath );
8e1394aa 860
4a8828f0 861Returns the reading that follows the given reading along the given witness
930ff666 862path.
8e1394aa 863
864=cut
865
4a8828f0 866sub next_reading {
e2902068 867 # Return the successor via the corresponding path.
8e1394aa 868 my $self = shift;
3a2ebbf4 869 my $answer = $self->_find_linked_reading( 'next', @_ );
2c669bca 870 return undef unless $answer;
3a2ebbf4 871 return $self->reading( $answer );
8e1394aa 872}
873
4a8828f0 874=item B<prior_reading>
8e1394aa 875
4a8828f0 876my $prior_reading = $graph->prior_reading( $reading, $witpath );
8e1394aa 877
4a8828f0 878Returns the reading that precedes the given reading along the given witness
930ff666 879path.
8e1394aa 880
881=cut
882
4a8828f0 883sub prior_reading {
e2902068 884 # Return the predecessor via the corresponding path.
8e1394aa 885 my $self = shift;
3a2ebbf4 886 my $answer = $self->_find_linked_reading( 'prior', @_ );
887 return $self->reading( $answer );
8e1394aa 888}
889
4a8828f0 890sub _find_linked_reading {
e2902068 891 my( $self, $direction, $node, $path, $alt_path ) = @_;
892 my @linked_paths = $direction eq 'next'
3a2ebbf4 893 ? $self->sequence->edges_from( $node )
894 : $self->sequence->edges_to( $node );
e2902068 895 return undef unless scalar( @linked_paths );
8e1394aa 896
e2902068 897 # We have to find the linked path that contains all of the
898 # witnesses supplied in $path.
899 my( @path_wits, @alt_path_wits );
3a2ebbf4 900 @path_wits = sort( $self->witnesses_of_label( $path ) ) if $path;
901 @alt_path_wits = sort( $self->witnesses_of_label( $alt_path ) ) if $alt_path;
e2902068 902 my $base_le;
903 my $alt_le;
904 foreach my $le ( @linked_paths ) {
3a2ebbf4 905 if( $self->sequence->has_edge_attribute( @$le, $self->baselabel ) ) {
910a0a6d 906 $base_le = $le;
910a0a6d 907 }
3a2ebbf4 908 my @le_wits = $self->path_witnesses( $le );
909 if( _is_within( \@path_wits, \@le_wits ) ) {
910 # This is the right path.
911 return $direction eq 'next' ? $le->[1] : $le->[0];
912 } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) {
913 $alt_le = $le;
914 }
8e1394aa 915 }
e2902068 916 # Got this far? Return the alternate path if it exists.
3a2ebbf4 917 return $direction eq 'next' ? $alt_le->[1] : $alt_le->[0]
910a0a6d 918 if $alt_le;
e2902068 919
920 # Got this far? Return the base path if it exists.
3a2ebbf4 921 return $direction eq 'next' ? $base_le->[1] : $base_le->[0]
910a0a6d 922 if $base_le;
e2902068 923
924 # Got this far? We have no appropriate path.
2c669bca 925 warn "Could not find $direction node from " . $node->id
910a0a6d 926 . " along path $path";
8e1394aa 927 return undef;
928}
929
4a8828f0 930# Some set logic.
931sub _is_within {
932 my( $set1, $set2 ) = @_;
7854e12e 933 my $ret = @$set1; # will be 0, i.e. false, if set1 is empty
4a8828f0 934 foreach my $el ( @$set1 ) {
910a0a6d 935 $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
4a8828f0 936 }
937 return $ret;
938}
939
de51424a 940
941## INITIALIZATION METHODS - for use by parsers
930ff666 942
7e450e44 943# For use when a collation is constructed from a base text and an apparatus.
944# We have the sequences of readings and just need to add path edges.
1f7aa795 945# When we are done, clear out the witness path attributes, as they are no
946# longer needed.
947# TODO Find a way to replace the witness path attributes with encapsulated functions?
e2902068 948
6a222840 949sub make_witness_paths {
950 my( $self ) = @_;
910a0a6d 951 foreach my $wit ( $self->tradition->witnesses ) {
0068967c 952 # print STDERR "Making path for " . $wit->sigil . "\n";
910a0a6d 953 $self->make_witness_path( $wit );
7854e12e 954 }
7854e12e 955}
956
6a222840 957sub make_witness_path {
7854e12e 958 my( $self, $wit ) = @_;
959 my @chain = @{$wit->path};
15d2d3df 960 my $sig = $wit->sigil;
7854e12e 961 foreach my $idx ( 0 .. $#chain-1 ) {
910a0a6d 962 $self->add_path( $chain[$idx], $chain[$idx+1], $sig );
7854e12e 963 }
1f7aa795 964 if( $wit->is_layered ) {
d9e873d0 965 @chain = @{$wit->uncorrected_path};
966 foreach my $idx( 0 .. $#chain-1 ) {
967 my $source = $chain[$idx];
968 my $target = $chain[$idx+1];
969 $self->add_path( $source, $target, $sig.$self->ac_label )
970 unless $self->has_path( $source, $target, $sig );
971 }
15d2d3df 972 }
1f7aa795 973 $wit->clear_path;
974 $wit->clear_uncorrected_path;
e2902068 975}
976
910a0a6d 977sub calculate_ranks {
978 my $self = shift;
979 # Walk a version of the graph where every node linked by a relationship
980 # edge is fundamentally the same node, and do a topological ranking on
981 # the nodes in this graph.
c9bf3dbf 982 my $topo_graph = Graph->new();
910a0a6d 983 my %rel_containers;
984 my $rel_ctr = 0;
985 # Add the nodes
986 foreach my $r ( $self->readings ) {
3a2ebbf4 987 next if exists $rel_containers{$r->id};
910a0a6d 988 my @rels = $r->related_readings( 'colocated' );
989 if( @rels ) {
990 # Make a relationship container.
991 push( @rels, $r );
c9bf3dbf 992 my $rn = 'rel_container_' . $rel_ctr++;
993 $topo_graph->add_vertex( $rn );
910a0a6d 994 foreach( @rels ) {
3a2ebbf4 995 $rel_containers{$_->id} = $rn;
910a0a6d 996 }
997 } else {
998 # Add a new node to mirror the old node.
3a2ebbf4 999 $rel_containers{$r->id} = $r->id;
1000 $topo_graph->add_vertex( $r->id );
910a0a6d 1001 }
4a8828f0 1002 }
3a1f2523 1003
3a2ebbf4 1004 # Add the edges.
910a0a6d 1005 foreach my $r ( $self->readings ) {
3a2ebbf4 1006 foreach my $n ( $self->sequence->successors( $r->id ) ) {
1007 my( $tfrom, $tto ) = ( $rel_containers{$r->id},
1008 $rel_containers{$n} );
49d4f2ac 1009 $DB::single = 1 unless $tfrom && $tto;
3a2ebbf4 1010 $topo_graph->add_edge( $tfrom, $tto );
910a0a6d 1011 }
1012 }
1013
1014 # Now do the rankings, starting with the start node.
3a2ebbf4 1015 my $topo_start = $rel_containers{$self->start->id};
c9bf3dbf 1016 my $node_ranks = { $topo_start => 0 };
910a0a6d 1017 my @curr_origin = ( $topo_start );
1018 # A little iterative function.
1019 while( @curr_origin ) {
c9bf3dbf 1020 @curr_origin = _assign_rank( $topo_graph, $node_ranks, @curr_origin );
910a0a6d 1021 }
1022 # Transfer our rankings from the topological graph to the real one.
1023 foreach my $r ( $self->readings ) {
3a2ebbf4 1024 if( defined $node_ranks->{$rel_containers{$r->id}} ) {
1025 $r->rank( $node_ranks->{$rel_containers{$r->id}} );
67da8d6c 1026 } else {
1027 $DB::single = 1;
3a2ebbf4 1028 die "No rank calculated for node " . $r->id
67da8d6c 1029 . " - do you have a cycle in the graph?";
1030 }
de51424a 1031 }
8e1394aa 1032}
3a1f2523 1033
910a0a6d 1034sub _assign_rank {
c9bf3dbf 1035 my( $graph, $node_ranks, @current_nodes ) = @_;
910a0a6d 1036 # Look at each of the children of @current_nodes. If all the child's
1037 # parents have a rank, assign it the highest rank + 1 and add it to
c9bf3dbf 1038 # @next_nodes. Otherwise skip it; we will return when the highest-ranked
1039 # parent gets a rank.
910a0a6d 1040 my @next_nodes;
1041 foreach my $c ( @current_nodes ) {
c9bf3dbf 1042 warn "Current reading $c has no rank!"
1043 unless exists $node_ranks->{$c};
1044 # print STDERR "Looking at child of node $c, rank "
1045 # . $node_ranks->{$c} . "\n";
1046 foreach my $child ( $graph->successors( $c ) ) {
1047 next if exists $node_ranks->{$child};
910a0a6d 1048 my $highest_rank = -1;
1049 my $skip = 0;
c9bf3dbf 1050 foreach my $parent ( $graph->predecessors( $child ) ) {
1051 if( exists $node_ranks->{$parent} ) {
1052 $highest_rank = $node_ranks->{$parent}
1053 if $highest_rank <= $node_ranks->{$parent};
910a0a6d 1054 } else {
1055 $skip = 1;
1056 last;
1057 }
1058 }
1059 next if $skip;
c9bf3dbf 1060 my $c_rank = $highest_rank + 1;
1061 # print STDERR "Assigning rank $c_rank to node $child \n";
1062 $node_ranks->{$child} = $c_rank;
910a0a6d 1063 push( @next_nodes, $child );
1064 }
1065 }
1066 return @next_nodes;
4cdd82f1 1067}
910a0a6d 1068
0e476982 1069# Another method to make up for rough collation methods. If the same reading
1070# appears multiple times at the same rank, collapse the nodes.
1071sub flatten_ranks {
1072 my $self = shift;
1073 my %unique_rank_rdg;
1074 foreach my $rdg ( $self->readings ) {
1075 next unless $rdg->has_rank;
1076 my $key = $rdg->rank . "||" . $rdg->text;
1077 if( exists $unique_rank_rdg{$key} ) {
1078 # Combine!
0068967c 1079 # print STDERR "Combining readings at same rank: $key\n";
0e476982 1080 $self->merge_readings( $unique_rank_rdg{$key}, $rdg );
1081 } else {
1082 $unique_rank_rdg{$key} = $rdg;
1083 }
1084 }
1085}
1086
1087
fa954f4c 1088## Utility functions
de51424a 1089
4a8828f0 1090# Return the string that joins together a list of witnesses for
1091# display on a single path.
4a8828f0 1092sub witnesses_of_label {
de51424a 1093 my( $self, $label ) = @_;
4a8828f0 1094 my $regex = $self->wit_list_separator;
de51424a 1095 my @answer = split( /\Q$regex\E/, $label );
1096 return @answer;
4a8828f0 1097}
8e1394aa 1098
dd3b58b0 1099no Moose;
1100__PACKAGE__->meta->make_immutable;
e867486f 1101
1102=head1 BUGS / TODO
1103
1104=over
1105
0068967c 1106=item * Think about making Relationship objects again
e867486f 1107
1108=back