Merge branch 'master' into relationships
[scpubgit/stemmatology.git] / lib / Text / Tradition / Collation.pm
CommitLineData
dd3b58b0 1package Text::Tradition::Collation;
d047cd52 2
910a0a6d 3use Encode qw( decode_utf8 );
4use File::Temp;
c9bf3dbf 5use Graph;
8e1394aa 6use IPC::Run qw( run binary );
910a0a6d 7use Text::CSV_XS;
b15511bf 8use Text::Tradition::Collation::Reading;
df6d9812 9use XML::LibXML;
dd3b58b0 10use Moose;
11
3a2ebbf4 12has 'sequence' => (
d047cd52 13 is => 'ro',
3a2ebbf4 14 isa => 'Graph',
15 default => sub { Graph->new() },
d047cd52 16 handles => {
3a2ebbf4 17 paths => 'edges',
d047cd52 18 },
d047cd52 19 );
3a2ebbf4 20
21has 'relations' => (
22 is => 'ro',
23 isa => 'Graph',
24 default => sub { Graph->new( undirected => 1 ) },
25 handles => {
26 relationships => 'edges',
27 },
28 );
dd3b58b0 29
3a2ebbf4 30has 'tradition' => (
31 is => 'ro',
d047cd52 32 isa => 'Text::Tradition',
8d9a1cd8 33 weak_ref => 1,
d047cd52 34 );
dd3b58b0 35
3a2ebbf4 36has 'readings' => (
37 isa => 'HashRef[Text::Tradition::Collation::Reading]',
38 traits => ['Hash'],
39 handles => {
40 reading => 'get',
41 _add_reading => 'set',
42 del_reading => 'delete',
43 has_reading => 'exists',
44 readings => 'values',
45 },
46 default => sub { {} },
47 );
910a0a6d 48
4a8828f0 49has 'wit_list_separator' => (
7854e12e 50 is => 'rw',
51 isa => 'Str',
52 default => ', ',
53 );
54
55has 'baselabel' => (
56 is => 'rw',
57 isa => 'Str',
58 default => 'base text',
59 );
4a8828f0 60
15d2d3df 61has 'linear' => (
62 is => 'rw',
63 isa => 'Bool',
64 default => 1,
65 );
1f563ac3 66
ef9d481f 67has 'ac_label' => (
68 is => 'rw',
69 isa => 'Str',
70 default => ' (a.c.)',
71 );
3a2ebbf4 72
73has 'start' => (
74 is => 'ro',
75 isa => 'Text::Tradition::Collation::Reading',
76 writer => '_set_start',
77 weak_ref => 1,
78 );
79
80has 'end' => (
81 is => 'ro',
82 isa => 'Text::Tradition::Collation::Reading',
83 writer => '_set_end',
84 weak_ref => 1,
85 );
1f563ac3 86
dd3b58b0 87# The collation can be created two ways:
88# 1. Collate a set of witnesses (with CollateX I guess) and process
89# the results as in 2.
90# 2. Read a pre-prepared collation in one of a variety of formats,
91# and make the graph from that.
92
93# The graph itself will (for now) be immutable, and the positions
94# within the graph will also be immutable. We need to calculate those
95# positions upon graph construction. The equivalences between graph
96# nodes will be mutable, entirely determined by the user (or possibly
97# by some semantic pre-processing provided by the user.) So the
98# constructor should just make an empty equivalences object. The
99# constructor will also need to make the witness objects, if we didn't
100# come through option 1.
101
d047cd52 102sub BUILD {
3a2ebbf4 103 my $self = shift;
104 $self->_set_start( $self->add_reading( { 'collation' => $self, 'is_start' => 1 } ) );
105 $self->_set_end( $self->add_reading( { 'collation' => $self, 'is_end' => 1 } ) );
d047cd52 106}
784877d9 107
3a2ebbf4 108### Reading construct/destruct functions
109
110sub add_reading {
111 my( $self, $reading ) = @_;
112 unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
113 my %args = %$reading;
114 $reading = Text::Tradition::Collation::Reading->new(
115 'collation' => $self,
116 %args );
117 }
118 # First check to see if a reading with this ID exists.
119 if( $self->reading( $reading->id ) ) {
120 warn "Collation already has a reading with id " . $reading->id;
121 return undef;
122 }
123 $self->_add_reading( $reading->id => $reading );
124 # Once the reading has been added, put it in both graphs.
125 $self->sequence->add_vertex( $reading->id );
126 $self->relations->add_vertex( $reading->id );
127 return $reading;
eca16057 128};
129
3a2ebbf4 130around del_reading => sub {
131 my $orig = shift;
132 my $self = shift;
133 my $arg = shift;
134
135 if( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
136 $arg = $arg->id;
137 }
3a2ebbf4 138 # Remove the reading from the graphs.
139 $self->sequence->delete_vertex( $arg );
140 $self->relations->delete_vertex( $arg );
141
142 # Carry on.
143 $self->$orig( $arg );
144};
7854e12e 145
3a2ebbf4 146# merge_readings( $main, $to_be_deleted );
7854e12e 147
3a2ebbf4 148sub merge_readings {
149 my $self = shift;
150
151 # We only need the IDs for adding paths to the graph, not the reading
152 # objects themselves.
49d4f2ac 153 my( $kept, $deleted, $combine_char ) = $self->_stringify_args( @_ );
3a2ebbf4 154
155 # The kept reading should inherit the paths and the relationships
156 # of the deleted reading.
157 foreach my $path ( $self->sequence->edges_at( $deleted ) ) {
158 my @vector = ( $kept );
159 push( @vector, $path->[1] ) if $path->[0] eq $deleted;
160 unshift( @vector, $path->[0] ) if $path->[1] eq $deleted;
49d4f2ac 161 next if $vector[0] eq $vector[1]; # Don't add a self loop
3a2ebbf4 162 my %wits = %{$self->sequence->get_edge_attributes( @$path )};
163 $self->sequence->add_edge( @vector );
164 my $fwits = $self->sequence->get_edge_attributes( @vector );
165 @wits{keys %$fwits} = values %$fwits;
166 $self->sequence->set_edge_attributes( @vector, \%wits );
167 }
168 foreach my $rel ( $self->relations->edges_at( $deleted ) ) {
169 my @vector = ( $kept );
170 push( @vector, $rel->[0] eq $deleted ? $rel->[1] : $rel->[0] );
49d4f2ac 171 next if $vector[0] eq $vector[1]; # Don't add a self loop
3a2ebbf4 172 # Is there a relationship here already? If so, keep it.
173 # TODO Warn about conflicting relationships
174 next if $self->relations->has_edge( @vector );
175 # If not, adopt the relationship that would be deleted.
176 $self->relations->add_edge( @vector );
177 my $attr = $self->relations->get_edge_attributes( @$rel );
178 $self->relations->set_edge_attributes( @vector, $attr );
179 }
180
181 # Do the deletion deed.
49d4f2ac 182 if( $combine_char ) {
183 my $kept_obj = $self->reading( $kept );
184 my $new_text = join( $combine_char, $kept_obj->text,
185 $self->reading( $deleted )->text );
186 $kept_obj->alter_text( $new_text );
187 }
3a2ebbf4 188 $self->del_reading( $deleted );
189}
7854e12e 190
3265b0ce 191
3a2ebbf4 192# Helper function for manipulating the graph.
193sub _stringify_args {
194 my( $self, $first, $second, $arg ) = @_;
195 $first = $first->id
196 if ref( $first ) eq 'Text::Tradition::Collation::Reading';
197 $second = $second->id
198 if ref( $second ) eq 'Text::Tradition::Collation::Reading';
199 return( $first, $second, $arg );
200}
df6d9812 201
3a2ebbf4 202### Path logic
203
204sub add_path {
205 my $self = shift;
206
207 # We only need the IDs for adding paths to the graph, not the reading
208 # objects themselves.
209 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
210
211 # Connect the readings
212 $self->sequence->add_edge( $source, $target );
213 # Note the witness in question
214 $self->sequence->set_edge_attribute( $source, $target, $wit, 1 );
b15511bf 215};
216
3a2ebbf4 217sub del_path {
218 my $self = shift;
49d4f2ac 219 my @args;
220 if( ref( $_[0] ) eq 'ARRAY' ) {
221 my $e = shift @_;
222 @args = ( @$e, @_ );
223 } else {
224 @args = @_;
225 }
3a2ebbf4 226
227 # We only need the IDs for adding paths to the graph, not the reading
228 # objects themselves.
49d4f2ac 229 my( $source, $target, $wit ) = $self->_stringify_args( @args );
3a2ebbf4 230
231 if( $self->sequence->has_edge_attribute( $source, $target, $wit ) ) {
49d4f2ac 232 $self->sequence->delete_edge_attribute( $source, $target, $wit );
3a2ebbf4 233 }
234 unless( keys %{$self->sequence->get_edge_attributes( $source, $target )} ) {
235 $self->sequence->delete_edge( $source, $target );
236 }
784877d9 237}
238
3a2ebbf4 239
15d2d3df 240# Extra graph-alike utility
241sub has_path {
3a2ebbf4 242 my $self = shift;
243 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
244 return undef unless $self->sequence->has_edge( $source, $target );
245 return $self->sequence->has_edge_attribute( $source, $target, $wit );
b15511bf 246}
247
3a2ebbf4 248### Relationship logic
3265b0ce 249
3a2ebbf4 250=head2 add_relationship( $reading1, $reading2, $definition )
251
252Adds the specified relationship between the two readings. A relationship
253is transitive (i.e. undirected), and must have the following attributes
254specified in the hashref $definition:
255
256=over 4
257
ad1291ee 258=item * type - Can be one of spelling, orthographic, grammatical, meaning, lexical, collated, repetition, transposition. All but the last two are only valid relationships between readings that occur at the same point in the text.
3a2ebbf4 259
260=item * non_correctable - (Optional) True if the reading would not have been corrected independently.
ef9d481f 261
3a2ebbf4 262=item * non_independent - (Optional) True if the variant is unlikely to have occurred independently in unrelated witnesses.
263
264=item * global - (Optional) A meta-attribute, to set the same relationship between readings with the same text whenever they occur in the same place.
265
266=back
267
268=cut
269
270# Wouldn't it be lovely if edges could be objects, and all this type checking
271# and attribute management could be done via Moose?
272
273sub add_relationship {
274 my $self = shift;
275 my( $source, $target, $options ) = $self->_stringify_args( @_ );
276
277 # Check the options
278 if( !defined $options->{'type'} ||
131fa659 279 $options->{'type'} !~ /^(spelling|orthographic|grammatical|meaning|lexical|collated|repetition|transposition)$/i ) {
3a2ebbf4 280 my $t = $options->{'type'} ? $options->{'type'} : '';
49d4f2ac 281 return( undef, "Invalid or missing type " . $options->{'type'} );
3a2ebbf4 282 }
49d4f2ac 283 unless ( $options->{'type'} =~ /^(repetition|transposition)$/ ) {
3a2ebbf4 284 $options->{'colocated'} = 1;
285 }
286
ef9d481f 287 # Make sure there is not another relationship between these two
94c00c71 288 # readings already
3a2ebbf4 289 if( $self->relations->has_edge( $source, $target ) ) {
290 return ( undef, "Relationship already exists between these readings" );
4cdd82f1 291 }
1d310495 292 if( !$self->relationship_valid( $source, $target, $options->{'type'} ) ) {
910a0a6d 293 return ( undef, 'Relationship creates witness loop' );
ef9d481f 294 }
295
3a2ebbf4 296 my @vector = ( $source, $target );
297 $self->relations->add_edge( @vector );
298 $self->relations->set_edge_attributes( @vector, $options );
910a0a6d 299
300 # TODO Handle global relationship setting
301
3a2ebbf4 302 return( 1, @vector );
3265b0ce 303}
304
910a0a6d 305sub relationship_valid {
1d310495 306 my( $self, $source, $target, $rel ) = @_;
307 if( $rel eq 'repetition' ) {
308 return 1;
309 } elsif ( $rel eq 'transposition' ) {
310 # Check that the two readings do not appear in the same witness.
311 my %seen_wits;
312 map { $seen_wits{$_} = 1 } $self->reading_witnesses( $source );
313 foreach my $w ( $self->reading_witnesses( $target ) ) {
314 return 0 if $seen_wits{$w};
315 }
316 return 1;
317 } else {
318 # Check that linking the source and target in a relationship won't lead
319 # to a path loop for any witness. First make a lookup table of all the
320 # readings related to either the source or the target.
321 my @proposed_related = ( $source, $target );
49d4f2ac 322 push( @proposed_related, $self->related_readings( $source, 'colocated' ) );
323 push( @proposed_related, $self->related_readings( $target, 'colocated' ) );
1d310495 324 my %pr_ids;
49d4f2ac 325 map { $pr_ids{ $_ } = 1 } @proposed_related;
1d310495 326
327 # None of these proposed related readings should have a neighbor that
328 # is also in proposed_related.
329 foreach my $pr ( keys %pr_ids ) {
330 foreach my $neighbor( $self->sequence->neighbors( $pr ) ) {
331 return 0 if exists $pr_ids{$neighbor};
332 }
333 }
334 return 1;
335 }
336}
337
338# Return a list of the witnesses in which the reading appears.
339sub reading_witnesses {
340 my( $self, $reading ) = @_;
341 # We need only check either the incoming or the outgoing edges; I have
96dc90ec 342 # arbitrarily chosen "incoming". Thus, special-case the start node.
343 if( $reading eq $self->start ) {
344 return map { $_->sigil } $self->tradition->witnesses;
345 }
1d310495 346 my %all_witnesses;
347 foreach my $e ( $self->sequence->edges_to( $reading ) ) {
348 my $wits = $self->sequence->get_edge_attributes( @$e );
349 @all_witnesses{ keys %$wits } = 1;
350 }
351 return keys %all_witnesses;
910a0a6d 352}
353
3a2ebbf4 354sub related_readings {
355 my( $self, $reading, $colocated ) = @_;
a753cc84 356 my $return_object;
357 if( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
358 $reading = $reading->id;
359 $return_object = 1;
7035e3a6 360# print STDERR "Returning related objects\n";
361# } else {
362# print STDERR "Returning related object names\n";
a753cc84 363 }
3a2ebbf4 364 my @related = $self->relations->all_reachable( $reading );
365 if( $colocated ) {
366 my @colo = grep { $self->relations->has_edge_attribute( $reading, $_, 'colocated' ) } @related;
a753cc84 367 @related = @colo;
368 }
369 return $return_object ? map { $self->reading( $_ ) } @related : @related;
3a2ebbf4 370}
371
8e1394aa 372=head2 Output method(s)
373
374=over
375
376=item B<as_svg>
377
378print $graph->as_svg( $recalculate );
379
0068967c 380Returns an SVG string that represents the graph, via as_dot and graphviz.
8e1394aa 381
382=cut
383
384sub as_svg {
3a2ebbf4 385 my( $self ) = @_;
386
8e1394aa 387 my @cmd = qw/dot -Tsvg/;
388 my( $svg, $err );
910a0a6d 389 my $dotfile = File::Temp->new();
d9e873d0 390 ## TODO REMOVE
eca16057 391 # $dotfile->unlink_on_destroy(0);
910a0a6d 392 binmode $dotfile, ':utf8';
393 print $dotfile $self->as_dot();
394 push( @cmd, $dotfile->filename );
395 run( \@cmd, ">", binary(), \$svg );
396 $svg = decode_utf8( $svg );
8e1394aa 397 return $svg;
398}
399
df6d9812 400=item B<as_dot>
401
402print $graph->as_dot( $view, $recalculate );
403
404Returns a string that is the collation graph expressed in dot
405(i.e. GraphViz) format. The 'view' argument determines what kind of
406graph is produced.
407 * 'path': a graph of witness paths through the collation (DEFAULT)
408 * 'relationship': a graph of how collation readings relate to
409 each other
410
411=cut
412
413sub as_dot {
414 my( $self, $view ) = @_;
3a2ebbf4 415 $view = 'sequence' unless $view;
df6d9812 416 # TODO consider making some of these things configurable
67da8d6c 417 my $graph_name = $self->tradition->name;
418 $graph_name =~ s/[^\w\s]//g;
419 $graph_name = join( '_', split( /\s+/, $graph_name ) );
420 my $dot = sprintf( "digraph %s {\n", $graph_name );
df6d9812 421 $dot .= "\tedge [ arrowhead=open ];\n";
5f52ddca 422 $dot .= "\tgraph [ rankdir=LR,bgcolor=none ];\n";
df6d9812 423 $dot .= sprintf( "\tnode [ fontsize=%d, fillcolor=%s, style=%s, shape=%s ];\n",
3a2ebbf4 424 11, "white", "filled", "ellipse" );
df6d9812 425
426 foreach my $reading ( $self->readings ) {
910a0a6d 427 # Need not output nodes without separate labels
3a2ebbf4 428 next if $reading->id eq $reading->text;
8f9cab7b 429 my $label = $reading->text;
430 $label =~ s/\"/\\\"/g;
431 $dot .= sprintf( "\t\"%s\" [ label=\"%s\" ];\n", $reading->id, $label );
df6d9812 432 }
3a2ebbf4 433
434 # TODO do something sensible for relationships
df6d9812 435
3a2ebbf4 436 my @edges = $self->paths;
df6d9812 437 foreach my $edge ( @edges ) {
910a0a6d 438 my %variables = ( 'color' => '#000000',
439 'fontcolor' => '#000000',
8f9cab7b 440 'label' => join( ', ', $self->path_display_label( $edge ) ),
910a0a6d 441 );
442 my $varopts = join( ', ', map { $_.'="'.$variables{$_}.'"' } sort keys %variables );
81a4c6a0 443 # Account for the rank gap if necessary
444 my $rankgap = $self->reading( $edge->[1] )->rank
445 - $self->reading( $edge->[0] )->rank;
446 $varopts .= ", minlen=$rankgap" if $rankgap > 1;
910a0a6d 447 $dot .= sprintf( "\t\"%s\" -> \"%s\" [ %s ];\n",
3a2ebbf4 448 $edge->[0], $edge->[1], $varopts );
df6d9812 449 }
df6d9812 450 $dot .= "}\n";
451 return $dot;
452}
453
3a2ebbf4 454sub path_witnesses {
455 my( $self, @edge ) = @_;
456 # If edge is an arrayref, cope.
457 if( @edge == 1 && ref( $edge[0] ) eq 'ARRAY' ) {
458 my $e = shift @edge;
459 @edge = @$e;
460 }
461 my @wits = keys %{$self->sequence->get_edge_attributes( @edge )};
462 return sort @wits;
463}
464
8f9cab7b 465sub path_display_label {
466 my( $self, $edge ) = @_;
467 my @wits = $self->path_witnesses( $edge );
468 my $maj = scalar( $self->tradition->witnesses ) * 0.6;
469 if( scalar @wits > $maj ) {
470 return 'majority';
471 } else {
472 return join( ', ', @wits );
473 }
474}
475
476
8e1394aa 477=item B<as_graphml>
478
479print $graph->as_graphml( $recalculate )
480
481Returns a GraphML representation of the collation graph, with
482transposition information and position information. Unless
483$recalculate is passed (and is a true value), the method will return a
484cached copy of the SVG after the first call to the method.
485
486=cut
487
488sub as_graphml {
3a2ebbf4 489 my( $self ) = @_;
8e1394aa 490
491 # Some namespaces
492 my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns';
493 my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance';
494 my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' .
910a0a6d 495 'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
8e1394aa 496
497 # Create the document and root node
498 my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
499 my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
500 $graphml->setDocumentElement( $root );
501 $root->setNamespace( $xsi_ns, 'xsi', 0 );
502 $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
503
e309421a 504 # Add the data keys for the graph
505 my %graph_data_keys;
506 my $gdi = 0;
1d310495 507 my @graph_attributes = qw/ version wit_list_separator baselabel linear ac_label /;
e309421a 508 foreach my $datum ( @graph_attributes ) {
509 $graph_data_keys{$datum} = 'dg'.$gdi++;
510 my $key = $root->addNewChild( $graphml_ns, 'key' );
511 $key->setAttribute( 'attr.name', $datum );
512 $key->setAttribute( 'attr.type', $key eq 'linear' ? 'boolean' : 'string' );
513 $key->setAttribute( 'for', 'graph' );
514 $key->setAttribute( 'id', $graph_data_keys{$datum} );
515 }
f6066bac 516
8e1394aa 517 # Add the data keys for nodes
ef9d481f 518 my %node_data_keys;
519 my $ndi = 0;
3a2ebbf4 520 my %node_data = (
521 id => 'string',
255875b8 522 text => 'string',
3a2ebbf4 523 rank => 'string',
524 is_start => 'boolean',
525 is_end => 'boolean',
526 is_lacuna => 'boolean',
527 );
528 foreach my $datum ( keys %node_data ) {
910a0a6d 529 $node_data_keys{$datum} = 'dn'.$ndi++;
530 my $key = $root->addNewChild( $graphml_ns, 'key' );
531 $key->setAttribute( 'attr.name', $datum );
3a2ebbf4 532 $key->setAttribute( 'attr.type', $node_data{$datum} );
910a0a6d 533 $key->setAttribute( 'for', 'node' );
534 $key->setAttribute( 'id', $node_data_keys{$datum} );
8e1394aa 535 }
536
df6d9812 537 # Add the data keys for edges, i.e. witnesses
ef9d481f 538 my $edi = 0;
539 my %edge_data_keys;
3a2ebbf4 540 my %edge_data = (
f523c7a8 541 class => 'string', # Class, deprecated soon
3a2ebbf4 542 witness => 'string', # ID/label for a path
543 relationship => 'string', # ID/label for a relationship
544 extra => 'boolean', # Path key
545 colocated => 'boolean', # Relationship key
546 non_correctable => 'boolean', # Relationship key
547 non_independent => 'boolean', # Relationship key
548 );
549 foreach my $datum ( keys %edge_data ) {
550 $edge_data_keys{$datum} = 'de'.$edi++;
910a0a6d 551 my $key = $root->addNewChild( $graphml_ns, 'key' );
3a2ebbf4 552 $key->setAttribute( 'attr.name', $datum );
553 $key->setAttribute( 'attr.type', $edge_data{$datum} );
910a0a6d 554 $key->setAttribute( 'for', 'edge' );
3a2ebbf4 555 $key->setAttribute( 'id', $edge_data_keys{$datum} );
8e1394aa 556 }
3a2ebbf4 557
2c669bca 558 # Add the collation graphs themselves
559 my $sgraph = $root->addNewChild( $graphml_ns, 'graph' );
560 $sgraph->setAttribute( 'edgedefault', 'directed' );
561 $sgraph->setAttribute( 'id', $self->tradition->name );
562 $sgraph->setAttribute( 'parse.edgeids', 'canonical' );
563 $sgraph->setAttribute( 'parse.edges', scalar($self->paths) );
564 $sgraph->setAttribute( 'parse.nodeids', 'canonical' );
565 $sgraph->setAttribute( 'parse.nodes', scalar($self->readings) );
566 $sgraph->setAttribute( 'parse.order', 'nodesfirst' );
567
f523c7a8 568 my $rgraph;
569 if( scalar $self->relationships ) {
570 my $rgraph = $root->addNewChild( $graphml_ns, 'graph' );
571 $rgraph->setAttribute( 'edgedefault', 'undirected' );
572 $rgraph->setAttribute( 'id', 'relationships' );
573 $rgraph->setAttribute( 'parse.edgeids', 'canonical' );
574 $rgraph->setAttribute( 'parse.edges', scalar($self->relationships) );
575 $rgraph->setAttribute( 'parse.nodeids', 'canonical' );
576 $rgraph->setAttribute( 'parse.nodes', scalar($self->readings) );
577 $rgraph->setAttribute( 'parse.order', 'nodesfirst' );
578 }
579
e309421a 580 # Collation attribute data
581 foreach my $datum ( @graph_attributes ) {
2c669bca 582 my $value = $datum eq 'version' ? '3.0' : $self->$datum;
583 _add_graphml_data( $sgraph, $graph_data_keys{$datum}, $value );
e309421a 584 }
8e1394aa 585
586 my $node_ctr = 0;
587 my %node_hash;
2c669bca 588 # Add our readings to the graphs
3a2ebbf4 589 foreach my $n ( sort { $a->id cmp $b->id } $self->readings ) {
2c669bca 590 # Add to the main graph
591 my $node_el = $sgraph->addNewChild( $graphml_ns, 'node' );
910a0a6d 592 my $node_xmlid = 'n' . $node_ctr++;
3a2ebbf4 593 $node_hash{ $n->id } = $node_xmlid;
910a0a6d 594 $node_el->setAttribute( 'id', $node_xmlid );
255875b8 595 foreach my $d ( keys %node_data ) {
596 my $nval = $n->$d;
597 _add_graphml_data( $node_el, $node_data_keys{$d}, $nval )
598 if defined $nval;
599 }
2c669bca 600 # Add to the relationships graph
f523c7a8 601 if( $rgraph ) {
602 my $rnode_el = $rgraph->addNewChild( $graphml_ns, 'node' );
603 $rnode_el->setAttribute( 'id', $node_xmlid );
604 }
b15511bf 605 }
606
2c669bca 607 # Add the path edges to the sequence graph
df6d9812 608 my $edge_ctr = 0;
3a2ebbf4 609 foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->sequence->edges() ) {
610 # We add an edge in the graphml for every witness in $e.
611 foreach my $wit ( $self->path_witnesses( $e ) ) {
612 my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
613 $node_hash{ $e->[0] },
614 $node_hash{ $e->[1] } );
2c669bca 615 my $edge_el = $sgraph->addNewChild( $graphml_ns, 'edge' );
3a2ebbf4 616 $edge_el->setAttribute( 'source', $from );
617 $edge_el->setAttribute( 'target', $to );
618 $edge_el->setAttribute( 'id', $id );
3a2ebbf4 619
620 # It's a witness path, so add the witness
621 my $base = $wit;
622 my $key = $edge_data_keys{'witness'};
623 # Is this an ante-corr witness?
624 my $aclabel = $self->ac_label;
625 if( $wit =~ /^(.*)\Q$aclabel\E$/ ) {
626 # Keep the base witness
627 $base = $1;
628 # ...and record that this is an 'extra' reading path
629 _add_graphml_data( $edge_el, $edge_data_keys{'extra'}, $aclabel );
630 }
631 _add_graphml_data( $edge_el, $edge_data_keys{'witness'}, $base );
f523c7a8 632 _add_graphml_data( $edge_el, $edge_data_keys{'class'}, 'path' );
3a2ebbf4 633 }
634 }
635
2c669bca 636 # Add the relationship edges to the relationships graph
f523c7a8 637 if( $rgraph ) {
638 foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->relationships ) {
639 my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
640 $node_hash{ $e->[0] },
641 $node_hash{ $e->[1] } );
642 my $edge_el = $rgraph->addNewChild( $graphml_ns, 'edge' );
643 $edge_el->setAttribute( 'source', $from );
644 $edge_el->setAttribute( 'target', $to );
645 $edge_el->setAttribute( 'id', $id );
646
647 my $data = $self->relations->get_edge_attributes( @$e );
648 # It's a relationship, so save the relationship data
649 _add_graphml_data( $edge_el, $edge_data_keys{'relationship'}, $data->{type} );
650 _add_graphml_data( $edge_el, $edge_data_keys{'colocated'}, $data->{colocated} );
651 if( exists $data->{non_correctable} ) {
652 _add_graphml_data( $edge_el, $edge_data_keys{'non_correctable'},
653 $data->{non_correctable} );
654 }
655 if( exists $data->{non_independent} ) {
656 _add_graphml_data( $edge_el, $edge_data_keys{'non_independent'},
657 $data->{non_independent} );
658 }
659 _add_graphml_data( $edge_el, $edge_data_keys{'class'}, 'relationship' );
3a2ebbf4 660 }
8e1394aa 661 }
662
94c00c71 663 # Save and return the thing
664 my $result = decode_utf8( $graphml->toString(1) );
94c00c71 665 return $result;
df6d9812 666}
667
b15511bf 668sub _add_graphml_data {
669 my( $el, $key, $value ) = @_;
b15511bf 670 return unless defined $value;
c9bf3dbf 671 my $data_el = $el->addNewChild( $el->namespaceURI, 'data' );
b15511bf 672 $data_el->setAttribute( 'key', $key );
673 $data_el->appendText( $value );
8e1394aa 674}
675
910a0a6d 676=item B<as_csv>
677
678print $graph->as_csv( $recalculate )
679
680Returns a CSV alignment table representation of the collation graph, one
2c669bca 681row per witness (or witness uncorrected.)
910a0a6d 682
683=cut
684
685sub as_csv {
3a2ebbf4 686 my( $self ) = @_;
910a0a6d 687 my $table = $self->make_alignment_table;
688 my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } );
689 my @result;
2c669bca 690 # Make the header row
691 $csv->combine( map { $_->{'witness'} } @{$table->{'alignment'}} );
692 push( @result, decode_utf8( $csv->string ) );
693 # Make the rest of the rows
694 foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
566f4595 695 my @rowobjs = map { $_->{'tokens'}->[$idx] } @{$table->{'alignment'}};
2c669bca 696 my @row = map { $_ ? $_->{'t'} : $_ } @rowobjs;
697 $csv->combine( @row );
910a0a6d 698 push( @result, decode_utf8( $csv->string ) );
699 }
3a2ebbf4 700 return join( "\n", @result );
910a0a6d 701}
702
2c669bca 703=item B<make_alignment_table>
704
705my $table = $graph->make_alignment_table( $use_refs, \@wits_to_include )
706
566f4595 707Return a reference to an alignment table, in a slightly enhanced CollateX
708format which looks like this:
709
710 $table = { alignment => [ { witness => "SIGIL",
711 tokens => [ { t => "READINGTEXT" }, ... ] },
712 { witness => "SIG2",
713 tokens => [ { t => "READINGTEXT" }, ... ] },
714 ... ],
715 length => TEXTLEN };
716
717If $use_refs is set to 1, the reading object is returned in the table
718instead of READINGTEXT; if not, the text of the reading is returned.
719If $wits_to_include is set to a hashref, only the witnesses whose sigil
720keys have a true hash value will be included.
2c669bca 721
722=cut
9f3ba6f7 723
910a0a6d 724sub make_alignment_table {
08e0fb85 725 my( $self, $noderefs, $include ) = @_;
910a0a6d 726 unless( $self->linear ) {
727 warn "Need a linear graph in order to make an alignment table";
728 return;
729 }
2c669bca 730 my $table = { 'alignment' => [], 'length' => $self->end->rank - 1 };
3a2ebbf4 731 my @all_pos = ( 1 .. $self->end->rank - 1 );
68454b71 732 foreach my $wit ( sort { $a->sigil cmp $b->sigil } $self->tradition->witnesses ) {
2c669bca 733 if( $include ) {
566f4595 734 next unless $include->{$wit->sigil};
2c669bca 735 }
eca16057 736 # print STDERR "Making witness row(s) for " . $wit->sigil . "\n";
1f7aa795 737 my @wit_path = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
738 my @row = _make_witness_row( \@wit_path, \@all_pos, $noderefs );
2c669bca 739 push( @{$table->{'alignment'}},
740 { 'witness' => $wit->sigil, 'tokens' => \@row } );
1f7aa795 741 if( $wit->is_layered ) {
742 my @wit_ac_path = $self->reading_sequence( $self->start, $self->end,
743 $wit->sigil.$self->ac_label, $wit->sigil );
744 my @ac_row = _make_witness_row( \@wit_ac_path, \@all_pos, $noderefs );
2c669bca 745 push( @{$table->{'alignment'}},
746 { 'witness' => $wit->sigil.$self->ac_label, 'tokens' => \@ac_row } );
910a0a6d 747 }
748 }
2c669bca 749 return $table;
910a0a6d 750}
751
752sub _make_witness_row {
0e476982 753 my( $path, $positions, $noderefs ) = @_;
910a0a6d 754 my %char_hash;
755 map { $char_hash{$_} = undef } @$positions;
2c669bca 756 my $debug = 0;
910a0a6d 757 foreach my $rdg ( @$path ) {
eca16057 758 my $rtext = $rdg->text;
759 $rtext = '#LACUNA#' if $rdg->is_lacuna;
2c669bca 760 print STDERR "rank " . $rdg->rank . "\n" if $debug;
3a2ebbf4 761 # print STDERR "No rank for " . $rdg->id . "\n" unless defined $rdg->rank;
2c669bca 762 $char_hash{$rdg->rank} = $noderefs ? { 't' => $rdg }
763 : { 't' => $rtext };
910a0a6d 764 }
765 my @row = map { $char_hash{$_} } @$positions;
eca16057 766 # Fill in lacuna markers for undef spots in the row
767 my $last_el = shift @row;
768 my @filled_row = ( $last_el );
769 foreach my $el ( @row ) {
0e476982 770 # If we are using node reference, make the lacuna node appear many times
771 # in the table. If not, use the lacuna tag.
772 if( $last_el && _el_is_lacuna( $last_el ) && !defined $el ) {
566f4595 773 $el = $noderefs ? $last_el : { 't' => '#LACUNA#' };
eca16057 774 }
775 push( @filled_row, $el );
776 $last_el = $el;
777 }
778 return @filled_row;
910a0a6d 779}
780
0e476982 781# Tiny utility function to say if a table element is a lacuna
782sub _el_is_lacuna {
783 my $el = shift;
2c669bca 784 return 1 if $el->{'t'} eq '#LACUNA#';
785 return 1 if ref( $el->{'t'} ) eq 'Text::Tradition::Collation::Reading'
786 && $el->{'t'}->is_lacuna;
0e476982 787 return 0;
788}
789
910a0a6d 790# Helper to turn the witnesses along columns rather than rows. Assumes
791# equal-sized rows.
792sub _turn_table {
793 my( $table ) = @_;
794 my $result = [];
795 return $result unless scalar @$table;
796 my $nrows = scalar @{$table->[0]};
797 foreach my $idx ( 0 .. $nrows - 1 ) {
798 foreach my $wit ( 0 .. $#{$table} ) {
799 $result->[$idx]->[$wit] = $table->[$wit]->[$idx];
800 }
801 }
802 return $result;
803}
804
8e1394aa 805=back
806
de51424a 807=head2 Navigation methods
808
809=over
810
8e1394aa 811=item B<start>
812
813my $beginning = $collation->start();
814
815Returns the beginning of the collation, a meta-reading with label '#START#'.
816
910a0a6d 817=item B<end>
818
819my $end = $collation->end();
820
821Returns the end of the collation, a meta-reading with label '#END#'.
822
910a0a6d 823
e2902068 824=item B<reading_sequence>
825
826my @readings = $graph->reading_sequence( $first, $last, $path[, $alt_path] );
827
828Returns the ordered list of readings, starting with $first and ending
829with $last, along the given witness path. If no path is specified,
830assume that the path is that of the base text (if any.)
831
832=cut
833
910a0a6d 834# TODO Think about returning some lazy-eval iterator.
835
e2902068 836sub reading_sequence {
837 my( $self, $start, $end, $witness, $backup ) = @_;
838
930ff666 839 $witness = $self->baselabel unless $witness;
e2902068 840 my @readings = ( $start );
841 my %seen;
842 my $n = $start;
3a2ebbf4 843 while( $n && $n->id ne $end->id ) {
844 if( exists( $seen{$n->id} ) ) {
845 warn "Detected loop at " . $n->id;
910a0a6d 846 last;
847 }
3a2ebbf4 848 $seen{$n->id} = 1;
910a0a6d 849
850 my $next = $self->next_reading( $n, $witness, $backup );
44771cf2 851 unless( $next ) {
3a2ebbf4 852 warn "Did not find any path for $witness from reading " . $n->id;
44771cf2 853 last;
854 }
910a0a6d 855 push( @readings, $next );
856 $n = $next;
e2902068 857 }
858 # Check that the last reading is our end reading.
859 my $last = $readings[$#readings];
3a2ebbf4 860 warn "Last reading found from " . $start->text .
910a0a6d 861 " for witness $witness is not the end!"
3a2ebbf4 862 unless $last->id eq $end->id;
e2902068 863
864 return @readings;
865}
866
4a8828f0 867=item B<next_reading>
8e1394aa 868
4a8828f0 869my $next_reading = $graph->next_reading( $reading, $witpath );
8e1394aa 870
4a8828f0 871Returns the reading that follows the given reading along the given witness
930ff666 872path.
8e1394aa 873
874=cut
875
4a8828f0 876sub next_reading {
e2902068 877 # Return the successor via the corresponding path.
8e1394aa 878 my $self = shift;
3a2ebbf4 879 my $answer = $self->_find_linked_reading( 'next', @_ );
2c669bca 880 return undef unless $answer;
3a2ebbf4 881 return $self->reading( $answer );
8e1394aa 882}
883
4a8828f0 884=item B<prior_reading>
8e1394aa 885
4a8828f0 886my $prior_reading = $graph->prior_reading( $reading, $witpath );
8e1394aa 887
4a8828f0 888Returns the reading that precedes the given reading along the given witness
930ff666 889path.
8e1394aa 890
891=cut
892
4a8828f0 893sub prior_reading {
e2902068 894 # Return the predecessor via the corresponding path.
8e1394aa 895 my $self = shift;
3a2ebbf4 896 my $answer = $self->_find_linked_reading( 'prior', @_ );
897 return $self->reading( $answer );
8e1394aa 898}
899
4a8828f0 900sub _find_linked_reading {
e2902068 901 my( $self, $direction, $node, $path, $alt_path ) = @_;
902 my @linked_paths = $direction eq 'next'
3a2ebbf4 903 ? $self->sequence->edges_from( $node )
904 : $self->sequence->edges_to( $node );
e2902068 905 return undef unless scalar( @linked_paths );
8e1394aa 906
e2902068 907 # We have to find the linked path that contains all of the
908 # witnesses supplied in $path.
909 my( @path_wits, @alt_path_wits );
3a2ebbf4 910 @path_wits = sort( $self->witnesses_of_label( $path ) ) if $path;
911 @alt_path_wits = sort( $self->witnesses_of_label( $alt_path ) ) if $alt_path;
e2902068 912 my $base_le;
913 my $alt_le;
914 foreach my $le ( @linked_paths ) {
3a2ebbf4 915 if( $self->sequence->has_edge_attribute( @$le, $self->baselabel ) ) {
910a0a6d 916 $base_le = $le;
910a0a6d 917 }
3a2ebbf4 918 my @le_wits = $self->path_witnesses( $le );
919 if( _is_within( \@path_wits, \@le_wits ) ) {
920 # This is the right path.
921 return $direction eq 'next' ? $le->[1] : $le->[0];
922 } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) {
923 $alt_le = $le;
924 }
8e1394aa 925 }
e2902068 926 # Got this far? Return the alternate path if it exists.
3a2ebbf4 927 return $direction eq 'next' ? $alt_le->[1] : $alt_le->[0]
910a0a6d 928 if $alt_le;
e2902068 929
930 # Got this far? Return the base path if it exists.
3a2ebbf4 931 return $direction eq 'next' ? $base_le->[1] : $base_le->[0]
910a0a6d 932 if $base_le;
e2902068 933
934 # Got this far? We have no appropriate path.
2c669bca 935 warn "Could not find $direction node from " . $node->id
910a0a6d 936 . " along path $path";
8e1394aa 937 return undef;
938}
939
4a8828f0 940# Some set logic.
941sub _is_within {
942 my( $set1, $set2 ) = @_;
7854e12e 943 my $ret = @$set1; # will be 0, i.e. false, if set1 is empty
4a8828f0 944 foreach my $el ( @$set1 ) {
910a0a6d 945 $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
4a8828f0 946 }
947 return $ret;
948}
949
de51424a 950
951## INITIALIZATION METHODS - for use by parsers
930ff666 952
7e450e44 953# For use when a collation is constructed from a base text and an apparatus.
954# We have the sequences of readings and just need to add path edges.
1f7aa795 955# When we are done, clear out the witness path attributes, as they are no
956# longer needed.
957# TODO Find a way to replace the witness path attributes with encapsulated functions?
e2902068 958
6a222840 959sub make_witness_paths {
960 my( $self ) = @_;
910a0a6d 961 foreach my $wit ( $self->tradition->witnesses ) {
0068967c 962 # print STDERR "Making path for " . $wit->sigil . "\n";
910a0a6d 963 $self->make_witness_path( $wit );
7854e12e 964 }
7854e12e 965}
966
6a222840 967sub make_witness_path {
7854e12e 968 my( $self, $wit ) = @_;
969 my @chain = @{$wit->path};
15d2d3df 970 my $sig = $wit->sigil;
7854e12e 971 foreach my $idx ( 0 .. $#chain-1 ) {
910a0a6d 972 $self->add_path( $chain[$idx], $chain[$idx+1], $sig );
7854e12e 973 }
1f7aa795 974 if( $wit->is_layered ) {
d9e873d0 975 @chain = @{$wit->uncorrected_path};
976 foreach my $idx( 0 .. $#chain-1 ) {
977 my $source = $chain[$idx];
978 my $target = $chain[$idx+1];
979 $self->add_path( $source, $target, $sig.$self->ac_label )
980 unless $self->has_path( $source, $target, $sig );
981 }
15d2d3df 982 }
1f7aa795 983 $wit->clear_path;
984 $wit->clear_uncorrected_path;
e2902068 985}
986
910a0a6d 987sub calculate_ranks {
988 my $self = shift;
989 # Walk a version of the graph where every node linked by a relationship
990 # edge is fundamentally the same node, and do a topological ranking on
991 # the nodes in this graph.
c9bf3dbf 992 my $topo_graph = Graph->new();
910a0a6d 993 my %rel_containers;
994 my $rel_ctr = 0;
995 # Add the nodes
996 foreach my $r ( $self->readings ) {
3a2ebbf4 997 next if exists $rel_containers{$r->id};
910a0a6d 998 my @rels = $r->related_readings( 'colocated' );
999 if( @rels ) {
1000 # Make a relationship container.
1001 push( @rels, $r );
c9bf3dbf 1002 my $rn = 'rel_container_' . $rel_ctr++;
1003 $topo_graph->add_vertex( $rn );
910a0a6d 1004 foreach( @rels ) {
3a2ebbf4 1005 $rel_containers{$_->id} = $rn;
910a0a6d 1006 }
1007 } else {
1008 # Add a new node to mirror the old node.
3a2ebbf4 1009 $rel_containers{$r->id} = $r->id;
1010 $topo_graph->add_vertex( $r->id );
910a0a6d 1011 }
4a8828f0 1012 }
3a1f2523 1013
3a2ebbf4 1014 # Add the edges.
910a0a6d 1015 foreach my $r ( $self->readings ) {
3a2ebbf4 1016 foreach my $n ( $self->sequence->successors( $r->id ) ) {
1017 my( $tfrom, $tto ) = ( $rel_containers{$r->id},
1018 $rel_containers{$n} );
49d4f2ac 1019 $DB::single = 1 unless $tfrom && $tto;
3a2ebbf4 1020 $topo_graph->add_edge( $tfrom, $tto );
910a0a6d 1021 }
1022 }
1023
1024 # Now do the rankings, starting with the start node.
3a2ebbf4 1025 my $topo_start = $rel_containers{$self->start->id};
c9bf3dbf 1026 my $node_ranks = { $topo_start => 0 };
910a0a6d 1027 my @curr_origin = ( $topo_start );
1028 # A little iterative function.
1029 while( @curr_origin ) {
c9bf3dbf 1030 @curr_origin = _assign_rank( $topo_graph, $node_ranks, @curr_origin );
910a0a6d 1031 }
1032 # Transfer our rankings from the topological graph to the real one.
1033 foreach my $r ( $self->readings ) {
3a2ebbf4 1034 if( defined $node_ranks->{$rel_containers{$r->id}} ) {
1035 $r->rank( $node_ranks->{$rel_containers{$r->id}} );
67da8d6c 1036 } else {
1037 $DB::single = 1;
3a2ebbf4 1038 die "No rank calculated for node " . $r->id
67da8d6c 1039 . " - do you have a cycle in the graph?";
1040 }
de51424a 1041 }
8e1394aa 1042}
3a1f2523 1043
910a0a6d 1044sub _assign_rank {
c9bf3dbf 1045 my( $graph, $node_ranks, @current_nodes ) = @_;
910a0a6d 1046 # Look at each of the children of @current_nodes. If all the child's
1047 # parents have a rank, assign it the highest rank + 1 and add it to
c9bf3dbf 1048 # @next_nodes. Otherwise skip it; we will return when the highest-ranked
1049 # parent gets a rank.
910a0a6d 1050 my @next_nodes;
1051 foreach my $c ( @current_nodes ) {
c9bf3dbf 1052 warn "Current reading $c has no rank!"
1053 unless exists $node_ranks->{$c};
1054 # print STDERR "Looking at child of node $c, rank "
1055 # . $node_ranks->{$c} . "\n";
1056 foreach my $child ( $graph->successors( $c ) ) {
1057 next if exists $node_ranks->{$child};
910a0a6d 1058 my $highest_rank = -1;
1059 my $skip = 0;
c9bf3dbf 1060 foreach my $parent ( $graph->predecessors( $child ) ) {
1061 if( exists $node_ranks->{$parent} ) {
1062 $highest_rank = $node_ranks->{$parent}
1063 if $highest_rank <= $node_ranks->{$parent};
910a0a6d 1064 } else {
1065 $skip = 1;
1066 last;
1067 }
1068 }
1069 next if $skip;
c9bf3dbf 1070 my $c_rank = $highest_rank + 1;
1071 # print STDERR "Assigning rank $c_rank to node $child \n";
1072 $node_ranks->{$child} = $c_rank;
910a0a6d 1073 push( @next_nodes, $child );
1074 }
1075 }
1076 return @next_nodes;
4cdd82f1 1077}
910a0a6d 1078
0e476982 1079# Another method to make up for rough collation methods. If the same reading
1080# appears multiple times at the same rank, collapse the nodes.
1081sub flatten_ranks {
1082 my $self = shift;
1083 my %unique_rank_rdg;
1084 foreach my $rdg ( $self->readings ) {
1085 next unless $rdg->has_rank;
1086 my $key = $rdg->rank . "||" . $rdg->text;
1087 if( exists $unique_rank_rdg{$key} ) {
1088 # Combine!
0068967c 1089 # print STDERR "Combining readings at same rank: $key\n";
0e476982 1090 $self->merge_readings( $unique_rank_rdg{$key}, $rdg );
1091 } else {
1092 $unique_rank_rdg{$key} = $rdg;
1093 }
1094 }
1095}
1096
1097
fa954f4c 1098## Utility functions
de51424a 1099
4a8828f0 1100# Return the string that joins together a list of witnesses for
1101# display on a single path.
4a8828f0 1102sub witnesses_of_label {
de51424a 1103 my( $self, $label ) = @_;
4a8828f0 1104 my $regex = $self->wit_list_separator;
de51424a 1105 my @answer = split( /\Q$regex\E/, $label );
1106 return @answer;
4a8828f0 1107}
8e1394aa 1108
dd3b58b0 1109no Moose;
1110__PACKAGE__->meta->make_immutable;
e867486f 1111
1112=head1 BUGS / TODO
1113
1114=over
1115
0068967c 1116=item * Think about making Relationship objects again
e867486f 1117
1118=back