fixing start/end nodes
[scpubgit/stemmatology.git] / lib / Text / Tradition / Collation.pm
CommitLineData
dd3b58b0 1package Text::Tradition::Collation;
d047cd52 2
6771a1b1 3use feature 'say';
910a0a6d 4use Encode qw( decode_utf8 );
5use File::Temp;
bfcbcecb 6use File::Which;
c9bf3dbf 7use Graph;
8e1394aa 8use IPC::Run qw( run binary );
82fa4d57 9use Text::CSV;
b15511bf 10use Text::Tradition::Collation::Reading;
22222af9 11use Text::Tradition::Collation::RelationshipStore;
63778331 12use Text::Tradition::Error;
cc31ebaa 13use XML::Easy::Syntax qw( $xml10_namestartchar_rx $xml10_namechar_rx );
dd3b58b0 14use Moose;
15
3a2ebbf4 16has 'sequence' => (
d047cd52 17 is => 'ro',
3a2ebbf4 18 isa => 'Graph',
19 default => sub { Graph->new() },
d047cd52 20 handles => {
3a2ebbf4 21 paths => 'edges',
d047cd52 22 },
d047cd52 23 );
3a2ebbf4 24
25has 'relations' => (
26 is => 'ro',
22222af9 27 isa => 'Text::Tradition::Collation::RelationshipStore',
28 handles => {
29 relationships => 'relationships',
30 related_readings => 'related_readings',
202ccb18 31 get_relationship => 'get_relationship',
ee801e17 32 del_relationship => 'del_relationship',
359944f7 33 equivalence => 'equivalence',
34 equivalence_graph => 'equivalence_graph',
22222af9 35 },
36 writer => '_set_relations',
3a2ebbf4 37 );
dd3b58b0 38
3a2ebbf4 39has 'tradition' => (
40 is => 'ro',
d047cd52 41 isa => 'Text::Tradition',
8cfd99c4 42 writer => '_set_tradition',
8d9a1cd8 43 weak_ref => 1,
d047cd52 44 );
dd3b58b0 45
3a2ebbf4 46has 'readings' => (
47 isa => 'HashRef[Text::Tradition::Collation::Reading]',
48 traits => ['Hash'],
49 handles => {
50 reading => 'get',
51 _add_reading => 'set',
52 del_reading => 'delete',
53 has_reading => 'exists',
144d845b 54# reading_keys => 'keys',
3a2ebbf4 55 readings => 'values',
56 },
57 default => sub { {} },
58 );
910a0a6d 59
4a8828f0 60has 'wit_list_separator' => (
7854e12e 61 is => 'rw',
62 isa => 'Str',
63 default => ', ',
64 );
65
66has 'baselabel' => (
67 is => 'rw',
68 isa => 'Str',
69 default => 'base text',
70 );
4a8828f0 71
15d2d3df 72has 'linear' => (
73 is => 'rw',
74 isa => 'Bool',
75 default => 1,
76 );
c84275ff 77
ef9d481f 78has 'ac_label' => (
79 is => 'rw',
80 isa => 'Str',
81 default => ' (a.c.)',
82 );
3a2ebbf4 83
4e483aa5 84has 'wordsep' => (
85 is => 'rw',
86 isa => 'Str',
87 default => ' ',
88 );
89
3a2ebbf4 90has 'start' => (
91 is => 'ro',
92 isa => 'Text::Tradition::Collation::Reading',
93 writer => '_set_start',
94 weak_ref => 1,
95 );
96
97has 'end' => (
98 is => 'ro',
99 isa => 'Text::Tradition::Collation::Reading',
100 writer => '_set_end',
101 weak_ref => 1,
102 );
b365fbae 103
1dd07bda 104has 'cached_table' => (
105 is => 'rw',
106 isa => 'HashRef',
107 predicate => 'has_cached_table',
108 clearer => 'wipe_table',
109 );
c1915ab9 110
111has '_graphcalc_done' => (
112 is => 'rw',
113 isa => 'Bool',
114 default => undef,
115 );
1f563ac3 116
4e5a7b2c 117=head1 NAME
118
119Text::Tradition::Collation - a software model for a text collation
120
121=head1 SYNOPSIS
122
123 use Text::Tradition;
124 my $t = Text::Tradition->new(
125 'name' => 'this is a text',
126 'input' => 'TEI',
127 'file' => '/path/to/tei_parallel_seg_file.xml' );
128
129 my $c = $t->collation;
130 my @readings = $c->readings;
131 my @paths = $c->paths;
132 my @relationships = $c->relationships;
133
134 my $svg_variant_graph = $t->collation->as_svg();
135
136=head1 DESCRIPTION
137
138Text::Tradition is a library for representation and analysis of collated
139texts, particularly medieval ones. The Collation is the central feature of
140a Tradition, where the text, its sequence of readings, and its relationships
141between readings are actually kept.
142
143=head1 CONSTRUCTOR
144
145=head2 new
146
147The constructor. Takes a hash or hashref of the following arguments:
148
149=over
150
151=item * tradition - The Text::Tradition object to which the collation
152belongs. Required.
153
154=item * linear - Whether the collation should be linear; that is, whether
155transposed readings should be treated as two linked readings rather than one,
156and therefore whether the collation graph is acyclic. Defaults to true.
157
4e5a7b2c 158=item * baselabel - The default label for the path taken by a base text
159(if any). Defaults to 'base text'.
160
161=item * wit_list_separator - The string to join a list of witnesses for
162purposes of making labels in display graphs. Defaults to ', '.
163
164=item * ac_label - The extra label to tack onto a witness sigil when
165representing another layer of path for the given witness - that is, when
166a text has more than one possible reading due to scribal corrections or
167the like. Defaults to ' (a.c.)'.
168
4e483aa5 169=item * wordsep - The string used to separate words in the original text.
170Defaults to ' '.
171
4e5a7b2c 172=back
173
174=head1 ACCESSORS
175
176=head2 tradition
177
178=head2 linear
179
4e5a7b2c 180=head2 wit_list_separator
181
182=head2 baselabel
183
184=head2 ac_label
185
4e483aa5 186=head2 wordsep
187
4e5a7b2c 188Simple accessors for collation attributes.
189
190=head2 start
191
192The meta-reading at the start of every witness path.
193
194=head2 end
195
196The meta-reading at the end of every witness path.
197
198=head2 readings
199
200Returns all Reading objects in the graph.
201
202=head2 reading( $id )
203
204Returns the Reading object corresponding to the given ID.
205
206=head2 add_reading( $reading_args )
207
208Adds a new reading object to the collation.
209See L<Text::Tradition::Collation::Reading> for the available arguments.
210
211=head2 del_reading( $object_or_id )
212
213Removes the given reading from the collation, implicitly removing its
214paths and relationships.
215
4e483aa5 216=head2 merge_readings( $main, $second, $concatenate, $with_str )
217
218Merges the $second reading into the $main one. If $concatenate is true, then
219the merged node will carry the text of both readings, concatenated with either
220$with_str (if specified) or a sensible default (the empty string if the
221appropriate 'join_*' flag is set on either reading, or else $self->wordsep.)
4e5a7b2c 222
4e483aa5 223The first two arguments may be either readings or reading IDs.
4e5a7b2c 224
225=head2 has_reading( $id )
226
227Predicate to see whether a given reading ID is in the graph.
228
229=head2 reading_witnesses( $object_or_id )
230
231Returns a list of sigils whose witnesses contain the reading.
232
233=head2 paths
234
235Returns all reading paths within the document - that is, all edges in the
236collation graph. Each path is an arrayref of [ $source, $target ] reading IDs.
237
238=head2 add_path( $source, $target, $sigil )
239
240Links the given readings in the collation in sequence, under the given witness
241sigil. The readings may be specified by object or ID.
242
243=head2 del_path( $source, $target, $sigil )
244
245Links the given readings in the collation in sequence, under the given witness
246sigil. The readings may be specified by object or ID.
247
248=head2 has_path( $source, $target );
249
250Returns true if the two readings are linked in sequence in any witness.
251The readings may be specified by object or ID.
252
253=head2 relationships
254
255Returns all Relationship objects in the collation.
256
257=head2 add_relationship( $reading, $other_reading, $options )
258
259Adds a new relationship of the type given in $options between the two readings,
260which may be specified by object or ID. Returns a value of ( $status, @vectors)
261where $status is true on success, and @vectors is a list of relationship edges
262that were ultimately added.
263See L<Text::Tradition::Collation::Relationship> for the available options.
264
265=cut
dd3b58b0 266
d047cd52 267sub BUILD {
3a2ebbf4 268 my $self = shift;
22222af9 269 $self->_set_relations( Text::Tradition::Collation::RelationshipStore->new( 'collation' => $self ) );
e4b73942 270 $self->_set_start( $self->add_reading(
271 { 'collation' => $self, 'is_start' => 1, 'init' => 1 } ) );
272 $self->_set_end( $self->add_reading(
273 { 'collation' => $self, 'is_end' => 1, 'init' => 1 } ) );
d047cd52 274}
784877d9 275
3a2ebbf4 276### Reading construct/destruct functions
277
278sub add_reading {
279 my( $self, $reading ) = @_;
280 unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
281 my %args = %$reading;
e4b73942 282 if( $args{'init'} ) {
283 # If we are initializing an empty collation, don't assume that we
284 # have set a tradition.
285 delete $args{'init'};
286 } elsif( $self->tradition->has_language && !exists $args{'language'} ) {
6ad2ce78 287 $args{'language'} = $self->tradition->language;
288 }
3a2ebbf4 289 $reading = Text::Tradition::Collation::Reading->new(
290 'collation' => $self,
291 %args );
292 }
293 # First check to see if a reading with this ID exists.
294 if( $self->reading( $reading->id ) ) {
63778331 295 throw( "Collation already has a reading with id " . $reading->id );
3a2ebbf4 296 }
c1915ab9 297 $self->_graphcalc_done(0);
3a2ebbf4 298 $self->_add_reading( $reading->id => $reading );
299 # Once the reading has been added, put it in both graphs.
300 $self->sequence->add_vertex( $reading->id );
cecbe56d 301 $self->relations->add_reading( $reading->id );
3a2ebbf4 302 return $reading;
eca16057 303};
304
3a2ebbf4 305around del_reading => sub {
306 my $orig = shift;
307 my $self = shift;
308 my $arg = shift;
309
cecbe56d 310 if( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
311 $arg = $arg->id;
3a2ebbf4 312 }
3a2ebbf4 313 # Remove the reading from the graphs.
c1915ab9 314 $self->_graphcalc_done(0);
4e483aa5 315 $self->_clear_cache; # Explicitly clear caches to GC the reading
cecbe56d 316 $self->sequence->delete_vertex( $arg );
317 $self->relations->delete_reading( $arg );
3a2ebbf4 318
319 # Carry on.
cecbe56d 320 $self->$orig( $arg );
3a2ebbf4 321};
7854e12e 322
4e483aa5 323=begin testing
324
325use Text::Tradition;
326
327my $cxfile = 't/data/Collatex-16.xml';
328my $t = Text::Tradition->new(
329 'name' => 'inline',
330 'input' => 'CollateX',
331 'file' => $cxfile,
332 );
333my $c = $t->collation;
334
335my $rno = scalar $c->readings;
336# Split n21 for testing purposes
337my $new_r = $c->add_reading( { 'id' => 'n21p0', 'text' => 'un', 'join_next' => 1 } );
338my $old_r = $c->reading( 'n21' );
339$old_r->alter_text( 'to' );
340$c->del_path( 'n20', 'n21', 'A' );
341$c->add_path( 'n20', 'n21p0', 'A' );
342$c->add_path( 'n21p0', 'n21', 'A' );
343$c->flatten_ranks();
344ok( $c->reading( 'n21p0' ), "New reading exists" );
345is( scalar $c->readings, $rno, "Reading add offset by flatten_ranks" );
346
679f17e1 347# Combine n3 and n4 ( with his )
4e483aa5 348$c->merge_readings( 'n3', 'n4', 1 );
349ok( !$c->reading('n4'), "Reading n4 is gone" );
350is( $c->reading('n3')->text, 'with his', "Reading n3 has both words" );
351
679f17e1 352# Collapse n9 and n10 ( rood / root )
353$c->merge_readings( 'n9', 'n10' );
354ok( !$c->reading('n10'), "Reading n10 is gone" );
355is( $c->reading('n9')->text, 'rood', "Reading n9 has an unchanged word" );
4e483aa5 356
357# Combine n21 and n21p0
358my $remaining = $c->reading('n21');
359$remaining ||= $c->reading('n22'); # one of these should still exist
360$c->merge_readings( 'n21p0', $remaining, 1 );
361ok( !$c->reading('n21'), "Reading $remaining is gone" );
362is( $c->reading('n21p0')->text, 'unto', "Reading n21p0 merged correctly" );
363
364=end testing
365
366=cut
7854e12e 367
3a2ebbf4 368sub merge_readings {
369 my $self = shift;
370
56772e8c 371 # Sanity check
372 my( $kept_obj, $del_obj, $combine, $combine_char ) = $self->_objectify_args( @_ );
373 my $mergemeta = $kept_obj->is_meta;
374 throw( "Cannot merge meta and non-meta reading" )
375 unless ( $mergemeta && $del_obj->is_meta )
376 || ( !$mergemeta && !$del_obj->is_meta );
377 if( $mergemeta ) {
378 throw( "Cannot merge with start or end node" )
379 if( $kept_obj eq $self->start || $kept_obj eq $self->end
380 || $del_obj eq $self->start || $del_obj eq $self->end );
381 }
3a2ebbf4 382 # We only need the IDs for adding paths to the graph, not the reading
383 # objects themselves.
56772e8c 384 my $kept = $kept_obj->id;
385 my $deleted = $del_obj->id;
c1915ab9 386 $self->_graphcalc_done(0);
10943ab0 387
3a2ebbf4 388 # The kept reading should inherit the paths and the relationships
389 # of the deleted reading.
390 foreach my $path ( $self->sequence->edges_at( $deleted ) ) {
391 my @vector = ( $kept );
392 push( @vector, $path->[1] ) if $path->[0] eq $deleted;
393 unshift( @vector, $path->[0] ) if $path->[1] eq $deleted;
49d4f2ac 394 next if $vector[0] eq $vector[1]; # Don't add a self loop
3a2ebbf4 395 my %wits = %{$self->sequence->get_edge_attributes( @$path )};
396 $self->sequence->add_edge( @vector );
397 my $fwits = $self->sequence->get_edge_attributes( @vector );
398 @wits{keys %$fwits} = values %$fwits;
399 $self->sequence->set_edge_attributes( @vector, \%wits );
400 }
cecbe56d 401 $self->relations->merge_readings( $kept, $deleted, $combine );
3a2ebbf4 402
403 # Do the deletion deed.
4e483aa5 404 if( $combine ) {
869a1ada 405 # Combine the text of the readings
4e483aa5 406 my $joinstr = $combine_char;
407 unless( defined $joinstr ) {
408 $joinstr = '' if $kept_obj->join_next || $del_obj->join_prior;
409 $joinstr = $self->wordsep unless defined $joinstr;
410 }
411 $kept_obj->alter_text( join( $joinstr, $kept_obj->text, $del_obj->text ) );
339786dd 412 # Change this reading to a joining one if necessary
413 $kept_obj->_set_join_next( $del_obj->join_next );
869a1ada 414 $kept_obj->normal_form(
415 join( $joinstr, $kept_obj->normal_form, $del_obj->normal_form ) );
416 # Combine the lexemes present in the readings
417 if( $kept_obj->has_lexemes && $del_obj->has_lexemes ) {
418 $kept_obj->add_lexeme( $del_obj->lexemes );
419 }
49d4f2ac 420 }
3a2ebbf4 421 $self->del_reading( $deleted );
422}
7854e12e 423
6771a1b1 424=head2 compress_readings
425
426Where possible in the graph, compresses plain sequences of readings into a
427single reading. The sequences must consist of readings with no
428relationships to other readings, with only a single witness path between
429them and no other witness paths from either that would skip the other. The
430readings must also not be marked as nonsense or bad grammar.
431
432WARNING: This operation cannot be undone.
433
434=cut
435
436sub compress_readings {
437 my $self = shift;
438 # Anywhere in the graph that there is a reading that joins only to a single
439 # successor, and neither of these have any relationships, just join the two
440 # readings.
441 my %gobbled;
442 foreach my $rdg ( sort { $a->rank <=> $b->rank } $self->readings ) {
428bcf0b 443 # While we are here, get rid of any extra wordforms from a disambiguated
444 # reading.
445 if( $rdg->disambiguated ) {
446 foreach my $lex ( $rdg->lexemes ) {
447 $lex->clear_matching_forms();
448 $lex->add_matching_form( $lex->form );
449 }
450 }
451 # Now look for readings that can be joined to their successors.
6771a1b1 452 next if $rdg->is_meta;
453 next if $gobbled{$rdg->id};
454 next if $rdg->grammar_invalid || $rdg->is_nonsense;
455 next if $rdg->related_readings();
456 my %seen;
457 while( $self->sequence->successors( $rdg ) == 1 ) {
458 my( $next ) = $self->reading( $self->sequence->successors( $rdg ) );
459 throw( "Infinite loop" ) if $seen{$next->id};
460 $seen{$next->id} = 1;
461 last if $self->sequence->predecessors( $next ) > 1;
462 last if $next->is_meta;
463 last if $next->grammar_invalid || $next->is_nonsense;
464 last if $next->related_readings();
465 say "Joining readings $rdg and $next";
466 $self->merge_readings( $rdg, $next, 1 );
467 }
468 }
469 # Make sure we haven't screwed anything up
470 foreach my $wit ( $self->tradition->witnesses ) {
471 my $pathtext = $self->path_text( $wit->sigil );
472 my $origtext = join( ' ', @{$wit->text} );
473 throw( "Text differs for witness " . $wit->sigil )
474 unless $pathtext eq $origtext;
475 if( $wit->is_layered ) {
476 $pathtext = $self->path_text( $wit->sigil.$self->ac_label );
477 $origtext = join( ' ', @{$wit->layertext} );
478 throw( "Ante-corr text differs for witness " . $wit->sigil )
479 unless $pathtext eq $origtext;
480 }
481 }
482
483 $self->relations->rebuild_equivalence();
484 $self->calculate_ranks();
485}
3265b0ce 486
3a2ebbf4 487# Helper function for manipulating the graph.
488sub _stringify_args {
4e483aa5 489 my( $self, $first, $second, @args ) = @_;
3a2ebbf4 490 $first = $first->id
491 if ref( $first ) eq 'Text::Tradition::Collation::Reading';
492 $second = $second->id
493 if ref( $second ) eq 'Text::Tradition::Collation::Reading';
4e483aa5 494 return( $first, $second, @args );
3a2ebbf4 495}
df6d9812 496
4e5a7b2c 497# Helper function for manipulating the graph.
498sub _objectify_args {
499 my( $self, $first, $second, $arg ) = @_;
500 $first = $self->reading( $first )
501 unless ref( $first ) eq 'Text::Tradition::Collation::Reading';
502 $second = $self->reading( $second )
503 unless ref( $second ) eq 'Text::Tradition::Collation::Reading';
504 return( $first, $second, $arg );
505}
3a2ebbf4 506### Path logic
507
508sub add_path {
509 my $self = shift;
510
511 # We only need the IDs for adding paths to the graph, not the reading
512 # objects themselves.
cecbe56d 513 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
3a2ebbf4 514
c1915ab9 515 $self->_graphcalc_done(0);
3a2ebbf4 516 # Connect the readings
359944f7 517 unless( $self->sequence->has_edge( $source, $target ) ) {
518 $self->sequence->add_edge( $source, $target );
519 $self->relations->add_equivalence_edge( $source, $target );
520 }
3a2ebbf4 521 # Note the witness in question
522 $self->sequence->set_edge_attribute( $source, $target, $wit, 1 );
359944f7 523}
b15511bf 524
3a2ebbf4 525sub del_path {
526 my $self = shift;
49d4f2ac 527 my @args;
528 if( ref( $_[0] ) eq 'ARRAY' ) {
529 my $e = shift @_;
530 @args = ( @$e, @_ );
531 } else {
532 @args = @_;
533 }
3a2ebbf4 534
535 # We only need the IDs for adding paths to the graph, not the reading
536 # objects themselves.
49d4f2ac 537 my( $source, $target, $wit ) = $self->_stringify_args( @args );
3a2ebbf4 538
c1915ab9 539 $self->_graphcalc_done(0);
3a2ebbf4 540 if( $self->sequence->has_edge_attribute( $source, $target, $wit ) ) {
49d4f2ac 541 $self->sequence->delete_edge_attribute( $source, $target, $wit );
3a2ebbf4 542 }
543 unless( keys %{$self->sequence->get_edge_attributes( $source, $target )} ) {
544 $self->sequence->delete_edge( $source, $target );
359944f7 545 $self->relations->delete_equivalence_edge( $source, $target );
3a2ebbf4 546 }
784877d9 547}
548
3a2ebbf4 549
15d2d3df 550# Extra graph-alike utility
551sub has_path {
3a2ebbf4 552 my $self = shift;
553 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
554 return undef unless $self->sequence->has_edge( $source, $target );
555 return $self->sequence->has_edge_attribute( $source, $target, $wit );
b15511bf 556}
557
4e5a7b2c 558=head2 clear_witness( @sigil_list )
3a2ebbf4 559
4e5a7b2c 560Clear the given witnesses out of the collation entirely, removing references
561to them in paths, and removing readings that belong only to them. Should only
562be called via $tradition->del_witness.
3a2ebbf4 563
564=cut
565
4e5a7b2c 566sub clear_witness {
567 my( $self, @sigils ) = @_;
568
c1915ab9 569 $self->_graphcalc_done(0);
4e5a7b2c 570 # Clear the witness(es) out of the paths
571 foreach my $e ( $self->paths ) {
572 foreach my $sig ( @sigils ) {
573 $self->del_path( $e, $sig );
574 }
575 }
576
577 # Clear out the newly unused readings
578 foreach my $r ( $self->readings ) {
579 unless( $self->reading_witnesses( $r ) ) {
580 $self->del_reading( $r );
581 }
582 }
583}
3a2ebbf4 584
585sub add_relationship {
586 my $self = shift;
22222af9 587 my( $source, $target, $opts ) = $self->_stringify_args( @_ );
414cc046 588 my( @vectors ) = $self->relations->add_relationship( $source, $target, $opts );
c1915ab9 589 $self->_graphcalc_done(0);
63778331 590 return @vectors;
22222af9 591}
ef9d481f 592
ca6e6095 593around qw/ get_relationship del_relationship / => sub {
594 my $orig = shift;
595 my $self = shift;
596 my @args = @_;
597 if( @args == 1 && ref( $args[0] ) eq 'ARRAY' ) {
598 @args = @{$_[0]};
599 }
600 my( $source, $target ) = $self->_stringify_args( @args );
601 $self->$orig( $source, $target );
602};
603
22222af9 604=head2 reading_witnesses( $reading )
910a0a6d 605
22222af9 606Return a list of sigils corresponding to the witnesses in which the reading appears.
3265b0ce 607
22222af9 608=cut
1d310495 609
1d310495 610sub reading_witnesses {
611 my( $self, $reading ) = @_;
612 # We need only check either the incoming or the outgoing edges; I have
96dc90ec 613 # arbitrarily chosen "incoming". Thus, special-case the start node.
614 if( $reading eq $self->start ) {
615 return map { $_->sigil } $self->tradition->witnesses;
616 }
1d310495 617 my %all_witnesses;
618 foreach my $e ( $self->sequence->edges_to( $reading ) ) {
619 my $wits = $self->sequence->get_edge_attributes( @$e );
620 @all_witnesses{ keys %$wits } = 1;
621 }
c12bb878 622 my $acstr = $self->ac_label;
623 foreach my $acwit ( grep { $_ =~ s/^(.*)\Q$acstr\E$/$1/ } keys %all_witnesses ) {
624 delete $all_witnesses{$acwit.$acstr} if exists $all_witnesses{$acwit};
625 }
1d310495 626 return keys %all_witnesses;
910a0a6d 627}
628
4e5a7b2c 629=head1 OUTPUT METHODS
8e1394aa 630
0ecb975c 631=head2 as_svg( \%options )
8e1394aa 632
0068967c 633Returns an SVG string that represents the graph, via as_dot and graphviz.
bfcbcecb 634See as_dot for a list of options. Must have GraphViz (dot) installed to run.
8e1394aa 635
636=cut
637
638sub as_svg {
0ecb975c 639 my( $self, $opts ) = @_;
bfcbcecb 640 throw( "Need GraphViz installed to output SVG" )
641 unless File::Which::which( 'dot' );
e247aad1 642 my $want_subgraph = exists $opts->{'from'} || exists $opts->{'to'};
1ff82d4f 643 $self->calculate_ranks()
644 unless( $self->_graphcalc_done || $opts->{'nocalc'} || !$self->linear );
be3af600 645 my @cmd = qw/dot -Tsvg/;
646 my( $svg, $err );
647 my $dotfile = File::Temp->new();
648 ## USE FOR DEBUGGING
649 # $dotfile->unlink_on_destroy(0);
650 binmode $dotfile, ':utf8';
651 print $dotfile $self->as_dot( $opts );
652 push( @cmd, $dotfile->filename );
653 run( \@cmd, ">", binary(), \$svg );
654 $svg = decode_utf8( $svg );
655 return $svg;
8e1394aa 656}
657
b22576c6 658
0ecb975c 659=head2 as_dot( \%options )
b22576c6 660
0ecb975c 661Returns a string that is the collation graph expressed in dot
662(i.e. GraphViz) format. Options include:
b22576c6 663
0ecb975c 664=over 4
b22576c6 665
0ecb975c 666=item * from
b22576c6 667
0ecb975c 668=item * to
df6d9812 669
0ecb975c 670=item * color_common
671
672=back
df6d9812 673
674=cut
675
676sub as_dot {
0ecb975c 677 my( $self, $opts ) = @_;
678 my $startrank = $opts->{'from'} if $opts;
679 my $endrank = $opts->{'to'} if $opts;
680 my $color_common = $opts->{'color_common'} if $opts;
b365fbae 681 my $STRAIGHTENHACK = !$startrank && !$endrank && $self->end->rank
682 && $self->end->rank > 100;
6648ee3d 683 $STRAIGHTENHACK = 1 if $opts->{'straight'}; # even for subgraphs or small graphs
b365fbae 684
b22576c6 685 # Check the arguments
686 if( $startrank ) {
687 return if $endrank && $startrank > $endrank;
688 return if $startrank > $self->end->rank;
689 }
690 if( defined $endrank ) {
691 return if $endrank < 0;
f1b3b33a 692 $endrank = undef if $endrank == $self->end->rank;
b22576c6 693 }
694
67da8d6c 695 my $graph_name = $self->tradition->name;
696 $graph_name =~ s/[^\w\s]//g;
697 $graph_name = join( '_', split( /\s+/, $graph_name ) );
f13b5582 698
699 my %graph_attrs = (
700 'rankdir' => 'LR',
701 'bgcolor' => 'none',
702 );
703 my %node_attrs = (
b8990398 704 'fontsize' => 14,
f13b5582 705 'fillcolor' => 'white',
706 'style' => 'filled',
707 'shape' => 'ellipse'
708 );
709 my %edge_attrs = (
710 'arrowhead' => 'open',
711 'color' => '#000000',
712 'fontcolor' => '#000000',
713 );
714
67da8d6c 715 my $dot = sprintf( "digraph %s {\n", $graph_name );
f13b5582 716 $dot .= "\tgraph " . _dot_attr_string( \%graph_attrs ) . ";\n";
717 $dot .= "\tnode " . _dot_attr_string( \%node_attrs ) . ";\n";
df6d9812 718
b22576c6 719 # Output substitute start/end readings if necessary
720 if( $startrank ) {
43c94341 721 $dot .= "\t\"__SUBSTART__\" [ label=\"...\",id=\"__START__\" ];\n";
b22576c6 722 }
723 if( $endrank ) {
43c94341 724 $dot .= "\t\"__SUBEND__\" [ label=\"...\",id=\"__END__\" ];\n";
b22576c6 725 }
b365fbae 726 if( $STRAIGHTENHACK ) {
727 ## HACK part 1
43c94341 728 my $startlabel = $startrank ? '__SUBSTART__' : '__START__';
729 $dot .= "\tsubgraph { rank=same \"$startlabel\" \"#SILENT#\" }\n";
b365fbae 730 $dot .= "\t\"#SILENT#\" [ shape=diamond,color=white,penwidth=0,label=\"\" ];"
731 }
b22576c6 732 my %used; # Keep track of the readings that actually appear in the graph
30ddc24c 733 # Sort the readings by rank if we have ranks; this speeds layout.
734 my @all_readings = $self->end->has_rank
735 ? sort { $a->rank <=> $b->rank } $self->readings
736 : $self->readings;
4633f9e4 737 # TODO Refrain from outputting lacuna nodes - just grey out the edges.
30ddc24c 738 foreach my $reading ( @all_readings ) {
b22576c6 739 # Only output readings within our rank range.
740 next if $startrank && $reading->rank < $startrank;
741 next if $endrank && $reading->rank > $endrank;
742 $used{$reading->id} = 1;
910a0a6d 743 # Need not output nodes without separate labels
3a2ebbf4 744 next if $reading->id eq $reading->text;
d4b75f44 745 my $rattrs;
30f0df34 746 my $label = $reading->text;
629e27b0 747 $label .= '-' if $reading->join_next;
748 $label = "-$label" if $reading->join_prior;
8f9cab7b 749 $label =~ s/\"/\\\"/g;
d4b75f44 750 $rattrs->{'label'} = $label;
10e4b1ac 751 $rattrs->{'id'} = $reading->id;
0ecb975c 752 $rattrs->{'fillcolor'} = '#b3f36d' if $reading->is_common && $color_common;
d4b75f44 753 $dot .= sprintf( "\t\"%s\" %s;\n", $reading->id, _dot_attr_string( $rattrs ) );
df6d9812 754 }
3a2ebbf4 755
30ddc24c 756 # Add the real edges. Need to weight one edge per rank jump, in a
757 # continuous line.
b365fbae 758 # my $weighted = $self->_add_edge_weights;
b22576c6 759 my @edges = $self->paths;
3bdec618 760 my( %substart, %subend );
b22576c6 761 foreach my $edge ( @edges ) {
762 # Do we need to output this edge?
508fd430 763 if( $used{$edge->[0]} && $used{$edge->[1]} ) {
027d819c 764 my $label = $self->_path_display_label( $self->path_witnesses( $edge ) );
f13b5582 765 my $variables = { %edge_attrs, 'label' => $label };
30ddc24c 766
b22576c6 767 # Account for the rank gap if necessary
30ddc24c 768 my $rank0 = $self->reading( $edge->[0] )->rank
769 if $self->reading( $edge->[0] )->has_rank;
770 my $rank1 = $self->reading( $edge->[1] )->rank
771 if $self->reading( $edge->[1] )->has_rank;
772 if( defined $rank0 && defined $rank1 && $rank1 - $rank0 > 1 ) {
773 $variables->{'minlen'} = $rank1 - $rank0;
774 }
775
776 # Add the calculated edge weights
b365fbae 777 # if( exists $weighted->{$edge->[0]}
e247aad1 778 # && $weighted->{$edge->[0]} eq $edge->[1] ) {
779 # # $variables->{'color'} = 'red';
780 # $variables->{'weight'} = 3.0;
781 # }
30ddc24c 782
508fd430 783 # EXPERIMENTAL: make edge width reflect no. of witnesses
784 my $extrawidth = scalar( $self->path_witnesses( $edge ) ) * 0.2;
785 $variables->{'penwidth'} = $extrawidth + 0.8; # gives 1 for a single wit
786
f13b5582 787 my $varopts = _dot_attr_string( $variables );
788 $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
789 $edge->[0], $edge->[1], $varopts );
3bdec618 790 } elsif( $used{$edge->[0]} ) {
96ba0418 791 $subend{$edge->[0]} = $edge->[1];
3bdec618 792 } elsif( $used{$edge->[1]} ) {
96ba0418 793 $substart{$edge->[1]} = $edge->[0];
b22576c6 794 }
df6d9812 795 }
3bdec618 796 # Add substitute start and end edges if necessary
797 foreach my $node ( keys %substart ) {
96ba0418 798 my $witstr = $self->_path_display_label ( $self->path_witnesses( $substart{$node}, $node ) );
f13b5582 799 my $variables = { %edge_attrs, 'label' => $witstr };
96ba0418 800 my $nrdg = $self->reading( $node );
801 if( $nrdg->has_rank && $nrdg->rank > $startrank ) {
802 # Substart is actually one lower than $startrank
803 $variables->{'minlen'} = $nrdg->rank - ( $startrank - 1 );
804 }
f13b5582 805 my $varopts = _dot_attr_string( $variables );
96ba0418 806 $dot .= "\t\"__SUBSTART__\" -> \"$node\" $varopts;\n";
3bdec618 807 }
808 foreach my $node ( keys %subend ) {
96ba0418 809 my $witstr = $self->_path_display_label ( $self->path_witnesses( $node, $subend{$node} ) );
f13b5582 810 my $variables = { %edge_attrs, 'label' => $witstr };
811 my $varopts = _dot_attr_string( $variables );
96ba0418 812 $dot .= "\t\"$node\" -> \"__SUBEND__\" $varopts;\n";
3bdec618 813 }
b365fbae 814 # HACK part 2
815 if( $STRAIGHTENHACK ) {
43c94341 816 my $endlabel = $endrank ? '__SUBEND__' : '__END__';
817 $dot .= "\t\"$endlabel\" -> \"#SILENT#\" [ color=white,penwidth=0 ];\n";
b365fbae 818 }
30ddc24c 819
df6d9812 820 $dot .= "}\n";
821 return $dot;
822}
823
f13b5582 824sub _dot_attr_string {
825 my( $hash ) = @_;
826 my @attrs;
827 foreach my $k ( sort keys %$hash ) {
828 my $v = $hash->{$k};
829 push( @attrs, $k.'="'.$v.'"' );
830 }
831 return( '[ ' . join( ', ', @attrs ) . ' ]' );
832}
833
30ddc24c 834sub _add_edge_weights {
835 my $self = shift;
836 # Walk the graph from START to END, choosing the successor node with
837 # the largest number of witness paths each time.
838 my $weighted = {};
839 my $curr = $self->start->id;
008fc8a6 840 my $ranked = $self->end->has_rank;
30ddc24c 841 while( $curr ne $self->end->id ) {
008fc8a6 842 my $rank = $ranked ? $self->reading( $curr )->rank : 0;
30ddc24c 843 my @succ = sort { $self->path_witnesses( $curr, $a )
844 <=> $self->path_witnesses( $curr, $b ) }
845 $self->sequence->successors( $curr );
846 my $next = pop @succ;
008fc8a6 847 my $nextrank = $ranked ? $self->reading( $next )->rank : 0;
30ddc24c 848 # Try to avoid lacunae in the weighted path.
008fc8a6 849 while( @succ &&
850 ( $self->reading( $next )->is_lacuna ||
851 $nextrank - $rank > 1 ) ){
30ddc24c 852 $next = pop @succ;
853 }
854 $weighted->{$curr} = $next;
855 $curr = $next;
856 }
857 return $weighted;
858}
859
027d819c 860=head2 path_witnesses( $edge )
861
862Returns the list of sigils whose witnesses are associated with the given edge.
863The edge can be passed as either an array or an arrayref of ( $source, $target ).
864
865=cut
866
3a2ebbf4 867sub path_witnesses {
868 my( $self, @edge ) = @_;
869 # If edge is an arrayref, cope.
870 if( @edge == 1 && ref( $edge[0] ) eq 'ARRAY' ) {
871 my $e = shift @edge;
872 @edge = @$e;
873 }
874 my @wits = keys %{$self->sequence->get_edge_attributes( @edge )};
508fd430 875 return @wits;
3a2ebbf4 876}
877
7f9f05e8 878# Helper function. Make a display label for the given witnesses, showing a.c.
879# witnesses only where the main witness is not also in the list.
027d819c 880sub _path_display_label {
508fd430 881 my $self = shift;
7f9f05e8 882 my %wits;
883 map { $wits{$_} = 1 } @_;
884
885 # If an a.c. wit is listed, remove it if the main wit is also listed.
886 # Otherwise keep it for explicit listing.
887 my $aclabel = $self->ac_label;
888 my @disp_ac;
889 foreach my $w ( sort keys %wits ) {
890 if( $w =~ /^(.*)\Q$aclabel\E$/ ) {
891 if( exists $wits{$1} ) {
892 delete $wits{$w};
893 } else {
894 push( @disp_ac, $w );
895 }
896 }
897 }
898
899 # See if we are in a majority situation.
8f9cab7b 900 my $maj = scalar( $self->tradition->witnesses ) * 0.6;
1ff82d4f 901 $maj = $maj > 5 ? $maj : 5;
7f9f05e8 902 if( scalar keys %wits > $maj ) {
903 unshift( @disp_ac, 'majority' );
904 return join( ', ', @disp_ac );
8f9cab7b 905 } else {
7f9f05e8 906 return join( ', ', sort keys %wits );
8f9cab7b 907 }
908}
1dd07bda 909
bf6e338d 910=head2 readings_at_rank( $rank )
1dd07bda 911
bf6e338d 912Returns a list of readings at a given rank, taken from the alignment table.
1dd07bda 913
914=cut
915
bf6e338d 916sub readings_at_rank {
1dd07bda 917 my( $self, $rank ) = @_;
bf6e338d 918 my $table = $self->alignment_table;
919 # Table rank is real rank - 1.
920 my @elements = map { $_->{'tokens'}->[$rank-1] } @{$table->{'alignment'}};
921 my %readings;
922 foreach my $e ( @elements ) {
923 next unless ref( $e ) eq 'HASH';
924 next unless exists $e->{'t'};
925 $readings{$e->{'t'}->id} = $e->{'t'};
926 }
927 return values %readings;
1dd07bda 928}
8f9cab7b 929
4e5a7b2c 930=head2 as_graphml
8e1394aa 931
4e5a7b2c 932Returns a GraphML representation of the collation. The GraphML will contain
933two graphs. The first expresses the attributes of the readings and the witness
934paths that link them; the second expresses the relationships that link the
935readings. This is the native transfer format for a tradition.
8e1394aa 936
56eefa04 937=begin testing
938
939use Text::Tradition;
940
941my $READINGS = 311;
942my $PATHS = 361;
943
944my $datafile = 't/data/florilegium_tei_ps.xml';
945my $tradition = Text::Tradition->new( 'input' => 'TEI',
946 'name' => 'test0',
947 'file' => $datafile,
948 'linear' => 1 );
949
950ok( $tradition, "Got a tradition object" );
951is( scalar $tradition->witnesses, 13, "Found all witnesses" );
952ok( $tradition->collation, "Tradition has a collation" );
953
954my $c = $tradition->collation;
955is( scalar $c->readings, $READINGS, "Collation has all readings" );
956is( scalar $c->paths, $PATHS, "Collation has all paths" );
957is( scalar $c->relationships, 0, "Collation has all relationships" );
958
959# Add a few relationships
960$c->add_relationship( 'w123', 'w125', { 'type' => 'collated' } );
961$c->add_relationship( 'w193', 'w196', { 'type' => 'collated' } );
962$c->add_relationship( 'w257', 'w262', { 'type' => 'transposition' } );
963
964# Now write it to GraphML and parse it again.
965
966my $graphml = $c->as_graphml;
967my $st = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml );
968is( scalar $st->collation->readings, $READINGS, "Reparsed collation has all readings" );
969is( scalar $st->collation->paths, $PATHS, "Reparsed collation has all paths" );
970is( scalar $st->collation->relationships, 3, "Reparsed collation has new relationships" );
971
9fef629b 972# Now add a stemma, write to GraphML, and look at the output.
2a812726 973my $stemma = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
974is( ref( $stemma ), 'Text::Tradition::Stemma', "Parsed dotfile into stemma" );
975is( $tradition->stemmata, 1, "Tradition now has the stemma" );
976$graphml = $c->as_graphml;
977like( $graphml, qr/digraph/, "Digraph declaration exists in GraphML" );
978
9fef629b 979# Now add a user, write to GraphML, and look at the output.
980unlike( $graphml, qr/testuser/, "Test user name does not exist in GraphML yet" );
981my $testuser = Text::Tradition::User->new(
982 id => 'testuser', password => 'testpass' );
983is( ref( $testuser ), 'Text::Tradition::User', "Created test user object" );
984$testuser->add_tradition( $tradition );
985is( $tradition->user->id, $testuser->id, "Tradition assigned to test user" );
986$graphml = $c->as_graphml;
987like( $graphml, qr/testuser/, "Test user name now exists in GraphML" );
988
56eefa04 989=end testing
990
8e1394aa 991=cut
992
993sub as_graphml {
a30ca502 994 my( $self, $options ) = @_;
3d14b48e 995 $self->calculate_ranks unless $self->_graphcalc_done;
996
a30ca502 997 my $start = $options->{'from'}
998 ? $self->reading( $options->{'from'} ) : $self->start;
999 my $end = $options->{'to'}
1000 ? $self->reading( $options->{'to'} ) : $self->end;
1001 if( $start->has_rank && $end->has_rank && $end->rank < $start->rank ) {
1002 throw( 'Start node must be before end node' );
1003 }
1004 # The readings need to be ranked for this to work.
1005 $start = $self->start unless $start->has_rank;
1006 $end = $self->end unless $end->has_rank;
414cc046 1007 my $rankoffset = 0;
1008 unless( $start eq $self->start ) {
1009 $rankoffset = $start->rank - 1;
1010 }
a30ca502 1011 my %use_readings;
1012
8e1394aa 1013 # Some namespaces
1014 my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns';
1015 my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance';
1016 my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' .
910a0a6d 1017 'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
8e1394aa 1018
1019 # Create the document and root node
428bcf0b 1020 require XML::LibXML;
8e1394aa 1021 my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
1022 my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
1023 $graphml->setDocumentElement( $root );
1024 $root->setNamespace( $xsi_ns, 'xsi', 0 );
1025 $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
bbd064a9 1026
1027 # List of attribute types to save on our objects and their corresponding
1028 # GraphML types
1029 my %save_types = (
1030 'Str' => 'string',
1031 'Int' => 'int',
1032 'Bool' => 'boolean',
10e4b1ac 1033 'ReadingID' => 'string',
bbd064a9 1034 'RelationshipType' => 'string',
1035 'RelationshipScope' => 'string',
1036 );
1037
bbd064a9 1038 # Add the data keys for the graph. Include an extra key 'version' for the
1039 # GraphML output version.
e309421a 1040 my %graph_data_keys;
1041 my $gdi = 0;
bbd064a9 1042 my %graph_attributes = ( 'version' => 'string' );
1043 # Graph attributes include those of Tradition and those of Collation.
1044 my %gattr_from;
1045 my $tmeta = $self->tradition->meta;
1046 my $cmeta = $self->meta;
1047 map { $gattr_from{$_->name} = 'Tradition' } $tmeta->get_all_attributes;
1048 map { $gattr_from{$_->name} = 'Collation' } $cmeta->get_all_attributes;
1049 foreach my $attr ( ( $tmeta->get_all_attributes, $cmeta->get_all_attributes ) ) {
1050 next if $attr->name =~ /^_/;
bbd064a9 1051 next unless $save_types{$attr->type_constraint->name};
1052 $graph_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1053 }
9fef629b 1054 # Extra custom keys for complex objects that should be saved in some form.
1055 # The subroutine should return a string, or undef/empty.
1056 $graph_attributes{'stemmata'} = sub {
1057 my @stemstrs;
1058 map { push( @stemstrs, $_->editable( {linesep => ''} ) ) }
1059 $self->tradition->stemmata;
1060 join( "\n", @stemstrs );
1061 };
1062 $graph_attributes{'user'} = sub {
1063 $self->tradition->user ? $self->tradition->user->id : undef
1064 };
bbd064a9 1065
1066 foreach my $datum ( sort keys %graph_attributes ) {
e309421a 1067 $graph_data_keys{$datum} = 'dg'.$gdi++;
1068 my $key = $root->addNewChild( $graphml_ns, 'key' );
9fef629b 1069 my $dtype = ref( $graph_attributes{$datum} ) ? 'string'
1070 : $graph_attributes{$datum};
e309421a 1071 $key->setAttribute( 'attr.name', $datum );
9fef629b 1072 $key->setAttribute( 'attr.type', $dtype );
e309421a 1073 $key->setAttribute( 'for', 'graph' );
1074 $key->setAttribute( 'id', $graph_data_keys{$datum} );
1075 }
f6066bac 1076
bbd064a9 1077 # Add the data keys for reading nodes
1078 my %reading_attributes;
1079 my $rmeta = Text::Tradition::Collation::Reading->meta;
1080 foreach my $attr( $rmeta->get_all_attributes ) {
1081 next if $attr->name =~ /^_/;
bbd064a9 1082 next unless $save_types{$attr->type_constraint->name};
1083 $reading_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1084 }
7cd9f181 1085 # Extra custom key for the reading morphology
1086 $reading_attributes{'lexemes'} = 'string';
1087
ef9d481f 1088 my %node_data_keys;
1089 my $ndi = 0;
bbd064a9 1090 foreach my $datum ( sort keys %reading_attributes ) {
910a0a6d 1091 $node_data_keys{$datum} = 'dn'.$ndi++;
1092 my $key = $root->addNewChild( $graphml_ns, 'key' );
1093 $key->setAttribute( 'attr.name', $datum );
bbd064a9 1094 $key->setAttribute( 'attr.type', $reading_attributes{$datum} );
910a0a6d 1095 $key->setAttribute( 'for', 'node' );
1096 $key->setAttribute( 'id', $node_data_keys{$datum} );
8e1394aa 1097 }
1098
bbd064a9 1099 # Add the data keys for edges, that is, paths and relationships. Path
1100 # data does not come from a Moose class so is here manually.
ef9d481f 1101 my $edi = 0;
1102 my %edge_data_keys;
bbd064a9 1103 my %edge_attributes = (
3a2ebbf4 1104 witness => 'string', # ID/label for a path
3a2ebbf4 1105 extra => 'boolean', # Path key
3a2ebbf4 1106 );
bbd064a9 1107 my @path_attributes = keys %edge_attributes; # track our manual additions
1108 my $pmeta = Text::Tradition::Collation::Relationship->meta;
1109 foreach my $attr( $pmeta->get_all_attributes ) {
1110 next if $attr->name =~ /^_/;
bbd064a9 1111 next unless $save_types{$attr->type_constraint->name};
1112 $edge_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1113 }
1114 foreach my $datum ( sort keys %edge_attributes ) {
3a2ebbf4 1115 $edge_data_keys{$datum} = 'de'.$edi++;
910a0a6d 1116 my $key = $root->addNewChild( $graphml_ns, 'key' );
3a2ebbf4 1117 $key->setAttribute( 'attr.name', $datum );
bbd064a9 1118 $key->setAttribute( 'attr.type', $edge_attributes{$datum} );
910a0a6d 1119 $key->setAttribute( 'for', 'edge' );
3a2ebbf4 1120 $key->setAttribute( 'id', $edge_data_keys{$datum} );
8e1394aa 1121 }
3a2ebbf4 1122
cc31ebaa 1123 # Add the collation graph itself. First, sanitize the name to a valid XML ID.
1124 my $xmlidname = $self->tradition->name;
1125 $xmlidname =~ s/(?!$xml10_namechar_rx)./_/g;
1126 if( $xmlidname !~ /^$xml10_namestartchar_rx/ ) {
1127 $xmlidname = '_'.$xmlidname;
1128 }
2c669bca 1129 my $sgraph = $root->addNewChild( $graphml_ns, 'graph' );
1130 $sgraph->setAttribute( 'edgedefault', 'directed' );
cc31ebaa 1131 $sgraph->setAttribute( 'id', $xmlidname );
2c669bca 1132 $sgraph->setAttribute( 'parse.edgeids', 'canonical' );
cc31ebaa 1133 $sgraph->setAttribute( 'parse.edges', 0 ); # fill in later
2c669bca 1134 $sgraph->setAttribute( 'parse.nodeids', 'canonical' );
cc31ebaa 1135 $sgraph->setAttribute( 'parse.nodes', 0 ); # fill in later
2c669bca 1136 $sgraph->setAttribute( 'parse.order', 'nodesfirst' );
22222af9 1137
2a812726 1138 # Tradition/collation attribute data
bbd064a9 1139 foreach my $datum ( keys %graph_attributes ) {
1140 my $value;
1141 if( $datum eq 'version' ) {
2a812726 1142 $value = '3.2';
9fef629b 1143 } elsif( ref( $graph_attributes{$datum} ) ) {
1144 my $sub = $graph_attributes{$datum};
1145 $value = &$sub();
bbd064a9 1146 } elsif( $gattr_from{$datum} eq 'Tradition' ) {
1147 $value = $self->tradition->$datum;
1148 } else {
1149 $value = $self->$datum;
1150 }
2c669bca 1151 _add_graphml_data( $sgraph, $graph_data_keys{$datum}, $value );
e309421a 1152 }
8e1394aa 1153
1154 my $node_ctr = 0;
1155 my %node_hash;
22222af9 1156 # Add our readings to the graph
3a2ebbf4 1157 foreach my $n ( sort { $a->id cmp $b->id } $self->readings ) {
a30ca502 1158 next if $n->has_rank && $n ne $self->start && $n ne $self->end &&
1159 ( $n->rank < $start->rank || $n->rank > $end->rank );
1160 $use_readings{$n->id} = 1;
2c669bca 1161 # Add to the main graph
1162 my $node_el = $sgraph->addNewChild( $graphml_ns, 'node' );
910a0a6d 1163 my $node_xmlid = 'n' . $node_ctr++;
3a2ebbf4 1164 $node_hash{ $n->id } = $node_xmlid;
910a0a6d 1165 $node_el->setAttribute( 'id', $node_xmlid );
bbd064a9 1166 foreach my $d ( keys %reading_attributes ) {
255875b8 1167 my $nval = $n->$d;
7cd9f181 1168 # Custom serialization
1169 if( $d eq 'lexemes' ) {
1170 # If nval is a true value, we have lexemes so we need to
1171 # serialize them. Otherwise set nval to undef so that the
1172 # key is excluded from this reading.
1173 $nval = $nval ? $n->_serialize_lexemes : undef;
18c64d55 1174 } elsif( $d eq 'normal_form' && $n->normal_form eq $n->text ) {
1175 $nval = undef;
7cd9f181 1176 }
cc31ebaa 1177 if( $rankoffset && $d eq 'rank' && $n ne $self->start ) {
414cc046 1178 # Adjust the ranks within the subgraph.
cc31ebaa 1179 $nval = $n eq $self->end ? $end->rank - $rankoffset + 1
1180 : $nval - $rankoffset;
414cc046 1181 }
255875b8 1182 _add_graphml_data( $node_el, $node_data_keys{$d}, $nval )
1183 if defined $nval;
1184 }
b15511bf 1185 }
1186
2c669bca 1187 # Add the path edges to the sequence graph
df6d9812 1188 my $edge_ctr = 0;
3a2ebbf4 1189 foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->sequence->edges() ) {
1190 # We add an edge in the graphml for every witness in $e.
a30ca502 1191 next unless( $use_readings{$e->[0]} || $use_readings{$e->[1]} );
1192 my @edge_wits = sort $self->path_witnesses( $e );
cc31ebaa 1193 $e->[0] = $self->start->id unless $use_readings{$e->[0]};
1194 $e->[1] = $self->end->id unless $use_readings{$e->[1]};
1195 # Skip any path from start to end; that witness is not in the subgraph.
1196 next if ( $e->[0] eq $self->start->id && $e->[1] eq $self->end->id );
a30ca502 1197 foreach my $wit ( @edge_wits ) {
3a2ebbf4 1198 my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
1199 $node_hash{ $e->[0] },
1200 $node_hash{ $e->[1] } );
2c669bca 1201 my $edge_el = $sgraph->addNewChild( $graphml_ns, 'edge' );
3a2ebbf4 1202 $edge_el->setAttribute( 'source', $from );
1203 $edge_el->setAttribute( 'target', $to );
1204 $edge_el->setAttribute( 'id', $id );
3a2ebbf4 1205
1206 # It's a witness path, so add the witness
1207 my $base = $wit;
1208 my $key = $edge_data_keys{'witness'};
1209 # Is this an ante-corr witness?
1210 my $aclabel = $self->ac_label;
1211 if( $wit =~ /^(.*)\Q$aclabel\E$/ ) {
1212 # Keep the base witness
1213 $base = $1;
1214 # ...and record that this is an 'extra' reading path
1215 _add_graphml_data( $edge_el, $edge_data_keys{'extra'}, $aclabel );
1216 }
1217 _add_graphml_data( $edge_el, $edge_data_keys{'witness'}, $base );
1218 }
1219 }
1220
cc31ebaa 1221 # Report the actual number of nodes and edges that went in
1222 $sgraph->setAttribute( 'parse.edges', $edge_ctr );
1223 $sgraph->setAttribute( 'parse.nodes', $node_ctr );
1224
22222af9 1225 # Add the relationship graph to the XML
bbd064a9 1226 map { delete $edge_data_keys{$_} } @path_attributes;
826d8773 1227 $self->relations->_as_graphml( $graphml_ns, $root, \%node_hash,
1228 $node_data_keys{'id'}, \%edge_data_keys );
8e1394aa 1229
94c00c71 1230 # Save and return the thing
1231 my $result = decode_utf8( $graphml->toString(1) );
94c00c71 1232 return $result;
df6d9812 1233}
1234
b15511bf 1235sub _add_graphml_data {
1236 my( $el, $key, $value ) = @_;
b15511bf 1237 return unless defined $value;
c9bf3dbf 1238 my $data_el = $el->addNewChild( $el->namespaceURI, 'data' );
b15511bf 1239 $data_el->setAttribute( 'key', $key );
1240 $data_el->appendText( $value );
8e1394aa 1241}
1242
4e5a7b2c 1243=head2 as_csv
910a0a6d 1244
1245Returns a CSV alignment table representation of the collation graph, one
2c669bca 1246row per witness (or witness uncorrected.)
910a0a6d 1247
1248=cut
1249
1250sub as_csv {
3a2ebbf4 1251 my( $self ) = @_;
1dd07bda 1252 my $table = $self->alignment_table;
82fa4d57 1253 my $csv = Text::CSV->new( { binary => 1, quote_null => 0 } );
910a0a6d 1254 my @result;
2c669bca 1255 # Make the header row
1256 $csv->combine( map { $_->{'witness'} } @{$table->{'alignment'}} );
1257 push( @result, decode_utf8( $csv->string ) );
1258 # Make the rest of the rows
1259 foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
566f4595 1260 my @rowobjs = map { $_->{'tokens'}->[$idx] } @{$table->{'alignment'}};
1dd07bda 1261 my @row = map { $_ ? $_->{'t'}->text : $_ } @rowobjs;
2c669bca 1262 $csv->combine( @row );
910a0a6d 1263 push( @result, decode_utf8( $csv->string ) );
1264 }
3a2ebbf4 1265 return join( "\n", @result );
910a0a6d 1266}
1267
1dd07bda 1268=head2 alignment_table( $use_refs, $include_witnesses )
2c669bca 1269
566f4595 1270Return a reference to an alignment table, in a slightly enhanced CollateX
1271format which looks like this:
1272
1273 $table = { alignment => [ { witness => "SIGIL",
4e5a7b2c 1274 tokens => [ { t => "TEXT" }, ... ] },
566f4595 1275 { witness => "SIG2",
4e5a7b2c 1276 tokens => [ { t => "TEXT" }, ... ] },
566f4595 1277 ... ],
1278 length => TEXTLEN };
1279
1280If $use_refs is set to 1, the reading object is returned in the table
1281instead of READINGTEXT; if not, the text of the reading is returned.
4e5a7b2c 1282
1283If $include_witnesses is set to a hashref, only the witnesses whose sigil
566f4595 1284keys have a true hash value will be included.
2c669bca 1285
1286=cut
9f3ba6f7 1287
1dd07bda 1288sub alignment_table {
1289 my( $self ) = @_;
c1915ab9 1290 $self->calculate_ranks() unless $self->_graphcalc_done;
1dd07bda 1291 return $self->cached_table if $self->has_cached_table;
1292
0ecb975c 1293 # Make sure we can do this
1294 throw( "Need a linear graph in order to make an alignment table" )
1295 unless $self->linear;
1296 $self->calculate_ranks unless $self->end->has_rank;
1297
2c669bca 1298 my $table = { 'alignment' => [], 'length' => $self->end->rank - 1 };
3a2ebbf4 1299 my @all_pos = ( 1 .. $self->end->rank - 1 );
68454b71 1300 foreach my $wit ( sort { $a->sigil cmp $b->sigil } $self->tradition->witnesses ) {
6771a1b1 1301 # say STDERR "Making witness row(s) for " . $wit->sigil;
1f7aa795 1302 my @wit_path = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
1dd07bda 1303 my @row = _make_witness_row( \@wit_path, \@all_pos );
2c669bca 1304 push( @{$table->{'alignment'}},
1305 { 'witness' => $wit->sigil, 'tokens' => \@row } );
1f7aa795 1306 if( $wit->is_layered ) {
1307 my @wit_ac_path = $self->reading_sequence( $self->start, $self->end,
861c3e27 1308 $wit->sigil.$self->ac_label );
1dd07bda 1309 my @ac_row = _make_witness_row( \@wit_ac_path, \@all_pos );
2c669bca 1310 push( @{$table->{'alignment'}},
1311 { 'witness' => $wit->sigil.$self->ac_label, 'tokens' => \@ac_row } );
910a0a6d 1312 }
1313 }
1dd07bda 1314 $self->cached_table( $table );
1315 return $table;
910a0a6d 1316}
1317
1318sub _make_witness_row {
1dd07bda 1319 my( $path, $positions ) = @_;
910a0a6d 1320 my %char_hash;
1321 map { $char_hash{$_} = undef } @$positions;
2c669bca 1322 my $debug = 0;
910a0a6d 1323 foreach my $rdg ( @$path ) {
eca16057 1324 my $rtext = $rdg->text;
1325 $rtext = '#LACUNA#' if $rdg->is_lacuna;
6771a1b1 1326 say STDERR "rank " . $rdg->rank if $debug;
1327 # say STDERR "No rank for " . $rdg->id unless defined $rdg->rank;
1dd07bda 1328 $char_hash{$rdg->rank} = { 't' => $rdg };
910a0a6d 1329 }
1330 my @row = map { $char_hash{$_} } @$positions;
eca16057 1331 # Fill in lacuna markers for undef spots in the row
1332 my $last_el = shift @row;
1333 my @filled_row = ( $last_el );
1334 foreach my $el ( @row ) {
0e476982 1335 # If we are using node reference, make the lacuna node appear many times
1336 # in the table. If not, use the lacuna tag.
1dd07bda 1337 if( $last_el && $last_el->{'t'}->is_lacuna && !defined $el ) {
1338 $el = $last_el;
eca16057 1339 }
1340 push( @filled_row, $el );
1341 $last_el = $el;
1342 }
1343 return @filled_row;
910a0a6d 1344}
1345
4e5a7b2c 1346=head1 NAVIGATION METHODS
910a0a6d 1347
4e5a7b2c 1348=head2 reading_sequence( $first, $last, $sigil, $backup )
e2902068 1349
1350Returns the ordered list of readings, starting with $first and ending
4e5a7b2c 1351with $last, for the witness given in $sigil. If a $backup sigil is
1352specified (e.g. when walking a layered witness), it will be used wherever
1353no $sigil path exists. If there is a base text reading, that will be
1354used wherever no path exists for $sigil or $backup.
e2902068 1355
1356=cut
1357
910a0a6d 1358# TODO Think about returning some lazy-eval iterator.
b0b4421a 1359# TODO Get rid of backup; we should know from what witness is whether we need it.
910a0a6d 1360
e2902068 1361sub reading_sequence {
861c3e27 1362 my( $self, $start, $end, $witness ) = @_;
e2902068 1363
930ff666 1364 $witness = $self->baselabel unless $witness;
e2902068 1365 my @readings = ( $start );
1366 my %seen;
1367 my $n = $start;
3a2ebbf4 1368 while( $n && $n->id ne $end->id ) {
1369 if( exists( $seen{$n->id} ) ) {
63778331 1370 throw( "Detected loop for $witness at " . $n->id );
910a0a6d 1371 }
3a2ebbf4 1372 $seen{$n->id} = 1;
910a0a6d 1373
861c3e27 1374 my $next = $self->next_reading( $n, $witness );
44771cf2 1375 unless( $next ) {
63778331 1376 throw( "Did not find any path for $witness from reading " . $n->id );
44771cf2 1377 }
910a0a6d 1378 push( @readings, $next );
1379 $n = $next;
e2902068 1380 }
1381 # Check that the last reading is our end reading.
1382 my $last = $readings[$#readings];
63778331 1383 throw( "Last reading found from " . $start->text .
1384 " for witness $witness is not the end!" ) # TODO do we get this far?
3a2ebbf4 1385 unless $last->id eq $end->id;
e2902068 1386
1387 return @readings;
1388}
1389
4e5a7b2c 1390=head2 next_reading( $reading, $sigil );
8e1394aa 1391
4a8828f0 1392Returns the reading that follows the given reading along the given witness
930ff666 1393path.
8e1394aa 1394
1395=cut
1396
4a8828f0 1397sub next_reading {
e2902068 1398 # Return the successor via the corresponding path.
8e1394aa 1399 my $self = shift;
3a2ebbf4 1400 my $answer = $self->_find_linked_reading( 'next', @_ );
2c669bca 1401 return undef unless $answer;
3a2ebbf4 1402 return $self->reading( $answer );
8e1394aa 1403}
1404
4e5a7b2c 1405=head2 prior_reading( $reading, $sigil )
8e1394aa 1406
4a8828f0 1407Returns the reading that precedes the given reading along the given witness
930ff666 1408path.
8e1394aa 1409
1410=cut
1411
4a8828f0 1412sub prior_reading {
e2902068 1413 # Return the predecessor via the corresponding path.
8e1394aa 1414 my $self = shift;
3a2ebbf4 1415 my $answer = $self->_find_linked_reading( 'prior', @_ );
1416 return $self->reading( $answer );
8e1394aa 1417}
1418
4a8828f0 1419sub _find_linked_reading {
861c3e27 1420 my( $self, $direction, $node, $path ) = @_;
1421
1422 # Get a backup if we are dealing with a layered witness
1423 my $alt_path;
1424 my $aclabel = $self->ac_label;
1425 if( $path && $path =~ /^(.*)\Q$aclabel\E$/ ) {
1426 $alt_path = $1;
1427 }
1428
e2902068 1429 my @linked_paths = $direction eq 'next'
3a2ebbf4 1430 ? $self->sequence->edges_from( $node )
1431 : $self->sequence->edges_to( $node );
e2902068 1432 return undef unless scalar( @linked_paths );
8e1394aa 1433
e2902068 1434 # We have to find the linked path that contains all of the
1435 # witnesses supplied in $path.
1436 my( @path_wits, @alt_path_wits );
4e5a7b2c 1437 @path_wits = sort( $self->_witnesses_of_label( $path ) ) if $path;
1438 @alt_path_wits = sort( $self->_witnesses_of_label( $alt_path ) ) if $alt_path;
e2902068 1439 my $base_le;
1440 my $alt_le;
1441 foreach my $le ( @linked_paths ) {
3a2ebbf4 1442 if( $self->sequence->has_edge_attribute( @$le, $self->baselabel ) ) {
910a0a6d 1443 $base_le = $le;
910a0a6d 1444 }
508fd430 1445 my @le_wits = sort $self->path_witnesses( $le );
3a2ebbf4 1446 if( _is_within( \@path_wits, \@le_wits ) ) {
1447 # This is the right path.
1448 return $direction eq 'next' ? $le->[1] : $le->[0];
1449 } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) {
1450 $alt_le = $le;
1451 }
8e1394aa 1452 }
e2902068 1453 # Got this far? Return the alternate path if it exists.
3a2ebbf4 1454 return $direction eq 'next' ? $alt_le->[1] : $alt_le->[0]
910a0a6d 1455 if $alt_le;
e2902068 1456
1457 # Got this far? Return the base path if it exists.
3a2ebbf4 1458 return $direction eq 'next' ? $base_le->[1] : $base_le->[0]
910a0a6d 1459 if $base_le;
e2902068 1460
1461 # Got this far? We have no appropriate path.
2c669bca 1462 warn "Could not find $direction node from " . $node->id
910a0a6d 1463 . " along path $path";
8e1394aa 1464 return undef;
1465}
1466
4a8828f0 1467# Some set logic.
1468sub _is_within {
1469 my( $set1, $set2 ) = @_;
7854e12e 1470 my $ret = @$set1; # will be 0, i.e. false, if set1 is empty
4a8828f0 1471 foreach my $el ( @$set1 ) {
910a0a6d 1472 $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
4a8828f0 1473 }
1474 return $ret;
1475}
1476
4e5a7b2c 1477# Return the string that joins together a list of witnesses for
1478# display on a single path.
1479sub _witnesses_of_label {
1480 my( $self, $label ) = @_;
1481 my $regex = $self->wit_list_separator;
1482 my @answer = split( /\Q$regex\E/, $label );
1483 return @answer;
b0b4421a 1484}
1485
d4b75f44 1486=head2 common_readings
1487
1488Returns the list of common readings in the graph (i.e. those readings that are
1489shared by all non-lacunose witnesses.)
1490
1491=cut
1492
1493sub common_readings {
1494 my $self = shift;
1495 my @common = grep { $_->is_common } $self->readings;
1496 return @common;
1497}
1498
fae52efd 1499=head2 path_text( $sigil, [, $start, $end ] )
b0b4421a 1500
1501Returns the text of a witness (plus its backup, if we are using a layer)
1502as stored in the collation. The text is returned as a string, where the
1503individual readings are joined with spaces and the meta-readings (e.g.
1504lacunae) are omitted. Optional specification of $start and $end allows
1505the generation of a subset of the witness text.
4e5a7b2c 1506
b0b4421a 1507=cut
1508
1509sub path_text {
861c3e27 1510 my( $self, $wit, $start, $end ) = @_;
b0b4421a 1511 $start = $self->start unless $start;
1512 $end = $self->end unless $end;
861c3e27 1513 my @path = grep { !$_->is_meta } $self->reading_sequence( $start, $end, $wit );
629e27b0 1514 my $pathtext = '';
1515 my $last;
1516 foreach my $r ( @path ) {
6ad2ce78 1517 unless ( $r->join_prior || !$last || $last->join_next ) {
1518 $pathtext .= ' ';
1519 }
1520 $pathtext .= $r->text;
629e27b0 1521 $last = $r;
1522 }
1523 return $pathtext;
b0b4421a 1524}
4e5a7b2c 1525
1526=head1 INITIALIZATION METHODS
1527
1528These are mostly for use by parsers.
1529
1530=head2 make_witness_path( $witness )
1531
1532Link the array of readings contained in $witness->path (and in
1533$witness->uncorrected_path if it exists) into collation paths.
1534Clear out the arrays when finished.
de51424a 1535
4e5a7b2c 1536=head2 make_witness_paths
1537
1538Call make_witness_path for all witnesses in the tradition.
1539
1540=cut
930ff666 1541
7e450e44 1542# For use when a collation is constructed from a base text and an apparatus.
1543# We have the sequences of readings and just need to add path edges.
1f7aa795 1544# When we are done, clear out the witness path attributes, as they are no
1545# longer needed.
1546# TODO Find a way to replace the witness path attributes with encapsulated functions?
e2902068 1547
6a222840 1548sub make_witness_paths {
1549 my( $self ) = @_;
910a0a6d 1550 foreach my $wit ( $self->tradition->witnesses ) {
6771a1b1 1551 # say STDERR "Making path for " . $wit->sigil;
910a0a6d 1552 $self->make_witness_path( $wit );
7854e12e 1553 }
7854e12e 1554}
1555
6a222840 1556sub make_witness_path {
7854e12e 1557 my( $self, $wit ) = @_;
1558 my @chain = @{$wit->path};
15d2d3df 1559 my $sig = $wit->sigil;
fae52efd 1560 # Add start and end if necessary
1561 unshift( @chain, $self->start ) unless $chain[0] eq $self->start;
1562 push( @chain, $self->end ) unless $chain[-1] eq $self->end;
7854e12e 1563 foreach my $idx ( 0 .. $#chain-1 ) {
910a0a6d 1564 $self->add_path( $chain[$idx], $chain[$idx+1], $sig );
7854e12e 1565 }
1f7aa795 1566 if( $wit->is_layered ) {
d9e873d0 1567 @chain = @{$wit->uncorrected_path};
fae52efd 1568 unshift( @chain, $self->start ) unless $chain[0] eq $self->start;
1569 push( @chain, $self->end ) unless $chain[-1] eq $self->end;
d9e873d0 1570 foreach my $idx( 0 .. $#chain-1 ) {
1571 my $source = $chain[$idx];
1572 my $target = $chain[$idx+1];
1573 $self->add_path( $source, $target, $sig.$self->ac_label )
1574 unless $self->has_path( $source, $target, $sig );
1575 }
15d2d3df 1576 }
1f7aa795 1577 $wit->clear_path;
1578 $wit->clear_uncorrected_path;
e2902068 1579}
1580
4e5a7b2c 1581=head2 calculate_ranks
1582
1583Calculate the reading ranks (that is, their aligned positions relative
1584to each other) for the graph. This can only be called on linear collations.
1585
b365fbae 1586=begin testing
1587
1588use Text::Tradition;
1589
1590my $cxfile = 't/data/Collatex-16.xml';
1591my $t = Text::Tradition->new(
1592 'name' => 'inline',
1593 'input' => 'CollateX',
1594 'file' => $cxfile,
1595 );
1596my $c = $t->collation;
1597
1598# Make an svg
bfcbcecb 1599my $table = $c->alignment_table;
1600ok( $c->has_cached_table, "Alignment table was cached" );
1601is( $c->alignment_table, $table, "Cached table returned upon second call" );
b365fbae 1602$c->calculate_ranks;
bfcbcecb 1603is( $c->alignment_table, $table, "Cached table retained with no rank change" );
679f17e1 1604$c->add_relationship( 'n24', 'n23', { 'type' => 'spelling' } );
bfcbcecb 1605isnt( $c->alignment_table, $table, "Alignment table changed after relationship add" );
b365fbae 1606
1607=end testing
1608
4e5a7b2c 1609=cut
1610
910a0a6d 1611sub calculate_ranks {
1612 my $self = shift;
b365fbae 1613 # Save the existing ranks, in case we need to invalidate the cached SVG.
1614 my %existing_ranks;
ac4d7ac2 1615 map { $existing_ranks{$_} = $_->rank } $self->readings;
359944f7 1616
1617 # Do the rankings based on the relationship equivalence graph, starting
1618 # with the start node.
56772e8c 1619 my ( $node_ranks, $rank_nodes ) = $self->relations->equivalence_ranks();
1620
910a0a6d 1621 # Transfer our rankings from the topological graph to the real one.
1622 foreach my $r ( $self->readings ) {
cecbe56d 1623 if( defined $node_ranks->{$self->equivalence( $r->id )} ) {
359944f7 1624 $r->rank( $node_ranks->{$self->equivalence( $r->id )} );
67da8d6c 1625 } else {
63778331 1626 # Die. Find the last rank we calculated.
359944f7 1627 my @all_defined = sort { ( $node_ranks->{$self->equivalence( $a->id )}||-1 )
1628 <=> ( $node_ranks->{$self->equivalence( $b->id )}||-1 ) }
63778331 1629 $self->readings;
1630 my $last = pop @all_defined;
1631 throw( "Ranks not calculated after $last - do you have a cycle in the graph?" );
67da8d6c 1632 }
de51424a 1633 }
bfcbcecb 1634 # Do we need to invalidate the cached data?
be3af600 1635 if( $self->has_cached_table ) {
b365fbae 1636 foreach my $r ( $self->readings ) {
7c293912 1637 next if defined( $existing_ranks{$r} )
1638 && $existing_ranks{$r} == $r->rank;
c1915ab9 1639 # Something has changed, so clear the cache
bfcbcecb 1640 $self->_clear_cache;
c1915ab9 1641 # ...and recalculate the common readings.
1642 $self->calculate_common_readings();
b365fbae 1643 last;
1644 }
1645 }
c1915ab9 1646 # The graph calculation information is now up to date.
1647 $self->_graphcalc_done(1);
8e1394aa 1648}
3a1f2523 1649
c1915ab9 1650sub _clear_cache {
1651 my $self = shift;
c1915ab9 1652 $self->wipe_table if $self->has_cached_table;
1653}
1654
1655
4e5a7b2c 1656=head2 flatten_ranks
1657
1658A convenience method for parsing collation data. Searches the graph for readings
1659with the same text at the same rank, and merges any that are found.
1660
1661=cut
1662
0e476982 1663sub flatten_ranks {
1664 my $self = shift;
1665 my %unique_rank_rdg;
bf6e338d 1666 my $changed;
0e476982 1667 foreach my $rdg ( $self->readings ) {
1668 next unless $rdg->has_rank;
1669 my $key = $rdg->rank . "||" . $rdg->text;
1670 if( exists $unique_rank_rdg{$key} ) {
07e6765f 1671 # Make sure they don't have different grammatical forms
1672 my $ur = $unique_rank_rdg{$key};
1673 if( $rdg->disambiguated && $ur->disambiguated ) {
1674 my $rform = join( '//', map { $_->form->to_string } $rdg->lexemes );
1675 my $uform = join( '//', map { $_->form->to_string } $ur->lexemes );
1676 next unless $rform eq $uform;
1677 } elsif( $rdg->disambiguated xor $ur->disambiguated ) {
1678 next;
1679 }
0e476982 1680 # Combine!
6771a1b1 1681 #say STDERR "Combining readings at same rank: $key";
bf6e338d 1682 $changed = 1;
0e476982 1683 $self->merge_readings( $unique_rank_rdg{$key}, $rdg );
0ecb975c 1684 # TODO see if this now makes a common point.
0e476982 1685 } else {
1686 $unique_rank_rdg{$key} = $rdg;
1687 }
1688 }
bf6e338d 1689 # If we merged readings, the ranks are still fine but the alignment
1690 # table is wrong. Wipe it.
1691 $self->wipe_table() if $changed;
0e476982 1692}
4633f9e4 1693
1694
d4b75f44 1695=head2 calculate_common_readings
1696
1697Goes through the graph identifying the readings that appear in every witness
1698(apart from those with lacunae at that spot.) Marks them as common and returns
1699the list.
1700
1701=begin testing
1702
1703use Text::Tradition;
1704
1705my $cxfile = 't/data/Collatex-16.xml';
1706my $t = Text::Tradition->new(
1707 'name' => 'inline',
1708 'input' => 'CollateX',
1709 'file' => $cxfile,
1710 );
1711my $c = $t->collation;
1712
1713my @common = $c->calculate_common_readings();
1714is( scalar @common, 8, "Found correct number of common readings" );
1715my @marked = sort $c->common_readings();
1716is( scalar @common, 8, "All common readings got marked as such" );
679f17e1 1717my @expected = qw/ n1 n11 n16 n19 n20 n5 n6 n7 /;
d4b75f44 1718is_deeply( \@marked, \@expected, "Found correct list of common readings" );
1719
1720=end testing
1721
1722=cut
1723
1724sub calculate_common_readings {
1725 my $self = shift;
1726 my @common;
c1915ab9 1727 map { $_->is_common( 0 ) } $self->readings;
1728 # Implicitly calls calculate_ranks
1dd07bda 1729 my $table = $self->alignment_table;
d4b75f44 1730 foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
7f52eac8 1731 my @row = map { $_->{'tokens'}->[$idx]
1732 ? $_->{'tokens'}->[$idx]->{'t'} : '' }
1733 @{$table->{'alignment'}};
d4b75f44 1734 my %hash;
1735 foreach my $r ( @row ) {
1736 if( $r ) {
1737 $hash{$r->id} = $r unless $r->is_meta;
1738 } else {
1739 $hash{'UNDEF'} = $r;
1740 }
1741 }
1742 if( keys %hash == 1 && !exists $hash{'UNDEF'} ) {
1743 my( $r ) = values %hash;
1744 $r->is_common( 1 );
1745 push( @common, $r );
1746 }
1747 }
1748 return @common;
1749}
1750
861c3e27 1751=head2 text_from_paths
1752
1753Calculate the text array for all witnesses from the path, for later consistency
1754checking. Only to be used if there is no non-graph-based way to know the
1755original texts.
1756
1757=cut
1758
1759sub text_from_paths {
1760 my $self = shift;
1761 foreach my $wit ( $self->tradition->witnesses ) {
5164a6f0 1762 my @readings = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
1763 my @text;
1764 foreach my $r ( @readings ) {
1765 next if $r->is_meta;
1766 push( @text, $r->text );
1767 }
861c3e27 1768 $wit->text( \@text );
1769 if( $wit->is_layered ) {
5164a6f0 1770 my @ucrdgs = $self->reading_sequence( $self->start, $self->end,
1771 $wit->sigil.$self->ac_label );
1772 my @uctext;
1773 foreach my $r ( @ucrdgs ) {
1774 next if $r->is_meta;
1775 push( @uctext, $r->text );
1776 }
1777 $wit->layertext( \@uctext );
861c3e27 1778 }
1779 }
1780}
0e476982 1781
4e5a7b2c 1782=head1 UTILITY FUNCTIONS
1783
1784=head2 common_predecessor( $reading_a, $reading_b )
8e1394aa 1785
4e5a7b2c 1786Find the last reading that occurs in sequence before both the given readings.
414cc046 1787At the very least this should be $self->start.
4e5a7b2c 1788
1789=head2 common_successor( $reading_a, $reading_b )
1790
1791Find the first reading that occurs in sequence after both the given readings.
414cc046 1792At the very least this should be $self->end.
4e5a7b2c 1793
22222af9 1794=begin testing
1795
1796use Text::Tradition;
1797
1798my $cxfile = 't/data/Collatex-16.xml';
1799my $t = Text::Tradition->new(
1800 'name' => 'inline',
1801 'input' => 'CollateX',
1802 'file' => $cxfile,
1803 );
1804my $c = $t->collation;
1805
679f17e1 1806is( $c->common_predecessor( 'n24', 'n23' )->id,
22222af9 1807 'n20', "Found correct common predecessor" );
679f17e1 1808is( $c->common_successor( 'n24', 'n23' )->id,
10e4b1ac 1809 '__END__', "Found correct common successor" );
22222af9 1810
4e5a7b2c 1811is( $c->common_predecessor( 'n19', 'n17' )->id,
22222af9 1812 'n16', "Found correct common predecessor for readings on same path" );
679f17e1 1813is( $c->common_successor( 'n21', 'n10' )->id,
10e4b1ac 1814 '__END__', "Found correct common successor for readings on same path" );
22222af9 1815
1816=end testing
1817
1818=cut
1819
1820## Return the closest reading that is a predecessor of both the given readings.
1821sub common_predecessor {
1822 my $self = shift;
4e5a7b2c 1823 my( $r1, $r2 ) = $self->_objectify_args( @_ );
027d819c 1824 return $self->_common_in_path( $r1, $r2, 'predecessors' );
22222af9 1825}
1826
1827sub common_successor {
1828 my $self = shift;
4e5a7b2c 1829 my( $r1, $r2 ) = $self->_objectify_args( @_ );
027d819c 1830 return $self->_common_in_path( $r1, $r2, 'successors' );
22222af9 1831}
1832
414cc046 1833
1834# TODO think about how to do this without ranks...
027d819c 1835sub _common_in_path {
22222af9 1836 my( $self, $r1, $r2, $dir ) = @_;
414cc046 1837 my $iter = $self->end->rank;
22222af9 1838 my @candidates;
414cc046 1839 my @last_r1 = ( $r1 );
1840 my @last_r2 = ( $r2 );
1841 # my %all_seen = ( $r1 => 'r1', $r2 => 'r2' );
22222af9 1842 my %all_seen;
6771a1b1 1843 # say STDERR "Finding common $dir for $r1, $r2";
22222af9 1844 while( !@candidates ) {
414cc046 1845 last unless $iter--; # Avoid looping infinitely
1846 # Iterate separately down the graph from r1 and r2
1847 my( @new_lc1, @new_lc2 );
1848 foreach my $lc ( @last_r1 ) {
1849 foreach my $p ( $lc->$dir ) {
1850 if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r1' ) {
6771a1b1 1851 # say STDERR "Path candidate $p from $lc";
414cc046 1852 push( @candidates, $p );
002e3600 1853 } elsif( !$all_seen{$p->id} ) {
414cc046 1854 $all_seen{$p->id} = 'r1';
1855 push( @new_lc1, $p );
1856 }
1857 }
1858 }
1859 foreach my $lc ( @last_r2 ) {
22222af9 1860 foreach my $p ( $lc->$dir ) {
414cc046 1861 if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r2' ) {
6771a1b1 1862 # say STDERR "Path candidate $p from $lc";
22222af9 1863 push( @candidates, $p );
002e3600 1864 } elsif( !$all_seen{$p->id} ) {
414cc046 1865 $all_seen{$p->id} = 'r2';
1866 push( @new_lc2, $p );
22222af9 1867 }
1868 }
1869 }
414cc046 1870 @last_r1 = @new_lc1;
1871 @last_r2 = @new_lc2;
22222af9 1872 }
1873 my @answer = sort { $a->rank <=> $b->rank } @candidates;
1874 return $dir eq 'predecessors' ? pop( @answer ) : shift ( @answer );
1875}
1876
63778331 1877sub throw {
1878 Text::Tradition::Error->throw(
1879 'ident' => 'Collation error',
1880 'message' => $_[0],
1881 );
1882}
1883
dd3b58b0 1884no Moose;
1885__PACKAGE__->meta->make_immutable;
e867486f 1886
027d819c 1887=head1 LICENSE
e867486f 1888
027d819c 1889This package is free software and is provided "as is" without express
1890or implied warranty. You can redistribute it and/or modify it under
1891the same terms as Perl itself.
e867486f 1892
027d819c 1893=head1 AUTHOR
e867486f 1894
027d819c 1895Tara L Andrews E<lt>aurum@cpan.orgE<gt>