Add logic for normalized form of lemma to propagate on orth/spelling links
[scpubgit/stemmatology.git] / base / lib / Text / Tradition / Collation.pm
CommitLineData
dd3b58b0 1package Text::Tradition::Collation;
d047cd52 2
6771a1b1 3use feature 'say';
910a0a6d 4use Encode qw( decode_utf8 );
5use File::Temp;
bfcbcecb 6use File::Which;
c9bf3dbf 7use Graph;
8e1394aa 8use IPC::Run qw( run binary );
522c1314 9use JSON qw/ to_json /;
82fa4d57 10use Text::CSV;
5c0072ef 11use Text::Tradition::Collation::Data;
b15511bf 12use Text::Tradition::Collation::Reading;
22222af9 13use Text::Tradition::Collation::RelationshipStore;
63778331 14use Text::Tradition::Error;
cc31ebaa 15use XML::Easy::Syntax qw( $xml10_namestartchar_rx $xml10_namechar_rx );
5c0072ef 16use XML::LibXML;
17use XML::LibXML::XPathContext;
dd3b58b0 18use Moose;
19
5c0072ef 20has _data => (
21 isa => 'Text::Tradition::Collation::Data',
22 is => 'ro',
23 required => 1,
24 handles => [ qw(
25 sequence
26 paths
27 _set_relations
28 relations
29 _set_start
30 _set_end
31 ac_label
32 has_cached_table
33 relationships
0bbea47d 34 relationship_types
5c0072ef 35 related_readings
36 get_relationship
37 del_relationship
38 equivalence
39 equivalence_graph
40 readings
41 reading
42 _add_reading
43 del_reading
44 has_reading
45 wit_list_separator
46 baselabel
47 linear
48 wordsep
49 start
50 end
51 cached_table
52 _graphcalc_done
53 has_cached_svg
54 wipe_table
55 )]
56);
dd3b58b0 57
3a2ebbf4 58has 'tradition' => (
59 is => 'ro',
d047cd52 60 isa => 'Text::Tradition',
8cfd99c4 61 writer => '_set_tradition',
8d9a1cd8 62 weak_ref => 1,
d047cd52 63 );
dd3b58b0 64
6b6c7098 65=encoding utf8
66
4e5a7b2c 67=head1 NAME
68
69Text::Tradition::Collation - a software model for a text collation
70
71=head1 SYNOPSIS
72
73 use Text::Tradition;
74 my $t = Text::Tradition->new(
75 'name' => 'this is a text',
76 'input' => 'TEI',
77 'file' => '/path/to/tei_parallel_seg_file.xml' );
78
79 my $c = $t->collation;
80 my @readings = $c->readings;
81 my @paths = $c->paths;
82 my @relationships = $c->relationships;
83
84 my $svg_variant_graph = $t->collation->as_svg();
85
86=head1 DESCRIPTION
87
88Text::Tradition is a library for representation and analysis of collated
89texts, particularly medieval ones. The Collation is the central feature of
90a Tradition, where the text, its sequence of readings, and its relationships
91between readings are actually kept.
92
93=head1 CONSTRUCTOR
94
95=head2 new
96
97The constructor. Takes a hash or hashref of the following arguments:
98
99=over
100
101=item * tradition - The Text::Tradition object to which the collation
102belongs. Required.
103
104=item * linear - Whether the collation should be linear; that is, whether
105transposed readings should be treated as two linked readings rather than one,
106and therefore whether the collation graph is acyclic. Defaults to true.
107
4e5a7b2c 108=item * baselabel - The default label for the path taken by a base text
109(if any). Defaults to 'base text'.
110
111=item * wit_list_separator - The string to join a list of witnesses for
112purposes of making labels in display graphs. Defaults to ', '.
113
114=item * ac_label - The extra label to tack onto a witness sigil when
115representing another layer of path for the given witness - that is, when
116a text has more than one possible reading due to scribal corrections or
117the like. Defaults to ' (a.c.)'.
118
4e483aa5 119=item * wordsep - The string used to separate words in the original text.
120Defaults to ' '.
121
4e5a7b2c 122=back
123
124=head1 ACCESSORS
125
126=head2 tradition
127
128=head2 linear
129
4e5a7b2c 130=head2 wit_list_separator
131
132=head2 baselabel
133
134=head2 ac_label
135
4e483aa5 136=head2 wordsep
137
4e5a7b2c 138Simple accessors for collation attributes.
139
140=head2 start
141
142The meta-reading at the start of every witness path.
143
144=head2 end
145
146The meta-reading at the end of every witness path.
147
148=head2 readings
149
150Returns all Reading objects in the graph.
151
152=head2 reading( $id )
153
154Returns the Reading object corresponding to the given ID.
155
156=head2 add_reading( $reading_args )
157
158Adds a new reading object to the collation.
159See L<Text::Tradition::Collation::Reading> for the available arguments.
160
161=head2 del_reading( $object_or_id )
162
163Removes the given reading from the collation, implicitly removing its
164paths and relationships.
165
4e5a7b2c 166=head2 has_reading( $id )
167
168Predicate to see whether a given reading ID is in the graph.
169
170=head2 reading_witnesses( $object_or_id )
171
172Returns a list of sigils whose witnesses contain the reading.
173
174=head2 paths
175
176Returns all reading paths within the document - that is, all edges in the
177collation graph. Each path is an arrayref of [ $source, $target ] reading IDs.
178
179=head2 add_path( $source, $target, $sigil )
180
181Links the given readings in the collation in sequence, under the given witness
182sigil. The readings may be specified by object or ID.
183
184=head2 del_path( $source, $target, $sigil )
185
186Links the given readings in the collation in sequence, under the given witness
187sigil. The readings may be specified by object or ID.
188
189=head2 has_path( $source, $target );
190
191Returns true if the two readings are linked in sequence in any witness.
192The readings may be specified by object or ID.
193
194=head2 relationships
195
196Returns all Relationship objects in the collation.
197
198=head2 add_relationship( $reading, $other_reading, $options )
199
200Adds a new relationship of the type given in $options between the two readings,
201which may be specified by object or ID. Returns a value of ( $status, @vectors)
202where $status is true on success, and @vectors is a list of relationship edges
203that were ultimately added.
204See L<Text::Tradition::Collation::Relationship> for the available options.
205
206=cut
dd3b58b0 207
5c0072ef 208sub BUILDARGS {
209 my ( $class, @args ) = @_;
210 my %args = @args == 1 ? %{ $args[0] } : @args;
211 # TODO determine these from the Moose::Meta object
212 my @delegate_attrs = qw(sequence relations readings wit_list_separator baselabel
213 linear wordsep start end cached_table _graphcalc_done);
214 my %data_args;
215 for my $attr (@delegate_attrs) {
216 $data_args{$attr} = delete $args{$attr} if exists $args{$attr};
217 }
218 $args{_data} = Text::Tradition::Collation::Data->new(%data_args);
219 return \%args;
220}
221
d047cd52 222sub BUILD {
3a2ebbf4 223 my $self = shift;
22222af9 224 $self->_set_relations( Text::Tradition::Collation::RelationshipStore->new( 'collation' => $self ) );
e4b73942 225 $self->_set_start( $self->add_reading(
226 { 'collation' => $self, 'is_start' => 1, 'init' => 1 } ) );
227 $self->_set_end( $self->add_reading(
228 { 'collation' => $self, 'is_end' => 1, 'init' => 1 } ) );
d047cd52 229}
784877d9 230
b34fd19d 231=head2 register_relationship_type( %relationship_definition )
232
233Add a relationship type definition to this collation. The argument can be either a
234hash or a hashref, defining the properties of the relationship. For relationship types
235and their properties, see L<Text::Tradition::Collation::RelationshipType>.
236
237=head2 get_relationship_type( $relationship_name )
238
239Retrieve the RelationshipType object for the relationship with the given name.
240
241=cut
242
24efa55d 243sub register_relationship_type {
244 my $self = shift;
245 my %args = @_ == 1 ? %{$_[0]} : @_;
246 if( $self->relations->has_type( $args{name} ) ) {
247 throw( 'Relationship type ' . $args{name} . ' already registered' );
248 }
249 $self->relations->add_type( %args );
250}
251
39d1ddc7 252sub get_relationship_type {
253 my( $self, $name ) = @_;
254 return $self->relations->has_type( $name )
255 ? $self->relations->type( $name ) : undef;
256}
257
3a2ebbf4 258### Reading construct/destruct functions
259
260sub add_reading {
261 my( $self, $reading ) = @_;
262 unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
263 my %args = %$reading;
e4b73942 264 if( $args{'init'} ) {
265 # If we are initializing an empty collation, don't assume that we
266 # have set a tradition.
267 delete $args{'init'};
8943ff68 268 } elsif( $self->tradition->can('language') && $self->tradition->has_language
269 && !exists $args{'language'} ) {
6ad2ce78 270 $args{'language'} = $self->tradition->language;
271 }
3a2ebbf4 272 $reading = Text::Tradition::Collation::Reading->new(
273 'collation' => $self,
274 %args );
275 }
276 # First check to see if a reading with this ID exists.
277 if( $self->reading( $reading->id ) ) {
63778331 278 throw( "Collation already has a reading with id " . $reading->id );
3a2ebbf4 279 }
c1915ab9 280 $self->_graphcalc_done(0);
3a2ebbf4 281 $self->_add_reading( $reading->id => $reading );
282 # Once the reading has been added, put it in both graphs.
283 $self->sequence->add_vertex( $reading->id );
cecbe56d 284 $self->relations->add_reading( $reading->id );
3a2ebbf4 285 return $reading;
eca16057 286};
287
3a2ebbf4 288around del_reading => sub {
289 my $orig = shift;
290 my $self = shift;
291 my $arg = shift;
292
cecbe56d 293 if( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
294 $arg = $arg->id;
3a2ebbf4 295 }
3a2ebbf4 296 # Remove the reading from the graphs.
c1915ab9 297 $self->_graphcalc_done(0);
4e483aa5 298 $self->_clear_cache; # Explicitly clear caches to GC the reading
cecbe56d 299 $self->sequence->delete_vertex( $arg );
300 $self->relations->delete_reading( $arg );
3a2ebbf4 301
302 # Carry on.
cecbe56d 303 $self->$orig( $arg );
3a2ebbf4 304};
7854e12e 305
3c234eb6 306=head2 merge_readings( $main, $second, $concatenate, $with_str )
307
308Merges the $second reading into the $main one. If $concatenate is true, then
309the merged node will carry the text of both readings, concatenated with either
310$with_str (if specified) or a sensible default (the empty string if the
311appropriate 'join_*' flag is set on either reading, or else $self->wordsep.)
312
313The first two arguments may be either readings or reading IDs.
314
4e483aa5 315=begin testing
316
317use Text::Tradition;
58568d5c 318use TryCatch;
4e483aa5 319
320my $cxfile = 't/data/Collatex-16.xml';
321my $t = Text::Tradition->new(
322 'name' => 'inline',
323 'input' => 'CollateX',
324 'file' => $cxfile,
325 );
326my $c = $t->collation;
327
328my $rno = scalar $c->readings;
4ef65ab4 329# Split n21 ('unto') for testing purposes
4e483aa5 330my $new_r = $c->add_reading( { 'id' => 'n21p0', 'text' => 'un', 'join_next' => 1 } );
331my $old_r = $c->reading( 'n21' );
332$old_r->alter_text( 'to' );
333$c->del_path( 'n20', 'n21', 'A' );
334$c->add_path( 'n20', 'n21p0', 'A' );
335$c->add_path( 'n21p0', 'n21', 'A' );
7a0956c1 336$c->add_relationship( 'n21', 'n22', { type => 'collated', scope => 'local' } );
4e483aa5 337$c->flatten_ranks();
338ok( $c->reading( 'n21p0' ), "New reading exists" );
339is( scalar $c->readings, $rno, "Reading add offset by flatten_ranks" );
340
679f17e1 341# Combine n3 and n4 ( with his )
4e483aa5 342$c->merge_readings( 'n3', 'n4', 1 );
343ok( !$c->reading('n4'), "Reading n4 is gone" );
344is( $c->reading('n3')->text, 'with his', "Reading n3 has both words" );
345
679f17e1 346# Collapse n9 and n10 ( rood / root )
347$c->merge_readings( 'n9', 'n10' );
348ok( !$c->reading('n10'), "Reading n10 is gone" );
349is( $c->reading('n9')->text, 'rood', "Reading n9 has an unchanged word" );
4e483aa5 350
58568d5c 351# Try to combine n21 and n21p0. This should break.
4e483aa5 352my $remaining = $c->reading('n21');
353$remaining ||= $c->reading('n22'); # one of these should still exist
58568d5c 354try {
355 $c->merge_readings( 'n21p0', $remaining, 1 );
356 ok( 0, "Bad reading merge changed the graph" );
357} catch( Text::Tradition::Error $e ) {
358 like( $e->message, qr/neither concatenated nor collated/, "Expected exception from bad concatenation" );
359} catch {
360 ok( 0, "Unexpected error on bad reading merge: $@" );
361}
362
363try {
364 $c->calculate_ranks();
365 ok( 1, "Graph is still evidently whole" );
366} catch( Text::Tradition::Error $e ) {
367 ok( 0, "Caught a rank exception: " . $e->message );
368}
4e483aa5 369
370=end testing
371
372=cut
7854e12e 373
3a2ebbf4 374sub merge_readings {
375 my $self = shift;
376
56772e8c 377 # Sanity check
378 my( $kept_obj, $del_obj, $combine, $combine_char ) = $self->_objectify_args( @_ );
379 my $mergemeta = $kept_obj->is_meta;
380 throw( "Cannot merge meta and non-meta reading" )
381 unless ( $mergemeta && $del_obj->is_meta )
382 || ( !$mergemeta && !$del_obj->is_meta );
383 if( $mergemeta ) {
384 throw( "Cannot merge with start or end node" )
385 if( $kept_obj eq $self->start || $kept_obj eq $self->end
386 || $del_obj eq $self->start || $del_obj eq $self->end );
a445ce40 387 throw( "Cannot combine text of meta readings" ) if $combine;
56772e8c 388 }
e4f00901 389 # We can only merge readings in a linear graph if:
390 # - they are contiguous with only one edge between them, OR
391 # - they are at equivalent ranks in the graph.
392 if( $self->linear ) {
393 my @delpred = $del_obj->predecessors;
394 my @keptsuc = $kept_obj->successors;
395 unless ( @delpred == 1 && $delpred[0] eq $kept_obj
396 && @keptsuc == 1 && $keptsuc[0] eq $del_obj ) {
397 my( $is_ok, $msg ) = $self->relations->relationship_valid(
398 $kept_obj, $del_obj, 'collated' );
399 unless( $is_ok ) {
400 throw( "Readings $kept_obj and $del_obj can be neither concatenated nor collated" );
401 }
402 }
403 }
404
3a2ebbf4 405 # We only need the IDs for adding paths to the graph, not the reading
406 # objects themselves.
56772e8c 407 my $kept = $kept_obj->id;
408 my $deleted = $del_obj->id;
c1915ab9 409 $self->_graphcalc_done(0);
10943ab0 410
3a2ebbf4 411 # The kept reading should inherit the paths and the relationships
412 # of the deleted reading.
413 foreach my $path ( $self->sequence->edges_at( $deleted ) ) {
414 my @vector = ( $kept );
415 push( @vector, $path->[1] ) if $path->[0] eq $deleted;
416 unshift( @vector, $path->[0] ) if $path->[1] eq $deleted;
49d4f2ac 417 next if $vector[0] eq $vector[1]; # Don't add a self loop
3a2ebbf4 418 my %wits = %{$self->sequence->get_edge_attributes( @$path )};
419 $self->sequence->add_edge( @vector );
420 my $fwits = $self->sequence->get_edge_attributes( @vector );
421 @wits{keys %$fwits} = values %$fwits;
422 $self->sequence->set_edge_attributes( @vector, \%wits );
423 }
cecbe56d 424 $self->relations->merge_readings( $kept, $deleted, $combine );
3a2ebbf4 425
426 # Do the deletion deed.
4e483aa5 427 if( $combine ) {
869a1ada 428 # Combine the text of the readings
4e483aa5 429 my $joinstr = $combine_char;
430 unless( defined $joinstr ) {
431 $joinstr = '' if $kept_obj->join_next || $del_obj->join_prior;
432 $joinstr = $self->wordsep unless defined $joinstr;
433 }
a445ce40 434 $kept_obj->_combine( $del_obj, $joinstr );
49d4f2ac 435 }
3a2ebbf4 436 $self->del_reading( $deleted );
437}
7854e12e 438
8d9494a8 439=head2 merge_related( @relationship_types )
440
441Merge all readings linked with the relationship types given. If any of the selected type(s) is not a colocation, the graph will no longer be linear. The majority/plurality reading in each case will be the one kept.
442
443WARNING: This operation cannot be undone.
444
445=cut
446
447=begin testing
448
58568d5c 449use Test::Warn;
8d9494a8 450use Text::Tradition;
451use TryCatch;
452
58568d5c 453my $t;
454warnings_exist {
455 $t = Text::Tradition->new( 'input' => 'Self', 'file' => 't/data/legendfrag.xml' );
456} [qr/Cannot set relationship on a meta reading/],
457 "Got expected relationship drop warning on parse";
458
8d9494a8 459my $c = $t->collation;
a512d0e6 460# Force the transitive propagation of all existing relationships.
461$c->relations->propagate_all_relationships();
8d9494a8 462
463my %rdg_ids;
464map { $rdg_ids{$_} = 1 } $c->readings;
465$c->merge_related( 'orthographic' );
751ee528 466is( scalar( $c->readings ), keys( %rdg_ids ) - 9,
8d9494a8 467 "Successfully collapsed orthographic variation" );
751ee528 468map { $rdg_ids{$_} = undef } qw/ r13.3 r11.4 r8.5 r8.2 r7.7 r7.5 r7.4 r7.3 r7.1 /;
8d9494a8 469foreach my $rid ( keys %rdg_ids ) {
470 my $exp = $rdg_ids{$rid};
471 is( !$c->reading( $rid ), !$exp, "Reading $rid correctly " .
472 ( $exp ? "retained" : "removed" ) );
473}
474ok( $c->linear, "Graph is still linear" );
475try {
476 $c->calculate_ranks; # This should succeed
477 ok( 1, "Can still calculate ranks on the new graph" );
478} catch {
479 ok( 0, "Rank calculation on merged graph failed: $@" );
480}
481
482# Now add some transpositions
483$c->add_relationship( 'r8.4', 'r10.4', { type => 'transposition' } );
484$c->merge_related( 'transposition' );
751ee528 485is( scalar( $c->readings ), keys( %rdg_ids ) - 10,
8d9494a8 486 "Transposed relationship is merged away" );
487ok( !$c->reading('r8.4'), "Correct transposed reading removed" );
488ok( !$c->linear, "Graph is no longer linear" );
489try {
490 $c->calculate_ranks; # This should fail
491 ok( 0, "Rank calculation happened on nonlinear graph?!" );
492} catch ( Text::Tradition::Error $e ) {
493 is( $e->message, 'Cannot calculate ranks on a non-linear graph',
494 "Rank calculation on merged graph threw an error" );
495}
496
8d9494a8 497=end testing
498
499=cut
500
501# TODO: there should be a way to display merged without affecting the underlying data!
502
503sub merge_related {
504 my $self = shift;
505 my %reltypehash;
506 map { $reltypehash{$_} = 1 } @_;
507
508 # Set up the filter for finding related readings
509 my $filter = sub {
510 exists $reltypehash{$_[0]->type};
511 };
512
8d9494a8 513 # Go through all readings looking for related ones
514 foreach my $r ( $self->readings ) {
515 next unless $self->reading( "$r" ); # might have been deleted meanwhile
751ee528 516 while( my @related = $self->related_readings( $r, $filter ) ) {
8d9494a8 517 push( @related, $r );
518 @related = sort {
519 scalar $b->witnesses <=> scalar $a->witnesses
520 } @related;
521 my $keep = shift @related;
522 foreach my $delr ( @related ) {
58568d5c 523 $self->linear( 0 )
8d9494a8 524 unless( $self->get_relationship( $keep, $delr )->colocated );
525 $self->merge_readings( $keep, $delr );
526 }
527 }
528 }
8d9494a8 529}
530
6771a1b1 531=head2 compress_readings
532
533Where possible in the graph, compresses plain sequences of readings into a
534single reading. The sequences must consist of readings with no
535relationships to other readings, with only a single witness path between
536them and no other witness paths from either that would skip the other. The
537readings must also not be marked as nonsense or bad grammar.
538
539WARNING: This operation cannot be undone.
540
c67190dd 541=begin testing
542
543use Text::Tradition;
544
2e818703 545my $t = Text::Tradition->new( input => 'CollateX', file => 't/data/Collatex-16.xml' );
c67190dd 546my $c = $t->collation;
547my $n = scalar $c->readings;
548$c->compress_readings();
549is( scalar $c->readings, $n - 6, "Compressing readings seems to work" );
550
551# Now put in a join-word and make sure the thing still works.
2e818703 552my $t2 = Text::Tradition->new( input => 'CollateX', file => 't/data/Collatex-16.xml' );
c67190dd 553my $c2 = $t2->collation;
554# Split n21 ('unto') for testing purposes
555my $new_r = $c2->add_reading( { 'id' => 'n21p0', 'text' => 'un', 'join_next' => 1 } );
556my $old_r = $c2->reading( 'n21' );
557$old_r->alter_text( 'to' );
558$c2->del_path( 'n20', 'n21', 'A' );
559$c2->add_path( 'n20', 'n21p0', 'A' );
560$c2->add_path( 'n21p0', 'n21', 'A' );
561$c2->calculate_ranks();
562is( scalar $c2->readings, $n + 1, "We have our extra test reading" );
563$c2->compress_readings();
564is( scalar $c2->readings, $n - 6, "Compressing readings also works with join_next" );
565is( $c2->reading( 'n21p0' )->text, 'unto', "The joined word has no space" );
566
567
568=end testing
569
6771a1b1 570=cut
571
572sub compress_readings {
573 my $self = shift;
f5e442f9 574 # Sanity check: first save the original text of each witness.
575 my %origtext;
576 foreach my $wit ( $self->tradition->witnesses ) {
577 $origtext{$wit->sigil} = $self->path_text( $wit->sigil );
578 if( $wit->is_layered ) {
579 my $acsig = $wit->sigil . $self->ac_label;
580 $origtext{$acsig} = $self->path_text( $acsig );
581 }
582 }
583
584 # Now do the deed.
6771a1b1 585 # Anywhere in the graph that there is a reading that joins only to a single
586 # successor, and neither of these have any relationships, just join the two
587 # readings.
6771a1b1 588 foreach my $rdg ( sort { $a->rank <=> $b->rank } $self->readings ) {
428bcf0b 589 # Now look for readings that can be joined to their successors.
a445ce40 590 next unless $rdg->is_combinable;
6771a1b1 591 my %seen;
592 while( $self->sequence->successors( $rdg ) == 1 ) {
593 my( $next ) = $self->reading( $self->sequence->successors( $rdg ) );
594 throw( "Infinite loop" ) if $seen{$next->id};
595 $seen{$next->id} = 1;
596 last if $self->sequence->predecessors( $next ) > 1;
a445ce40 597 last unless $next->is_combinable;
6771a1b1 598 say "Joining readings $rdg and $next";
599 $self->merge_readings( $rdg, $next, 1 );
600 }
601 }
f5e442f9 602
603 # Finally, make sure we haven't screwed anything up.
6771a1b1 604 foreach my $wit ( $self->tradition->witnesses ) {
605 my $pathtext = $self->path_text( $wit->sigil );
6771a1b1 606 throw( "Text differs for witness " . $wit->sigil )
f5e442f9 607 unless $pathtext eq $origtext{$wit->sigil};
6771a1b1 608 if( $wit->is_layered ) {
f5e442f9 609 my $acsig = $wit->sigil . $self->ac_label;
610 $pathtext = $self->path_text( $acsig );
611 throw( "Layered text differs for witness " . $wit->sigil )
612 unless $pathtext eq $origtext{$acsig};
6771a1b1 613 }
614 }
615
616 $self->relations->rebuild_equivalence();
617 $self->calculate_ranks();
618}
3265b0ce 619
3a2ebbf4 620# Helper function for manipulating the graph.
621sub _stringify_args {
4e483aa5 622 my( $self, $first, $second, @args ) = @_;
3a2ebbf4 623 $first = $first->id
624 if ref( $first ) eq 'Text::Tradition::Collation::Reading';
625 $second = $second->id
626 if ref( $second ) eq 'Text::Tradition::Collation::Reading';
4e483aa5 627 return( $first, $second, @args );
3a2ebbf4 628}
df6d9812 629
4e5a7b2c 630# Helper function for manipulating the graph.
631sub _objectify_args {
632 my( $self, $first, $second, $arg ) = @_;
633 $first = $self->reading( $first )
634 unless ref( $first ) eq 'Text::Tradition::Collation::Reading';
635 $second = $self->reading( $second )
636 unless ref( $second ) eq 'Text::Tradition::Collation::Reading';
637 return( $first, $second, $arg );
638}
f97ef19e 639
640=head2 duplicate_reading( $reading, @witlist )
641
642Split the given reading into two, so that the new reading is in the path for
ef73c20a 643the witnesses given in @witlist. If the result is that certain non-colocated
644relationships (e.g. transpositions) are no longer valid, these will be removed.
645Returns the newly-created reading.
f97ef19e 646
647=begin testing
648
68e48c06 649use Test::More::UTF8;
f97ef19e 650use Text::Tradition;
68e48c06 651use TryCatch;
f97ef19e 652
653my $st = Text::Tradition->new( 'input' => 'Self', 'file' => 't/data/collatecorr.xml' );
654is( ref( $st ), 'Text::Tradition', "Got a tradition from test file" );
655ok( $st->has_witness('Ba96'), "Tradition has the affected witness" );
656
657my $sc = $st->collation;
658my $numr = 17;
659ok( $sc->reading('n131'), "Tradition has the affected reading" );
660is( scalar( $sc->readings ), $numr, "There are $numr readings in the graph" );
661is( $sc->end->rank, 14, "There are fourteen ranks in the graph" );
662
663# Detach the erroneously collated reading
2dcb5d11 664my( $newr, @del_rdgs ) = $sc->duplicate_reading( 'n131', 'Ba96' );
ef73c20a 665ok( $newr, "New reading was created" );
f97ef19e 666ok( $sc->reading('n131_0'), "Detached the bad collation with a new reading" );
667is( scalar( $sc->readings ), $numr + 1, "A reading was added to the graph" );
668is( $sc->end->rank, 10, "There are now only ten ranks in the graph" );
3c234eb6 669my $csucc = $sc->common_successor( 'n131', 'n131_0' );
670is( $csucc->id, 'n136', "Found correct common successor to duped reading" );
f97ef19e 671
672# Check that the bad transposition is gone
2dcb5d11 673is( scalar @del_rdgs, 1, "Deleted reading was returned by API call" );
f97ef19e 674is( $sc->get_relationship( 'n130', 'n135' ), undef, "Bad transposition relationship is gone" );
675
e19635f8 676# The collation should not be fixed
677my @pairs = $sc->identical_readings();
678is( scalar @pairs, 0, "Not re-collated yet" );
f97ef19e 679# Fix the collation
3c234eb6 680ok( $sc->merge_readings( 'n124', 'n131_0' ), "Collated the readings correctly" );
e19635f8 681@pairs = $sc->identical_readings( start => 'n124', end => $csucc->id );
3c234eb6 682is( scalar @pairs, 3, "Found three more identical readings" );
e19635f8 683is( $sc->end->rank, 11, "The ranks shifted appropriately" );
3c234eb6 684$sc->flatten_ranks();
f97ef19e 685is( scalar( $sc->readings ), $numr - 3, "Now we are collated correctly" );
686
68e48c06 687# Check that we can't "duplicate" a reading with no wits or with all wits
688try {
689 my( $badr, @del_rdgs ) = $sc->duplicate_reading( 'n124' );
690 ok( 0, "Reading duplication without witnesses throws an error" );
691} catch( Text::Tradition::Error $e ) {
692 like( $e->message, qr/Must specify one or more witnesses/,
693 "Reading duplication without witnesses throws the expected error" );
694} catch {
695 ok( 0, "Reading duplication without witnesses threw the wrong error" );
696}
697
698try {
699 my( $badr, @del_rdgs ) = $sc->duplicate_reading( 'n124', 'Ba96', 'Mü11475' );
700 ok( 0, "Reading duplication with all witnesses throws an error" );
701} catch( Text::Tradition::Error $e ) {
702 like( $e->message, qr/Cannot join all witnesses/,
703 "Reading duplication with all witnesses throws the expected error" );
704} catch {
705 ok( 0, "Reading duplication with all witnesses threw the wrong error" );
706}
707
58568d5c 708try {
709 $sc->calculate_ranks();
710 ok( 1, "Graph is still evidently whole" );
711} catch( Text::Tradition::Error $e ) {
712 ok( 0, "Caught a rank exception: " . $e->message );
713}
714
f97ef19e 715=end testing
716
717=cut
718
719sub duplicate_reading {
720 my( $self, $r, @wits ) = @_;
68e48c06 721 # Check that we are not doing anything unwise.
722 throw( "Must specify one or more witnesses for the duplicated reading" )
723 unless @wits;
f97ef19e 724 unless( ref( $r ) eq 'Text::Tradition::Collation::Reading' ) {
725 $r = $self->reading( $r );
726 }
727 throw( "Cannot duplicate a meta-reading" )
728 if $r->is_meta;
68e48c06 729 throw( "Cannot join all witnesses to the new reading" )
47cd4d16 730 if scalar( @wits ) == scalar( $r->witnesses );
68e48c06 731
f97ef19e 732 # Get all the reading attributes and duplicate them.
733 my $rmeta = Text::Tradition::Collation::Reading->meta;
734 my %args;
735 foreach my $attr( $rmeta->get_all_attributes ) {
736 next if $attr->name =~ /^_/;
737 my $acc = $attr->get_read_method;
738 if( !$acc && $attr->has_applied_traits ) {
739 my $tr = $attr->applied_traits;
740 if( $tr->[0] =~ /::(Array|Hash)$/ ) {
741 my $which = $1;
742 my %methods = reverse %{$attr->handles};
743 $acc = $methods{elements};
744 $args{$attr->name} = $which eq 'Array'
745 ? [ $r->$acc ] : { $r->$acc };
746 }
747 } else {
748 $args{$attr->name} = $r->$acc if $acc;
749 }
750 }
751 # By definition the new reading will no longer be common.
752 $args{is_common} = 0;
753 # The new reading also needs its own ID.
754 $args{id} = $self->_generate_dup_id( $r->id );
755
756 # Try to make the new reading.
757 my $newr = $self->add_reading( \%args );
758 # The old reading is also no longer common.
759 $r->is_common( 0 );
760
761 # For each of the witnesses, dissociate from the old reading and
762 # associate with the new.
763 foreach my $wit ( @wits ) {
764 my $prior = $self->prior_reading( $r, $wit );
765 my $next = $self->next_reading( $r, $wit );
766 $self->del_path( $prior, $r, $wit );
767 $self->add_path( $prior, $newr, $wit );
768 $self->del_path( $r, $next, $wit );
769 $self->add_path( $newr, $next, $wit );
770 }
771
3c234eb6 772 # If the graph is ranked, we need to look for relationships that are now
773 # invalid (i.e. 'non-colocation' types that might now be colocated) and
774 # remove them. If not, we can skip it.
f97ef19e 775 my $succ;
3c234eb6 776 my %rrk;
2dcb5d11 777 my @deleted_relations;
f97ef19e 778 if( $self->end->has_rank ) {
3c234eb6 779 # Find the point where we can stop checking
f97ef19e 780 $succ = $self->common_successor( $r, $newr );
3c234eb6 781
782 # Hash the existing ranks
f97ef19e 783 foreach my $rdg ( $self->readings ) {
784 $rrk{$rdg->id} = $rdg->rank;
785 }
3c234eb6 786 # Calculate the new ranks
787 $self->calculate_ranks();
f97ef19e 788
3c234eb6 789 # Check for invalid non-colocated relationships among changed-rank readings
790 # from where the ranks start changing up to $succ
f97ef19e 791 my $lastrank = $succ->rank;
792 foreach my $rdg ( $self->readings ) {
793 next if $rdg->rank > $lastrank;
794 next if $rdg->rank == $rrk{$rdg->id};
795 my @noncolo = $rdg->related_readings( sub { !$_[0]->colocated } );
796 next unless @noncolo;
797 foreach my $nc ( @noncolo ) {
2dcb5d11 798 unless( $self->relations->verify_or_delete( $rdg, $nc ) ) {
f6a1d5f0 799 push( @deleted_relations, [ $rdg->id, $nc->id ] );
2dcb5d11 800 }
f97ef19e 801 }
802 }
ef73c20a 803 }
2dcb5d11 804 return ( $newr, @deleted_relations );
f97ef19e 805}
806
807sub _generate_dup_id {
808 my( $self, $rid ) = @_;
809 my $newid;
810 my $i = 0;
811 while( !$newid ) {
812 $newid = $rid."_$i";
813 if( $self->has_reading( $newid ) ) {
814 $newid = '';
815 $i++;
816 }
817 }
818 return $newid;
819}
820
3a2ebbf4 821### Path logic
822
823sub add_path {
824 my $self = shift;
825
826 # We only need the IDs for adding paths to the graph, not the reading
827 # objects themselves.
cecbe56d 828 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
3a2ebbf4 829
c1915ab9 830 $self->_graphcalc_done(0);
3a2ebbf4 831 # Connect the readings
359944f7 832 unless( $self->sequence->has_edge( $source, $target ) ) {
833 $self->sequence->add_edge( $source, $target );
834 $self->relations->add_equivalence_edge( $source, $target );
835 }
3a2ebbf4 836 # Note the witness in question
837 $self->sequence->set_edge_attribute( $source, $target, $wit, 1 );
359944f7 838}
b15511bf 839
3a2ebbf4 840sub del_path {
841 my $self = shift;
49d4f2ac 842 my @args;
843 if( ref( $_[0] ) eq 'ARRAY' ) {
844 my $e = shift @_;
845 @args = ( @$e, @_ );
846 } else {
847 @args = @_;
848 }
3a2ebbf4 849
f97ef19e 850 # We only need the IDs for removing paths from the graph, not the reading
3a2ebbf4 851 # objects themselves.
49d4f2ac 852 my( $source, $target, $wit ) = $self->_stringify_args( @args );
3a2ebbf4 853
c1915ab9 854 $self->_graphcalc_done(0);
3a2ebbf4 855 if( $self->sequence->has_edge_attribute( $source, $target, $wit ) ) {
49d4f2ac 856 $self->sequence->delete_edge_attribute( $source, $target, $wit );
3a2ebbf4 857 }
30897024 858 unless( $self->sequence->has_edge_attributes( $source, $target ) ) {
3a2ebbf4 859 $self->sequence->delete_edge( $source, $target );
359944f7 860 $self->relations->delete_equivalence_edge( $source, $target );
3a2ebbf4 861 }
784877d9 862}
863
3a2ebbf4 864
15d2d3df 865# Extra graph-alike utility
866sub has_path {
3a2ebbf4 867 my $self = shift;
868 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
869 return undef unless $self->sequence->has_edge( $source, $target );
870 return $self->sequence->has_edge_attribute( $source, $target, $wit );
b15511bf 871}
872
4e5a7b2c 873=head2 clear_witness( @sigil_list )
3a2ebbf4 874
4e5a7b2c 875Clear the given witnesses out of the collation entirely, removing references
876to them in paths, and removing readings that belong only to them. Should only
877be called via $tradition->del_witness.
3a2ebbf4 878
879=cut
880
4e5a7b2c 881sub clear_witness {
882 my( $self, @sigils ) = @_;
883
c1915ab9 884 $self->_graphcalc_done(0);
4e5a7b2c 885 # Clear the witness(es) out of the paths
886 foreach my $e ( $self->paths ) {
887 foreach my $sig ( @sigils ) {
888 $self->del_path( $e, $sig );
889 }
890 }
891
892 # Clear out the newly unused readings
893 foreach my $r ( $self->readings ) {
894 unless( $self->reading_witnesses( $r ) ) {
895 $self->del_reading( $r );
896 }
897 }
898}
3a2ebbf4 899
900sub add_relationship {
901 my $self = shift;
22222af9 902 my( $source, $target, $opts ) = $self->_stringify_args( @_ );
414cc046 903 my( @vectors ) = $self->relations->add_relationship( $source, $target, $opts );
45095bee 904 my $did_recalc;
864ee4bf 905 foreach my $v ( @vectors ) {
45095bee 906 my $rel = $self->get_relationship( $v );
907 next unless $rel->colocated;
908 my $r1 = $self->reading( $v->[0] );
909 my $r2 = $self->reading( $v->[1] );
910 # If it's a spelling or orthographic relationship, and one is marked
911 # as a lemma, set the normal form on the non-lemma to reflect that.
912 if( $r1->does( 'Text::Tradition::Morphology' ) ) {
913 $r1->relationship_added( $r2, $rel );
914 }
915 next if $did_recalc;
916 if( $r1->has_rank && $r2->has_rank && $r1->rank ne $r2->rank ) {
864ee4bf 917 $self->_graphcalc_done(0);
918 $self->_clear_cache;
45095bee 919 $did_recalc = 1;
864ee4bf 920 }
921 }
63778331 922 return @vectors;
22222af9 923}
ef9d481f 924
ca6e6095 925around qw/ get_relationship del_relationship / => sub {
926 my $orig = shift;
927 my $self = shift;
928 my @args = @_;
929 if( @args == 1 && ref( $args[0] ) eq 'ARRAY' ) {
930 @args = @{$_[0]};
931 }
e19635f8 932 my @stringargs = $self->_stringify_args( @args );
933 $self->$orig( @stringargs );
ca6e6095 934};
935
22222af9 936=head2 reading_witnesses( $reading )
910a0a6d 937
22222af9 938Return a list of sigils corresponding to the witnesses in which the reading appears.
3265b0ce 939
22222af9 940=cut
1d310495 941
1d310495 942sub reading_witnesses {
943 my( $self, $reading ) = @_;
944 # We need only check either the incoming or the outgoing edges; I have
96dc90ec 945 # arbitrarily chosen "incoming". Thus, special-case the start node.
946 if( $reading eq $self->start ) {
bed6ce83 947 return map { $_->sigil } grep { $_->is_collated } $self->tradition->witnesses;
96dc90ec 948 }
1d310495 949 my %all_witnesses;
950 foreach my $e ( $self->sequence->edges_to( $reading ) ) {
951 my $wits = $self->sequence->get_edge_attributes( @$e );
952 @all_witnesses{ keys %$wits } = 1;
953 }
c12bb878 954 my $acstr = $self->ac_label;
955 foreach my $acwit ( grep { $_ =~ s/^(.*)\Q$acstr\E$/$1/ } keys %all_witnesses ) {
956 delete $all_witnesses{$acwit.$acstr} if exists $all_witnesses{$acwit};
957 }
1d310495 958 return keys %all_witnesses;
910a0a6d 959}
960
4e5a7b2c 961=head1 OUTPUT METHODS
8e1394aa 962
0ecb975c 963=head2 as_svg( \%options )
8e1394aa 964
0068967c 965Returns an SVG string that represents the graph, via as_dot and graphviz.
bfcbcecb 966See as_dot for a list of options. Must have GraphViz (dot) installed to run.
8e1394aa 967
968=cut
969
970sub as_svg {
0ecb975c 971 my( $self, $opts ) = @_;
bfcbcecb 972 throw( "Need GraphViz installed to output SVG" )
973 unless File::Which::which( 'dot' );
e247aad1 974 my $want_subgraph = exists $opts->{'from'} || exists $opts->{'to'};
1ff82d4f 975 $self->calculate_ranks()
976 unless( $self->_graphcalc_done || $opts->{'nocalc'} || !$self->linear );
be3af600 977 my @cmd = qw/dot -Tsvg/;
978 my( $svg, $err );
979 my $dotfile = File::Temp->new();
980 ## USE FOR DEBUGGING
981 # $dotfile->unlink_on_destroy(0);
982 binmode $dotfile, ':utf8';
983 print $dotfile $self->as_dot( $opts );
984 push( @cmd, $dotfile->filename );
985 run( \@cmd, ">", binary(), \$svg );
986 $svg = decode_utf8( $svg );
987 return $svg;
8e1394aa 988}
989
b22576c6 990
0ecb975c 991=head2 as_dot( \%options )
b22576c6 992
0ecb975c 993Returns a string that is the collation graph expressed in dot
994(i.e. GraphViz) format. Options include:
b22576c6 995
0ecb975c 996=over 4
b22576c6 997
0ecb975c 998=item * from
b22576c6 999
0ecb975c 1000=item * to
df6d9812 1001
0ecb975c 1002=item * color_common
1003
1004=back
df6d9812 1005
1006=cut
1007
1008sub as_dot {
0ecb975c 1009 my( $self, $opts ) = @_;
1010 my $startrank = $opts->{'from'} if $opts;
1011 my $endrank = $opts->{'to'} if $opts;
1012 my $color_common = $opts->{'color_common'} if $opts;
b365fbae 1013 my $STRAIGHTENHACK = !$startrank && !$endrank && $self->end->rank
1014 && $self->end->rank > 100;
6648ee3d 1015 $STRAIGHTENHACK = 1 if $opts->{'straight'}; # even for subgraphs or small graphs
b365fbae 1016
b22576c6 1017 # Check the arguments
1018 if( $startrank ) {
1019 return if $endrank && $startrank > $endrank;
1020 return if $startrank > $self->end->rank;
1021 }
1022 if( defined $endrank ) {
1023 return if $endrank < 0;
f1b3b33a 1024 $endrank = undef if $endrank == $self->end->rank;
b22576c6 1025 }
1026
67da8d6c 1027 my $graph_name = $self->tradition->name;
1028 $graph_name =~ s/[^\w\s]//g;
1029 $graph_name = join( '_', split( /\s+/, $graph_name ) );
f13b5582 1030
1031 my %graph_attrs = (
1032 'rankdir' => 'LR',
1033 'bgcolor' => 'none',
1034 );
1035 my %node_attrs = (
b8990398 1036 'fontsize' => 14,
f13b5582 1037 'fillcolor' => 'white',
1038 'style' => 'filled',
1039 'shape' => 'ellipse'
1040 );
1041 my %edge_attrs = (
1042 'arrowhead' => 'open',
1043 'color' => '#000000',
1044 'fontcolor' => '#000000',
1045 );
1046
67da8d6c 1047 my $dot = sprintf( "digraph %s {\n", $graph_name );
f13b5582 1048 $dot .= "\tgraph " . _dot_attr_string( \%graph_attrs ) . ";\n";
1049 $dot .= "\tnode " . _dot_attr_string( \%node_attrs ) . ";\n";
df6d9812 1050
b22576c6 1051 # Output substitute start/end readings if necessary
1052 if( $startrank ) {
43c94341 1053 $dot .= "\t\"__SUBSTART__\" [ label=\"...\",id=\"__START__\" ];\n";
b22576c6 1054 }
1055 if( $endrank ) {
43c94341 1056 $dot .= "\t\"__SUBEND__\" [ label=\"...\",id=\"__END__\" ];\n";
b22576c6 1057 }
b365fbae 1058 if( $STRAIGHTENHACK ) {
1059 ## HACK part 1
43c94341 1060 my $startlabel = $startrank ? '__SUBSTART__' : '__START__';
1061 $dot .= "\tsubgraph { rank=same \"$startlabel\" \"#SILENT#\" }\n";
b365fbae 1062 $dot .= "\t\"#SILENT#\" [ shape=diamond,color=white,penwidth=0,label=\"\" ];"
1063 }
b22576c6 1064 my %used; # Keep track of the readings that actually appear in the graph
30ddc24c 1065 # Sort the readings by rank if we have ranks; this speeds layout.
1066 my @all_readings = $self->end->has_rank
1067 ? sort { $a->rank <=> $b->rank } $self->readings
1068 : $self->readings;
4633f9e4 1069 # TODO Refrain from outputting lacuna nodes - just grey out the edges.
30ddc24c 1070 foreach my $reading ( @all_readings ) {
b22576c6 1071 # Only output readings within our rank range.
1072 next if $startrank && $reading->rank < $startrank;
1073 next if $endrank && $reading->rank > $endrank;
1074 $used{$reading->id} = 1;
910a0a6d 1075 # Need not output nodes without separate labels
3a2ebbf4 1076 next if $reading->id eq $reading->text;
d4b75f44 1077 my $rattrs;
30f0df34 1078 my $label = $reading->text;
6b6c7098 1079 unless( $label =~ /^[[:punct:]]+$/ ) {
1080 $label .= '-' if $reading->join_next;
1081 $label = "-$label" if $reading->join_prior;
1082 }
8f9cab7b 1083 $label =~ s/\"/\\\"/g;
d4b75f44 1084 $rattrs->{'label'} = $label;
10e4b1ac 1085 $rattrs->{'id'} = $reading->id;
0ecb975c 1086 $rattrs->{'fillcolor'} = '#b3f36d' if $reading->is_common && $color_common;
d4b75f44 1087 $dot .= sprintf( "\t\"%s\" %s;\n", $reading->id, _dot_attr_string( $rattrs ) );
df6d9812 1088 }
3a2ebbf4 1089
522c1314 1090 # Add the real edges.
b22576c6 1091 my @edges = $self->paths;
3bdec618 1092 my( %substart, %subend );
b22576c6 1093 foreach my $edge ( @edges ) {
1094 # Do we need to output this edge?
508fd430 1095 if( $used{$edge->[0]} && $used{$edge->[1]} ) {
57560672 1096 my $label = $self->_path_display_label( $opts,
1097 $self->path_witnesses( $edge ) );
f13b5582 1098 my $variables = { %edge_attrs, 'label' => $label };
30ddc24c 1099
b22576c6 1100 # Account for the rank gap if necessary
30ddc24c 1101 my $rank0 = $self->reading( $edge->[0] )->rank
1102 if $self->reading( $edge->[0] )->has_rank;
1103 my $rank1 = $self->reading( $edge->[1] )->rank
1104 if $self->reading( $edge->[1] )->has_rank;
1105 if( defined $rank0 && defined $rank1 && $rank1 - $rank0 > 1 ) {
1106 $variables->{'minlen'} = $rank1 - $rank0;
1107 }
1108
508fd430 1109 # EXPERIMENTAL: make edge width reflect no. of witnesses
1110 my $extrawidth = scalar( $self->path_witnesses( $edge ) ) * 0.2;
1111 $variables->{'penwidth'} = $extrawidth + 0.8; # gives 1 for a single wit
1112
f13b5582 1113 my $varopts = _dot_attr_string( $variables );
1114 $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
1115 $edge->[0], $edge->[1], $varopts );
3bdec618 1116 } elsif( $used{$edge->[0]} ) {
96ba0418 1117 $subend{$edge->[0]} = $edge->[1];
3bdec618 1118 } elsif( $used{$edge->[1]} ) {
96ba0418 1119 $substart{$edge->[1]} = $edge->[0];
b22576c6 1120 }
df6d9812 1121 }
bed6ce83 1122
1123 # If we are asked to, add relationship links
1124 if( exists $opts->{show_relations} ) {
1125 my $filter = $opts->{show_relations}; # can be 'transposition' or 'all'
1126 if( $filter eq 'transposition' ) {
1127 $filter =~ qr/^transposition$/;
1128 }
9e54e11b 1129 my %typecolors;
1130 my @types = sort( map { $_->name } $self->relations->types );
1131 if( exists $opts->{graphcolors} ) {
1132 foreach my $tdx ( 0 .. $#types ) {
1133 $typecolors{$types[$tdx]} = $opts->{graphcolors}->[$tdx];
1134 }
1135 } else {
1136 map { $typecolors{$_} = '#FFA14F' } @types;
1137 }
bed6ce83 1138 foreach my $redge ( $self->relationships ) {
1139 if( $used{$redge->[0]} && $used{$redge->[1]} ) {
9e54e11b 1140 my $rel = $self->get_relationship( $redge );
1141 next unless $filter eq 'all' || $rel->type =~ /$filter/;
1142 my $variables = {
1143 arrowhead => 'none',
1144 color => $typecolors{$rel->type},
1145 constraint => 'false',
1146 penwidth => '3',
1147 };
1148 unless( exists $opts->{graphcolors} ) {
1149 $variables->{label} = uc( substr( $rel->type, 0, 4 ) ),
bed6ce83 1150 }
9e54e11b 1151 $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
1152 $redge->[0], $redge->[1], _dot_attr_string( $variables ) );
bed6ce83 1153 }
1154 }
1155 }
1156
3bdec618 1157 # Add substitute start and end edges if necessary
1158 foreach my $node ( keys %substart ) {
57560672 1159 my $witstr = $self->_path_display_label( $opts,
1160 $self->path_witnesses( $substart{$node}, $node ) );
f13b5582 1161 my $variables = { %edge_attrs, 'label' => $witstr };
96ba0418 1162 my $nrdg = $self->reading( $node );
1163 if( $nrdg->has_rank && $nrdg->rank > $startrank ) {
1164 # Substart is actually one lower than $startrank
1165 $variables->{'minlen'} = $nrdg->rank - ( $startrank - 1 );
1166 }
f13b5582 1167 my $varopts = _dot_attr_string( $variables );
96ba0418 1168 $dot .= "\t\"__SUBSTART__\" -> \"$node\" $varopts;\n";
3bdec618 1169 }
1170 foreach my $node ( keys %subend ) {
57560672 1171 my $witstr = $self->_path_display_label( $opts,
1172 $self->path_witnesses( $node, $subend{$node} ) );
f13b5582 1173 my $variables = { %edge_attrs, 'label' => $witstr };
1174 my $varopts = _dot_attr_string( $variables );
96ba0418 1175 $dot .= "\t\"$node\" -> \"__SUBEND__\" $varopts;\n";
3bdec618 1176 }
b365fbae 1177 # HACK part 2
1178 if( $STRAIGHTENHACK ) {
43c94341 1179 my $endlabel = $endrank ? '__SUBEND__' : '__END__';
1180 $dot .= "\t\"$endlabel\" -> \"#SILENT#\" [ color=white,penwidth=0 ];\n";
b365fbae 1181 }
30ddc24c 1182
df6d9812 1183 $dot .= "}\n";
1184 return $dot;
1185}
1186
f13b5582 1187sub _dot_attr_string {
1188 my( $hash ) = @_;
1189 my @attrs;
1190 foreach my $k ( sort keys %$hash ) {
1191 my $v = $hash->{$k};
1192 push( @attrs, $k.'="'.$v.'"' );
1193 }
1194 return( '[ ' . join( ', ', @attrs ) . ' ]' );
1195}
1196
027d819c 1197=head2 path_witnesses( $edge )
1198
1199Returns the list of sigils whose witnesses are associated with the given edge.
1200The edge can be passed as either an array or an arrayref of ( $source, $target ).
1201
1202=cut
1203
3a2ebbf4 1204sub path_witnesses {
1205 my( $self, @edge ) = @_;
1206 # If edge is an arrayref, cope.
1207 if( @edge == 1 && ref( $edge[0] ) eq 'ARRAY' ) {
1208 my $e = shift @edge;
1209 @edge = @$e;
1210 }
1211 my @wits = keys %{$self->sequence->get_edge_attributes( @edge )};
508fd430 1212 return @wits;
3a2ebbf4 1213}
1214
7f9f05e8 1215# Helper function. Make a display label for the given witnesses, showing a.c.
1216# witnesses only where the main witness is not also in the list.
027d819c 1217sub _path_display_label {
508fd430 1218 my $self = shift;
57560672 1219 my $opts = shift;
7f9f05e8 1220 my %wits;
1221 map { $wits{$_} = 1 } @_;
1222
1223 # If an a.c. wit is listed, remove it if the main wit is also listed.
1224 # Otherwise keep it for explicit listing.
1225 my $aclabel = $self->ac_label;
1226 my @disp_ac;
1227 foreach my $w ( sort keys %wits ) {
1228 if( $w =~ /^(.*)\Q$aclabel\E$/ ) {
1229 if( exists $wits{$1} ) {
1230 delete $wits{$w};
1231 } else {
1232 push( @disp_ac, $w );
1233 }
1234 }
1235 }
1236
57560672 1237 if( $opts->{'explicit_wits'} ) {
7f9f05e8 1238 return join( ', ', sort keys %wits );
57560672 1239 } else {
1240 # See if we are in a majority situation.
1241 my $maj = scalar( $self->tradition->witnesses ) * 0.6;
1242 $maj = $maj > 5 ? $maj : 5;
1243 if( scalar keys %wits > $maj ) {
1244 unshift( @disp_ac, 'majority' );
1245 return join( ', ', @disp_ac );
1246 } else {
1247 return join( ', ', sort keys %wits );
1248 }
8f9cab7b 1249 }
1250}
1dd07bda 1251
522c1314 1252=head2 as_adjacency_list
1dd07bda 1253
522c1314 1254Returns a JSON structure that represents the collation sequence graph.
1dd07bda 1255
8a9a8200 1256=begin testing
1257
1258use JSON qw/ from_json /;
1259use Text::Tradition;
1260
1261my $t = Text::Tradition->new(
1262 'input' => 'Self',
1263 'file' => 't/data/florilegium_graphml.xml' );
1264my $c = $t->collation;
1265
1266# Make a connection so we can test rank preservation
1267$c->add_relationship( 'w91', 'w92', { type => 'grammatical' } );
1268
1269# Create an adjacency list of the whole thing; test the output.
1270my $adj_whole = from_json( $c->as_adjacency_list() );
1271is( scalar @$adj_whole, scalar $c->readings(),
1272 "Same number of nodes in graph and adjacency list" );
1273my @adj_whole_edges;
1274map { push( @adj_whole_edges, @{$_->{adjacent}} ) } @$adj_whole;
1275is( scalar @adj_whole_edges, scalar $c->sequence->edges,
1276 "Same number of edges in graph and adjacency list" );
1277# Find the reading whose rank should be preserved
1278my( $test_rdg ) = grep { $_->{id} eq 'w89' } @$adj_whole;
1279my( $test_edge ) = grep { $_->{id} eq 'w92' } @{$test_rdg->{adjacent}};
1280is( $test_edge->{minlen}, 2, "Rank of test reading is preserved" );
1281
1282# Now create an adjacency list of just a portion. w76 to w122
1283my $adj_part = from_json( $c->as_adjacency_list(
1284 { from => $c->reading('w76')->rank,
1285 to => $c->reading('w122')->rank }));
1286is( scalar @$adj_part, 48, "Correct number of nodes in partial graph" );
1287my @adj_part_edges;
1288map { push( @adj_part_edges, @{$_->{adjacent}} ) } @$adj_part;
1289is( scalar @adj_part_edges, 58,
1290 "Same number of edges in partial graph and adjacency list" );
1291# Check for consistency
1292my %part_nodes;
1293map { $part_nodes{$_->{id}} = 1 } @$adj_part;
1294foreach my $edge ( @adj_part_edges ) {
1295 my $testid = $edge->{id};
1296 ok( $part_nodes{$testid}, "ID $testid referenced in edge is given as node" );
1297}
1298
1299=end testing
1300
1dd07bda 1301=cut
1302
522c1314 1303sub as_adjacency_list {
1304 my( $self, $opts ) = @_;
1305 # Make a structure that contains all the nodes, the nodes they point to,
1306 # and the attributes of the edges that connect them.
1307 # [ { id: 'n0', label: 'Gallia', adjacent: [
1308 # { id: 'n1', label: 'P Q' } ,
1309 # { id: 'n2', label: 'R S', minlen: 2 } ] },
1310 # { id: 'n1', label: 'est', adjacent: [ ... ] },
1311 # ... ]
1312 my $startrank = $opts->{'from'} || 0;
1313 my $endrank = $opts->{'to'} || $self->end->rank;
1314
8a9a8200 1315 $self->calculate_ranks()
1316 unless( $self->_graphcalc_done || $opts->{'nocalc'} || !$self->linear );
522c1314 1317 my $list = [];
1318 foreach my $rdg ( $self->readings ) {
1319 my @successors;
1320 my $phony = '';
8a9a8200 1321 # Figure out what the node's successors should be.
522c1314 1322 if( $rdg eq $self->start && $startrank > 0 ) {
1323 # Connect the start node with all the nodes at startrank.
8a9a8200 1324 # Lacunas should be included only if the node really has that rank.
1325 @successors = $self->readings_at_rank( $startrank, 1 );
522c1314 1326 $phony = 'start';
1327 } elsif( $rdg->rank < $startrank
1328 || $rdg->rank > $endrank && $rdg ne $self->end ) {
1329 next;
522c1314 1330 } else {
1331 @successors = $rdg->successors;
1332 }
8a9a8200 1333 # Make sure that the end node is at the end of the successors
1334 # list if it is needed.
1335 if( grep { $_ eq $self->end } @successors ) {
1336 my @ts = grep { $_ ne $self->end } @successors;
1337 @successors = ( @ts, $self->end );
1338 } elsif ( grep { $_->rank > $endrank } @successors ) {
1339 push( @successors, $self->end );
1340 }
522c1314 1341
1342 my $listitem = { id => $rdg->id, label => $rdg->text };
1343 my $adjacent = [];
8a9a8200 1344 my @endwits;
522c1314 1345 foreach my $succ ( @successors ) {
1346 my @edgewits;
1347 if( $phony eq 'start' ) {
1348 @edgewits = $succ->witnesses;
8a9a8200 1349 } elsif( $self->sequence->has_edge( $rdg->id, $succ->id ) ) {
522c1314 1350 @edgewits = $self->path_witnesses( $rdg->id, $succ->id );
1351 }
8a9a8200 1352
1353 if( $succ eq $self->end ) {
1354 @edgewits = @endwits;
1355 } elsif( $succ->rank > $endrank ) {
1356 # These witnesses will point to 'end' instead, not to the
1357 # actual successor.
1358 push( @endwits, @edgewits );
1359 next;
1360 }
522c1314 1361 my $edgelabel = $self->_path_display_label( $opts, @edgewits );
1362 my $edgedef = { id => $succ->id, label => $edgelabel };
1363 my $rankoffset = $succ->rank - $rdg->rank;
1364 if( $rankoffset > 1 and $succ ne $self->end ) {
1365 $edgedef->{minlen} = $rankoffset;
1366 }
1367 push( @$adjacent, $edgedef );
1368 }
1369 $listitem->{adjacent} = $adjacent;
1370 push( @$list, $listitem );
bf6e338d 1371 }
522c1314 1372 return to_json( $list );
1373}
8f9cab7b 1374
4e5a7b2c 1375=head2 as_graphml
8e1394aa 1376
4e5a7b2c 1377Returns a GraphML representation of the collation. The GraphML will contain
1378two graphs. The first expresses the attributes of the readings and the witness
1379paths that link them; the second expresses the relationships that link the
1380readings. This is the native transfer format for a tradition.
8e1394aa 1381
56eefa04 1382=begin testing
1383
1384use Text::Tradition;
951ddfe8 1385use TryCatch;
56eefa04 1386
1387my $READINGS = 311;
1388my $PATHS = 361;
1389
1390my $datafile = 't/data/florilegium_tei_ps.xml';
1391my $tradition = Text::Tradition->new( 'input' => 'TEI',
1392 'name' => 'test0',
1393 'file' => $datafile,
1394 'linear' => 1 );
1395
1396ok( $tradition, "Got a tradition object" );
1397is( scalar $tradition->witnesses, 13, "Found all witnesses" );
1398ok( $tradition->collation, "Tradition has a collation" );
1399
1400my $c = $tradition->collation;
1401is( scalar $c->readings, $READINGS, "Collation has all readings" );
1402is( scalar $c->paths, $PATHS, "Collation has all paths" );
1403is( scalar $c->relationships, 0, "Collation has all relationships" );
1404
1405# Add a few relationships
1406$c->add_relationship( 'w123', 'w125', { 'type' => 'collated' } );
1407$c->add_relationship( 'w193', 'w196', { 'type' => 'collated' } );
b71e7ea8 1408$c->add_relationship( 'w257', 'w262', { 'type' => 'transposition',
1409 'is_significant' => 'yes' } );
56eefa04 1410
1411# Now write it to GraphML and parse it again.
1412
1413my $graphml = $c->as_graphml;
1414my $st = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml );
1415is( scalar $st->collation->readings, $READINGS, "Reparsed collation has all readings" );
1416is( scalar $st->collation->paths, $PATHS, "Reparsed collation has all paths" );
1417is( scalar $st->collation->relationships, 3, "Reparsed collation has new relationships" );
b71e7ea8 1418my $sigrel = $st->collation->get_relationship( 'w257', 'w262' );
1419is( $sigrel->is_significant, 'yes', "Ternary attribute value was restored" );
56eefa04 1420
9fef629b 1421# Now add a stemma, write to GraphML, and look at the output.
951ddfe8 1422SKIP: {
37bf09f4 1423 skip "Analysis module not present", 3 unless $tradition->can( 'add_stemma' );
951ddfe8 1424 my $stemma = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
1425 is( ref( $stemma ), 'Text::Tradition::Stemma', "Parsed dotfile into stemma" );
1426 is( $tradition->stemmata, 1, "Tradition now has the stemma" );
1427 $graphml = $c->as_graphml;
1428 like( $graphml, qr/digraph/, "Digraph declaration exists in GraphML" );
1429}
2a812726 1430
56eefa04 1431=end testing
1432
8e1394aa 1433=cut
1434
a445ce40 1435## TODO MOVE this to Tradition.pm and modularize it better
8e1394aa 1436sub as_graphml {
a30ca502 1437 my( $self, $options ) = @_;
3d14b48e 1438 $self->calculate_ranks unless $self->_graphcalc_done;
1439
a30ca502 1440 my $start = $options->{'from'}
1441 ? $self->reading( $options->{'from'} ) : $self->start;
1442 my $end = $options->{'to'}
1443 ? $self->reading( $options->{'to'} ) : $self->end;
1444 if( $start->has_rank && $end->has_rank && $end->rank < $start->rank ) {
1445 throw( 'Start node must be before end node' );
1446 }
1447 # The readings need to be ranked for this to work.
1448 $start = $self->start unless $start->has_rank;
1449 $end = $self->end unless $end->has_rank;
414cc046 1450 my $rankoffset = 0;
1451 unless( $start eq $self->start ) {
1452 $rankoffset = $start->rank - 1;
1453 }
a30ca502 1454 my %use_readings;
1455
8e1394aa 1456 # Some namespaces
1457 my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns';
1458 my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance';
1459 my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' .
910a0a6d 1460 'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
8e1394aa 1461
1462 # Create the document and root node
428bcf0b 1463 require XML::LibXML;
8e1394aa 1464 my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
1465 my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
1466 $graphml->setDocumentElement( $root );
1467 $root->setNamespace( $xsi_ns, 'xsi', 0 );
1468 $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
bbd064a9 1469
1470 # List of attribute types to save on our objects and their corresponding
1471 # GraphML types
1472 my %save_types = (
1473 'Str' => 'string',
1474 'Int' => 'int',
1475 'Bool' => 'boolean',
10e4b1ac 1476 'ReadingID' => 'string',
bbd064a9 1477 'RelationshipType' => 'string',
1478 'RelationshipScope' => 'string',
b71e7ea8 1479 'Ternary' => 'string',
bbd064a9 1480 );
1481
bbd064a9 1482 # Add the data keys for the graph. Include an extra key 'version' for the
1483 # GraphML output version.
e309421a 1484 my %graph_data_keys;
1485 my $gdi = 0;
bbd064a9 1486 my %graph_attributes = ( 'version' => 'string' );
1487 # Graph attributes include those of Tradition and those of Collation.
1488 my %gattr_from;
f97ef19e 1489 # TODO Use meta introspection method from duplicate_reading to do this
1490 # instead of naming custom keys.
bbd064a9 1491 my $tmeta = $self->tradition->meta;
1492 my $cmeta = $self->meta;
1493 map { $gattr_from{$_->name} = 'Tradition' } $tmeta->get_all_attributes;
1494 map { $gattr_from{$_->name} = 'Collation' } $cmeta->get_all_attributes;
1495 foreach my $attr ( ( $tmeta->get_all_attributes, $cmeta->get_all_attributes ) ) {
1496 next if $attr->name =~ /^_/;
bbd064a9 1497 next unless $save_types{$attr->type_constraint->name};
1498 $graph_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1499 }
9fef629b 1500 # Extra custom keys for complex objects that should be saved in some form.
1501 # The subroutine should return a string, or undef/empty.
951ddfe8 1502 if( $tmeta->has_method('stemmata') ) {
1503 $graph_attributes{'stemmata'} = sub {
1504 my @stemstrs;
1505 map { push( @stemstrs, $_->editable( {linesep => ''} ) ) }
1506 $self->tradition->stemmata;
1507 join( "\n", @stemstrs );
1508 };
1509 }
1510
8943ff68 1511 if( $tmeta->has_method('user') ) {
1512 $graph_attributes{'user'} = sub {
1513 $self->tradition->user ? $self->tradition->user->id : undef
1514 };
1515 }
bbd064a9 1516
1517 foreach my $datum ( sort keys %graph_attributes ) {
e309421a 1518 $graph_data_keys{$datum} = 'dg'.$gdi++;
1519 my $key = $root->addNewChild( $graphml_ns, 'key' );
9fef629b 1520 my $dtype = ref( $graph_attributes{$datum} ) ? 'string'
1521 : $graph_attributes{$datum};
e309421a 1522 $key->setAttribute( 'attr.name', $datum );
9fef629b 1523 $key->setAttribute( 'attr.type', $dtype );
e309421a 1524 $key->setAttribute( 'for', 'graph' );
1525 $key->setAttribute( 'id', $graph_data_keys{$datum} );
1526 }
f6066bac 1527
bbd064a9 1528 # Add the data keys for reading nodes
1529 my %reading_attributes;
1530 my $rmeta = Text::Tradition::Collation::Reading->meta;
1531 foreach my $attr( $rmeta->get_all_attributes ) {
1532 next if $attr->name =~ /^_/;
bbd064a9 1533 next unless $save_types{$attr->type_constraint->name};
1534 $reading_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1535 }
a445ce40 1536 if( $self->start->does('Text::Tradition::Morphology' ) ) {
1537 # Extra custom key for the reading morphology
1538 $reading_attributes{'lexemes'} = 'string';
1539 }
7cd9f181 1540
ef9d481f 1541 my %node_data_keys;
1542 my $ndi = 0;
bbd064a9 1543 foreach my $datum ( sort keys %reading_attributes ) {
910a0a6d 1544 $node_data_keys{$datum} = 'dn'.$ndi++;
1545 my $key = $root->addNewChild( $graphml_ns, 'key' );
1546 $key->setAttribute( 'attr.name', $datum );
bbd064a9 1547 $key->setAttribute( 'attr.type', $reading_attributes{$datum} );
910a0a6d 1548 $key->setAttribute( 'for', 'node' );
1549 $key->setAttribute( 'id', $node_data_keys{$datum} );
8e1394aa 1550 }
1551
bbd064a9 1552 # Add the data keys for edges, that is, paths and relationships. Path
1553 # data does not come from a Moose class so is here manually.
ef9d481f 1554 my $edi = 0;
1555 my %edge_data_keys;
bbd064a9 1556 my %edge_attributes = (
3a2ebbf4 1557 witness => 'string', # ID/label for a path
3a2ebbf4 1558 extra => 'boolean', # Path key
3a2ebbf4 1559 );
bbd064a9 1560 my @path_attributes = keys %edge_attributes; # track our manual additions
1561 my $pmeta = Text::Tradition::Collation::Relationship->meta;
1562 foreach my $attr( $pmeta->get_all_attributes ) {
1563 next if $attr->name =~ /^_/;
bbd064a9 1564 next unless $save_types{$attr->type_constraint->name};
1565 $edge_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1566 }
1567 foreach my $datum ( sort keys %edge_attributes ) {
3a2ebbf4 1568 $edge_data_keys{$datum} = 'de'.$edi++;
910a0a6d 1569 my $key = $root->addNewChild( $graphml_ns, 'key' );
3a2ebbf4 1570 $key->setAttribute( 'attr.name', $datum );
bbd064a9 1571 $key->setAttribute( 'attr.type', $edge_attributes{$datum} );
910a0a6d 1572 $key->setAttribute( 'for', 'edge' );
3a2ebbf4 1573 $key->setAttribute( 'id', $edge_data_keys{$datum} );
8e1394aa 1574 }
3a2ebbf4 1575
cc31ebaa 1576 # Add the collation graph itself. First, sanitize the name to a valid XML ID.
1577 my $xmlidname = $self->tradition->name;
1578 $xmlidname =~ s/(?!$xml10_namechar_rx)./_/g;
1579 if( $xmlidname !~ /^$xml10_namestartchar_rx/ ) {
1580 $xmlidname = '_'.$xmlidname;
1581 }
2c669bca 1582 my $sgraph = $root->addNewChild( $graphml_ns, 'graph' );
1583 $sgraph->setAttribute( 'edgedefault', 'directed' );
cc31ebaa 1584 $sgraph->setAttribute( 'id', $xmlidname );
2c669bca 1585 $sgraph->setAttribute( 'parse.edgeids', 'canonical' );
cc31ebaa 1586 $sgraph->setAttribute( 'parse.edges', 0 ); # fill in later
2c669bca 1587 $sgraph->setAttribute( 'parse.nodeids', 'canonical' );
cc31ebaa 1588 $sgraph->setAttribute( 'parse.nodes', 0 ); # fill in later
2c669bca 1589 $sgraph->setAttribute( 'parse.order', 'nodesfirst' );
22222af9 1590
2a812726 1591 # Tradition/collation attribute data
bbd064a9 1592 foreach my $datum ( keys %graph_attributes ) {
1593 my $value;
1594 if( $datum eq 'version' ) {
2a812726 1595 $value = '3.2';
9fef629b 1596 } elsif( ref( $graph_attributes{$datum} ) ) {
1597 my $sub = $graph_attributes{$datum};
1598 $value = &$sub();
bbd064a9 1599 } elsif( $gattr_from{$datum} eq 'Tradition' ) {
1600 $value = $self->tradition->$datum;
1601 } else {
1602 $value = $self->$datum;
1603 }
2c669bca 1604 _add_graphml_data( $sgraph, $graph_data_keys{$datum}, $value );
e309421a 1605 }
8e1394aa 1606
1607 my $node_ctr = 0;
1608 my %node_hash;
22222af9 1609 # Add our readings to the graph
3a2ebbf4 1610 foreach my $n ( sort { $a->id cmp $b->id } $self->readings ) {
a30ca502 1611 next if $n->has_rank && $n ne $self->start && $n ne $self->end &&
1612 ( $n->rank < $start->rank || $n->rank > $end->rank );
1613 $use_readings{$n->id} = 1;
2c669bca 1614 # Add to the main graph
1615 my $node_el = $sgraph->addNewChild( $graphml_ns, 'node' );
910a0a6d 1616 my $node_xmlid = 'n' . $node_ctr++;
3a2ebbf4 1617 $node_hash{ $n->id } = $node_xmlid;
910a0a6d 1618 $node_el->setAttribute( 'id', $node_xmlid );
bbd064a9 1619 foreach my $d ( keys %reading_attributes ) {
255875b8 1620 my $nval = $n->$d;
7cd9f181 1621 # Custom serialization
1622 if( $d eq 'lexemes' ) {
1623 # If nval is a true value, we have lexemes so we need to
1624 # serialize them. Otherwise set nval to undef so that the
1625 # key is excluded from this reading.
1626 $nval = $nval ? $n->_serialize_lexemes : undef;
18c64d55 1627 } elsif( $d eq 'normal_form' && $n->normal_form eq $n->text ) {
1628 $nval = undef;
7cd9f181 1629 }
cc31ebaa 1630 if( $rankoffset && $d eq 'rank' && $n ne $self->start ) {
414cc046 1631 # Adjust the ranks within the subgraph.
cc31ebaa 1632 $nval = $n eq $self->end ? $end->rank - $rankoffset + 1
1633 : $nval - $rankoffset;
414cc046 1634 }
255875b8 1635 _add_graphml_data( $node_el, $node_data_keys{$d}, $nval )
1636 if defined $nval;
1637 }
b15511bf 1638 }
1639
2c669bca 1640 # Add the path edges to the sequence graph
df6d9812 1641 my $edge_ctr = 0;
3a2ebbf4 1642 foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->sequence->edges() ) {
1643 # We add an edge in the graphml for every witness in $e.
a30ca502 1644 next unless( $use_readings{$e->[0]} || $use_readings{$e->[1]} );
1645 my @edge_wits = sort $self->path_witnesses( $e );
cc31ebaa 1646 $e->[0] = $self->start->id unless $use_readings{$e->[0]};
1647 $e->[1] = $self->end->id unless $use_readings{$e->[1]};
1648 # Skip any path from start to end; that witness is not in the subgraph.
1649 next if ( $e->[0] eq $self->start->id && $e->[1] eq $self->end->id );
a30ca502 1650 foreach my $wit ( @edge_wits ) {
3a2ebbf4 1651 my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
1652 $node_hash{ $e->[0] },
1653 $node_hash{ $e->[1] } );
2c669bca 1654 my $edge_el = $sgraph->addNewChild( $graphml_ns, 'edge' );
3a2ebbf4 1655 $edge_el->setAttribute( 'source', $from );
1656 $edge_el->setAttribute( 'target', $to );
1657 $edge_el->setAttribute( 'id', $id );
3a2ebbf4 1658
1659 # It's a witness path, so add the witness
1660 my $base = $wit;
1661 my $key = $edge_data_keys{'witness'};
1662 # Is this an ante-corr witness?
1663 my $aclabel = $self->ac_label;
1664 if( $wit =~ /^(.*)\Q$aclabel\E$/ ) {
1665 # Keep the base witness
1666 $base = $1;
1667 # ...and record that this is an 'extra' reading path
1668 _add_graphml_data( $edge_el, $edge_data_keys{'extra'}, $aclabel );
1669 }
1670 _add_graphml_data( $edge_el, $edge_data_keys{'witness'}, $base );
1671 }
1672 }
1673
cc31ebaa 1674 # Report the actual number of nodes and edges that went in
1675 $sgraph->setAttribute( 'parse.edges', $edge_ctr );
1676 $sgraph->setAttribute( 'parse.nodes', $node_ctr );
1677
22222af9 1678 # Add the relationship graph to the XML
bbd064a9 1679 map { delete $edge_data_keys{$_} } @path_attributes;
826d8773 1680 $self->relations->_as_graphml( $graphml_ns, $root, \%node_hash,
1681 $node_data_keys{'id'}, \%edge_data_keys );
8e1394aa 1682
94c00c71 1683 # Save and return the thing
1684 my $result = decode_utf8( $graphml->toString(1) );
94c00c71 1685 return $result;
df6d9812 1686}
1687
b15511bf 1688sub _add_graphml_data {
1689 my( $el, $key, $value ) = @_;
b15511bf 1690 return unless defined $value;
c9bf3dbf 1691 my $data_el = $el->addNewChild( $el->namespaceURI, 'data' );
b15511bf 1692 $data_el->setAttribute( 'key', $key );
1693 $data_el->appendText( $value );
8e1394aa 1694}
1695
4e5a7b2c 1696=head2 as_csv
910a0a6d 1697
1698Returns a CSV alignment table representation of the collation graph, one
2c669bca 1699row per witness (or witness uncorrected.)
910a0a6d 1700
ce5966fb 1701=head2 as_tsv
1702
1703Returns a tab-separated alignment table representation of the collation graph,
1704one row per witness (or witness uncorrected.)
1705
16203db5 1706=begin testing
1707
1708use Text::Tradition;
34ca808b 1709use Text::CSV;
16203db5 1710
1711my $READINGS = 311;
1712my $PATHS = 361;
34ca808b 1713my $WITS = 13;
1714my $WITAC = 4;
16203db5 1715
1716my $datafile = 't/data/florilegium_tei_ps.xml';
1717my $tradition = Text::Tradition->new( 'input' => 'TEI',
1718 'name' => 'test0',
1719 'file' => $datafile,
1720 'linear' => 1 );
1721
1722my $c = $tradition->collation;
1723# Export the thing to CSV
1724my $csvstr = $c->as_csv();
34ca808b 1725# Count the columns
1726my $csv = Text::CSV->new({ sep_char => ',', binary => 1 });
1727my @lines = split(/\n/, $csvstr );
1728ok( $csv->parse( $lines[0] ), "Successfully parsed first line of CSV" );
1729is( scalar( $csv->fields ), $WITS + $WITAC, "CSV has correct number of witness columns" );
cbc8e08f 1730my @q_ac = grep { $_ eq 'Q'.$c->ac_label } $csv->fields;
1731ok( @q_ac, "Found a layered witness" );
1732
16203db5 1733my $t2 = Text::Tradition->new( input => 'Tabular',
1734 name => 'test2',
1735 string => $csvstr,
1736 sep_char => ',' );
1737is( scalar $t2->collation->readings, $READINGS, "Reparsed CSV collation has all readings" );
1738is( scalar $t2->collation->paths, $PATHS, "Reparsed CSV collation has all paths" );
1739
1740# Now do it with TSV
1741my $tsvstr = $c->as_tsv();
1742my $t3 = Text::Tradition->new( input => 'Tabular',
1743 name => 'test3',
1744 string => $tsvstr,
1745 sep_char => "\t" );
1746is( scalar $t3->collation->readings, $READINGS, "Reparsed TSV collation has all readings" );
1747is( scalar $t3->collation->paths, $PATHS, "Reparsed TSV collation has all paths" );
1748
4e64b669 1749my $table = $c->alignment_table;
34ca808b 1750my $noaccsv = $c->as_csv({ noac => 1 });
1751my @noaclines = split(/\n/, $noaccsv );
1752ok( $csv->parse( $noaclines[0] ), "Successfully parsed first line of no-ac CSV" );
1753is( scalar( $csv->fields ), $WITS, "CSV has correct number of witness columns" );
4e64b669 1754is( $c->alignment_table, $table, "Request for CSV did not alter the alignment table" );
34ca808b 1755
cbc8e08f 1756my $safecsv = $c->as_csv({ safe_ac => 1});
1757my @safelines = split(/\n/, $safecsv );
1758ok( $csv->parse( $safelines[0] ), "Successfully parsed first line of safe CSV" );
1759is( scalar( $csv->fields ), $WITS + $WITAC, "CSV has correct number of witness columns" );
1760@q_ac = grep { $_ eq 'Q__L' } $csv->fields;
1761ok( @q_ac, "Found a sanitized layered witness" );
1762is( $c->alignment_table, $table, "Request for CSV did not alter the alignment table" );
16203db5 1763
de20588d 1764# Test relationship collapse
1765$c->add_relationship( $c->readings_at_rank( 37 ), { type => 'spelling' } );
1766$c->add_relationship( $c->readings_at_rank( 60 ), { type => 'spelling' } );
1767
1768my $mergedtsv = $c->as_tsv({mergetypes => [ 'spelling', 'orthographic' ] });
1769my $t4 = Text::Tradition->new( input => 'Tabular',
1770 name => 'test4',
1771 string => $mergedtsv,
1772 sep_char => "\t" );
1773is( scalar $t4->collation->readings, $READINGS - 2, "Reparsed TSV merge collation has fewer readings" );
1774is( scalar $t4->collation->paths, $PATHS - 4, "Reparsed TSV merge collation has fewer paths" );
1775
53ec2b6d 1776# Test non-ASCII sigla
1777my $t5 = Text::Tradition->new( input => 'Tabular',
1778 name => 'nonascii',
1779 file => 't/data/armexample.xlsx',
1780 excel => 'xlsx' );
1781my $awittsv = $t5->collation->as_tsv({ noac => 1, ascii => 1 });
1782my @awitlines = split( /\n/, $awittsv );
1783like( $awitlines[0], qr/_A_5315622/, "Found ASCII sigil variant in TSV" );
1784
16203db5 1785=end testing
1786
910a0a6d 1787=cut
1788
ce5966fb 1789sub _tabular {
34ca808b 1790 my( $self, $opts ) = @_;
1791 my $table = $self->alignment_table( $opts );
16203db5 1792 my $csv_options = { binary => 1, quote_null => 0 };
34ca808b 1793 $csv_options->{'sep_char'} = $opts->{fieldsep};
1794 if( $opts->{fieldsep} eq "\t" ) {
16203db5 1795 # If it is really tab separated, nothing is an escape char.
1796 $csv_options->{'quote_char'} = undef;
1797 $csv_options->{'escape_char'} = '';
1798 }
1799 my $csv = Text::CSV->new( $csv_options );
910a0a6d 1800 my @result;
53ec2b6d 1801
2c669bca 1802 # Make the header row
53ec2b6d 1803 my @witnesses = map { $_->{'witness'} } @{$table->{'alignment'}};
1804 if( $opts->{ascii} ) {
1805 # TODO think of a fix for this
1806 throw( "Cannot currently produce ASCII sigla with witness layers" )
1807 unless $opts->{noac};
1808 my @awits = map { $self->tradition->witness( $_ )->ascii_sigil } @witnesses;
1809 @witnesses = @awits;
1810 }
1811 $csv->combine( @witnesses );
43800a64 1812 push( @result, $csv->string );
53ec2b6d 1813
2c669bca 1814 # Make the rest of the rows
1815 foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
566f4595 1816 my @rowobjs = map { $_->{'tokens'}->[$idx] } @{$table->{'alignment'}};
1dd07bda 1817 my @row = map { $_ ? $_->{'t'}->text : $_ } @rowobjs;
de20588d 1818 # Quick and dirty collapse of requested relationship types
1819 if( ref( $opts->{mergetypes} ) eq 'ARRAY' ) {
1820 # Now substitute the reading in the relevant index of @row
1821 # for its merge-related reading
1822 my %substitutes;
1823 while( @rowobjs ) {
1824 my $thisr = shift @rowobjs;
1825 next unless $thisr;
1826 next if exists $substitutes{$thisr->{t}->text};
1827 # Make sure we don't have A <-> B substitutions.
1828 $substitutes{$thisr->{t}->text} = $thisr->{t}->text;
1829 foreach my $thatr ( @rowobjs ) {
1830 next unless $thatr;
1831 next if exists $substitutes{$thatr->{t}->text};
1832 my $ttrel = $self->get_relationship( $thisr->{t}, $thatr->{t} );
1833 next unless $ttrel;
1834 next unless grep { $ttrel->type eq $_ } @{$opts->{mergetypes}};
1835 # If we have got this far then we need to merge them.
1836 $substitutes{$thatr->{t}->text} = $thisr->{t}->text;
1837 }
1838 }
1839 @row = map { $_ && exists $substitutes{$_} ? $substitutes{$_} : $_ } @row;
1840 }
2c669bca 1841 $csv->combine( @row );
43800a64 1842 push( @result, $csv->string );
910a0a6d 1843 }
3a2ebbf4 1844 return join( "\n", @result );
910a0a6d 1845}
1846
ce5966fb 1847sub as_csv {
1848 my $self = shift;
34ca808b 1849 my $opts = shift || {};
1850 $opts->{fieldsep} = ',';
1851 return $self->_tabular( $opts );
ce5966fb 1852}
1853
1854sub as_tsv {
1855 my $self = shift;
34ca808b 1856 my $opts = shift || {};
1857 $opts->{fieldsep} = "\t";
1858 return $self->_tabular( $opts );
ce5966fb 1859}
1860
248276a2 1861=head2 alignment_table
2c669bca 1862
566f4595 1863Return a reference to an alignment table, in a slightly enhanced CollateX
1864format which looks like this:
1865
1866 $table = { alignment => [ { witness => "SIGIL",
4e5a7b2c 1867 tokens => [ { t => "TEXT" }, ... ] },
566f4595 1868 { witness => "SIG2",
4e5a7b2c 1869 tokens => [ { t => "TEXT" }, ... ] },
566f4595 1870 ... ],
1871 length => TEXTLEN };
1872
2c669bca 1873=cut
9f3ba6f7 1874
1dd07bda 1875sub alignment_table {
34ca808b 1876 my( $self, $opts ) = @_;
4e64b669 1877 if( $self->has_cached_table ) {
cbc8e08f 1878 return $self->cached_table
1879 unless $opts->{noac} || $opts->{safe_ac};
4e64b669 1880 }
1dd07bda 1881
0ecb975c 1882 # Make sure we can do this
1883 throw( "Need a linear graph in order to make an alignment table" )
1884 unless $self->linear;
b39e7cb5 1885 $self->calculate_ranks()
1886 unless $self->_graphcalc_done && $self->end->has_rank;
1887
2c669bca 1888 my $table = { 'alignment' => [], 'length' => $self->end->rank - 1 };
3a2ebbf4 1889 my @all_pos = ( 1 .. $self->end->rank - 1 );
68454b71 1890 foreach my $wit ( sort { $a->sigil cmp $b->sigil } $self->tradition->witnesses ) {
6771a1b1 1891 # say STDERR "Making witness row(s) for " . $wit->sigil;
1f7aa795 1892 my @wit_path = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
1dd07bda 1893 my @row = _make_witness_row( \@wit_path, \@all_pos );
bed6ce83 1894 my $witobj = { 'witness' => $wit->sigil, 'tokens' => \@row };
1895 $witobj->{'identifier'} = $wit->identifier if $wit->identifier;
1896 push( @{$table->{'alignment'}}, $witobj );
34ca808b 1897 if( $wit->is_layered && !$opts->{noac} ) {
1f7aa795 1898 my @wit_ac_path = $self->reading_sequence( $self->start, $self->end,
861c3e27 1899 $wit->sigil.$self->ac_label );
1dd07bda 1900 my @ac_row = _make_witness_row( \@wit_ac_path, \@all_pos );
cbc8e08f 1901 my $witlabel = $opts->{safe_ac}
1902 ? $wit->sigil . '__L' : $wit->sigil.$self->ac_label;
1903 my $witacobj = { 'witness' => $witlabel,
bed6ce83 1904 'tokens' => \@ac_row };
1905 $witacobj->{'identifier'} = $wit->identifier if $wit->identifier;
1906 push( @{$table->{'alignment'}}, $witacobj );
910a0a6d 1907 }
1908 }
cbc8e08f 1909 unless( $opts->{noac} || $opts->{safe_ac} ) {
4e64b669 1910 $self->cached_table( $table );
1911 }
1dd07bda 1912 return $table;
910a0a6d 1913}
1914
1915sub _make_witness_row {
1dd07bda 1916 my( $path, $positions ) = @_;
910a0a6d 1917 my %char_hash;
1918 map { $char_hash{$_} = undef } @$positions;
2c669bca 1919 my $debug = 0;
910a0a6d 1920 foreach my $rdg ( @$path ) {
6771a1b1 1921 say STDERR "rank " . $rdg->rank if $debug;
1922 # say STDERR "No rank for " . $rdg->id unless defined $rdg->rank;
1dd07bda 1923 $char_hash{$rdg->rank} = { 't' => $rdg };
910a0a6d 1924 }
1925 my @row = map { $char_hash{$_} } @$positions;
eca16057 1926 # Fill in lacuna markers for undef spots in the row
1927 my $last_el = shift @row;
1928 my @filled_row = ( $last_el );
1929 foreach my $el ( @row ) {
0e476982 1930 # If we are using node reference, make the lacuna node appear many times
1931 # in the table. If not, use the lacuna tag.
1dd07bda 1932 if( $last_el && $last_el->{'t'}->is_lacuna && !defined $el ) {
1933 $el = $last_el;
eca16057 1934 }
1935 push( @filled_row, $el );
1936 $last_el = $el;
1937 }
1938 return @filled_row;
910a0a6d 1939}
1940
248276a2 1941
4e5a7b2c 1942=head1 NAVIGATION METHODS
910a0a6d 1943
4e5a7b2c 1944=head2 reading_sequence( $first, $last, $sigil, $backup )
e2902068 1945
1946Returns the ordered list of readings, starting with $first and ending
4e5a7b2c 1947with $last, for the witness given in $sigil. If a $backup sigil is
1948specified (e.g. when walking a layered witness), it will be used wherever
1949no $sigil path exists. If there is a base text reading, that will be
1950used wherever no path exists for $sigil or $backup.
e2902068 1951
1952=cut
1953
910a0a6d 1954# TODO Think about returning some lazy-eval iterator.
b0b4421a 1955# TODO Get rid of backup; we should know from what witness is whether we need it.
910a0a6d 1956
e2902068 1957sub reading_sequence {
861c3e27 1958 my( $self, $start, $end, $witness ) = @_;
e2902068 1959
930ff666 1960 $witness = $self->baselabel unless $witness;
e2902068 1961 my @readings = ( $start );
1962 my %seen;
1963 my $n = $start;
3a2ebbf4 1964 while( $n && $n->id ne $end->id ) {
1965 if( exists( $seen{$n->id} ) ) {
63778331 1966 throw( "Detected loop for $witness at " . $n->id );
910a0a6d 1967 }
3a2ebbf4 1968 $seen{$n->id} = 1;
910a0a6d 1969
861c3e27 1970 my $next = $self->next_reading( $n, $witness );
44771cf2 1971 unless( $next ) {
63778331 1972 throw( "Did not find any path for $witness from reading " . $n->id );
44771cf2 1973 }
910a0a6d 1974 push( @readings, $next );
1975 $n = $next;
e2902068 1976 }
1977 # Check that the last reading is our end reading.
1978 my $last = $readings[$#readings];
63778331 1979 throw( "Last reading found from " . $start->text .
1980 " for witness $witness is not the end!" ) # TODO do we get this far?
3a2ebbf4 1981 unless $last->id eq $end->id;
e2902068 1982
1983 return @readings;
1984}
1985
522c1314 1986=head2 readings_at_rank( $rank )
1987
1988Returns a list of readings at a given rank, taken from the alignment table.
1989
1990=cut
1991
1992sub readings_at_rank {
8a9a8200 1993 my( $self, $rank, $nolacuna ) = @_;
522c1314 1994 my $table = $self->alignment_table;
1995 # Table rank is real rank - 1.
1996 my @elements = map { $_->{'tokens'}->[$rank-1] } @{$table->{'alignment'}};
1997 my %readings;
1998 foreach my $e ( @elements ) {
1999 next unless ref( $e ) eq 'HASH';
2000 next unless exists $e->{'t'};
8a9a8200 2001 my $rdg = $e->{'t'};
2002 next if $nolacuna && $rdg->is_lacuna && $rdg->rank ne $rank;
522c1314 2003 $readings{$e->{'t'}->id} = $e->{'t'};
2004 }
2005 return values %readings;
2006}
2007
4e5a7b2c 2008=head2 next_reading( $reading, $sigil );
8e1394aa 2009
4a8828f0 2010Returns the reading that follows the given reading along the given witness
930ff666 2011path.
8e1394aa 2012
2013=cut
2014
4a8828f0 2015sub next_reading {
e2902068 2016 # Return the successor via the corresponding path.
8e1394aa 2017 my $self = shift;
3a2ebbf4 2018 my $answer = $self->_find_linked_reading( 'next', @_ );
2c669bca 2019 return undef unless $answer;
3a2ebbf4 2020 return $self->reading( $answer );
8e1394aa 2021}
2022
4e5a7b2c 2023=head2 prior_reading( $reading, $sigil )
8e1394aa 2024
4a8828f0 2025Returns the reading that precedes the given reading along the given witness
930ff666 2026path.
8e1394aa 2027
2028=cut
2029
4a8828f0 2030sub prior_reading {
e2902068 2031 # Return the predecessor via the corresponding path.
8e1394aa 2032 my $self = shift;
3a2ebbf4 2033 my $answer = $self->_find_linked_reading( 'prior', @_ );
2034 return $self->reading( $answer );
8e1394aa 2035}
2036
4a8828f0 2037sub _find_linked_reading {
861c3e27 2038 my( $self, $direction, $node, $path ) = @_;
2039
2040 # Get a backup if we are dealing with a layered witness
2041 my $alt_path;
2042 my $aclabel = $self->ac_label;
2043 if( $path && $path =~ /^(.*)\Q$aclabel\E$/ ) {
2044 $alt_path = $1;
2045 }
2046
e2902068 2047 my @linked_paths = $direction eq 'next'
3a2ebbf4 2048 ? $self->sequence->edges_from( $node )
2049 : $self->sequence->edges_to( $node );
e2902068 2050 return undef unless scalar( @linked_paths );
8e1394aa 2051
e2902068 2052 # We have to find the linked path that contains all of the
2053 # witnesses supplied in $path.
2054 my( @path_wits, @alt_path_wits );
4e5a7b2c 2055 @path_wits = sort( $self->_witnesses_of_label( $path ) ) if $path;
2056 @alt_path_wits = sort( $self->_witnesses_of_label( $alt_path ) ) if $alt_path;
e2902068 2057 my $base_le;
2058 my $alt_le;
2059 foreach my $le ( @linked_paths ) {
3a2ebbf4 2060 if( $self->sequence->has_edge_attribute( @$le, $self->baselabel ) ) {
910a0a6d 2061 $base_le = $le;
910a0a6d 2062 }
508fd430 2063 my @le_wits = sort $self->path_witnesses( $le );
3a2ebbf4 2064 if( _is_within( \@path_wits, \@le_wits ) ) {
2065 # This is the right path.
2066 return $direction eq 'next' ? $le->[1] : $le->[0];
2067 } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) {
2068 $alt_le = $le;
2069 }
8e1394aa 2070 }
e2902068 2071 # Got this far? Return the alternate path if it exists.
3a2ebbf4 2072 return $direction eq 'next' ? $alt_le->[1] : $alt_le->[0]
910a0a6d 2073 if $alt_le;
e2902068 2074
2075 # Got this far? Return the base path if it exists.
3a2ebbf4 2076 return $direction eq 'next' ? $base_le->[1] : $base_le->[0]
910a0a6d 2077 if $base_le;
e2902068 2078
2079 # Got this far? We have no appropriate path.
2c669bca 2080 warn "Could not find $direction node from " . $node->id
910a0a6d 2081 . " along path $path";
8e1394aa 2082 return undef;
2083}
2084
4a8828f0 2085# Some set logic.
2086sub _is_within {
2087 my( $set1, $set2 ) = @_;
7854e12e 2088 my $ret = @$set1; # will be 0, i.e. false, if set1 is empty
4a8828f0 2089 foreach my $el ( @$set1 ) {
910a0a6d 2090 $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
4a8828f0 2091 }
2092 return $ret;
2093}
2094
4e5a7b2c 2095# Return the string that joins together a list of witnesses for
2096# display on a single path.
2097sub _witnesses_of_label {
2098 my( $self, $label ) = @_;
2099 my $regex = $self->wit_list_separator;
2100 my @answer = split( /\Q$regex\E/, $label );
2101 return @answer;
b0b4421a 2102}
2103
d4b75f44 2104=head2 common_readings
2105
2106Returns the list of common readings in the graph (i.e. those readings that are
2107shared by all non-lacunose witnesses.)
2108
2109=cut
2110
2111sub common_readings {
2112 my $self = shift;
2113 my @common = grep { $_->is_common } $self->readings;
2114 return @common;
2115}
2116
fae52efd 2117=head2 path_text( $sigil, [, $start, $end ] )
b0b4421a 2118
2119Returns the text of a witness (plus its backup, if we are using a layer)
2120as stored in the collation. The text is returned as a string, where the
2121individual readings are joined with spaces and the meta-readings (e.g.
2122lacunae) are omitted. Optional specification of $start and $end allows
2123the generation of a subset of the witness text.
4e5a7b2c 2124
b0b4421a 2125=cut
2126
2127sub path_text {
861c3e27 2128 my( $self, $wit, $start, $end ) = @_;
b0b4421a 2129 $start = $self->start unless $start;
2130 $end = $self->end unless $end;
861c3e27 2131 my @path = grep { !$_->is_meta } $self->reading_sequence( $start, $end, $wit );
629e27b0 2132 my $pathtext = '';
2133 my $last;
2134 foreach my $r ( @path ) {
6ad2ce78 2135 unless ( $r->join_prior || !$last || $last->join_next ) {
2136 $pathtext .= ' ';
2137 }
2138 $pathtext .= $r->text;
629e27b0 2139 $last = $r;
2140 }
2141 return $pathtext;
b0b4421a 2142}
4e5a7b2c 2143
2144=head1 INITIALIZATION METHODS
2145
2146These are mostly for use by parsers.
2147
2148=head2 make_witness_path( $witness )
2149
2150Link the array of readings contained in $witness->path (and in
2151$witness->uncorrected_path if it exists) into collation paths.
2152Clear out the arrays when finished.
de51424a 2153
4e5a7b2c 2154=head2 make_witness_paths
2155
2156Call make_witness_path for all witnesses in the tradition.
2157
2158=cut
930ff666 2159
7e450e44 2160# For use when a collation is constructed from a base text and an apparatus.
2161# We have the sequences of readings and just need to add path edges.
1f7aa795 2162# When we are done, clear out the witness path attributes, as they are no
2163# longer needed.
2164# TODO Find a way to replace the witness path attributes with encapsulated functions?
e2902068 2165
6a222840 2166sub make_witness_paths {
2167 my( $self ) = @_;
910a0a6d 2168 foreach my $wit ( $self->tradition->witnesses ) {
6771a1b1 2169 # say STDERR "Making path for " . $wit->sigil;
910a0a6d 2170 $self->make_witness_path( $wit );
7854e12e 2171 }
7854e12e 2172}
2173
6a222840 2174sub make_witness_path {
7854e12e 2175 my( $self, $wit ) = @_;
2176 my @chain = @{$wit->path};
15d2d3df 2177 my $sig = $wit->sigil;
fae52efd 2178 # Add start and end if necessary
2179 unshift( @chain, $self->start ) unless $chain[0] eq $self->start;
2180 push( @chain, $self->end ) unless $chain[-1] eq $self->end;
7854e12e 2181 foreach my $idx ( 0 .. $#chain-1 ) {
910a0a6d 2182 $self->add_path( $chain[$idx], $chain[$idx+1], $sig );
7854e12e 2183 }
1f7aa795 2184 if( $wit->is_layered ) {
d9e873d0 2185 @chain = @{$wit->uncorrected_path};
fae52efd 2186 unshift( @chain, $self->start ) unless $chain[0] eq $self->start;
2187 push( @chain, $self->end ) unless $chain[-1] eq $self->end;
d9e873d0 2188 foreach my $idx( 0 .. $#chain-1 ) {
2189 my $source = $chain[$idx];
2190 my $target = $chain[$idx+1];
2191 $self->add_path( $source, $target, $sig.$self->ac_label )
2192 unless $self->has_path( $source, $target, $sig );
2193 }
15d2d3df 2194 }
1f7aa795 2195 $wit->clear_path;
2196 $wit->clear_uncorrected_path;
e2902068 2197}
2198
4e5a7b2c 2199=head2 calculate_ranks
2200
2201Calculate the reading ranks (that is, their aligned positions relative
2202to each other) for the graph. This can only be called on linear collations.
2203
b365fbae 2204=begin testing
2205
2206use Text::Tradition;
2207
2208my $cxfile = 't/data/Collatex-16.xml';
2209my $t = Text::Tradition->new(
2210 'name' => 'inline',
2211 'input' => 'CollateX',
2212 'file' => $cxfile,
2213 );
2214my $c = $t->collation;
2215
2216# Make an svg
bfcbcecb 2217my $table = $c->alignment_table;
2218ok( $c->has_cached_table, "Alignment table was cached" );
2219is( $c->alignment_table, $table, "Cached table returned upon second call" );
b365fbae 2220$c->calculate_ranks;
bfcbcecb 2221is( $c->alignment_table, $table, "Cached table retained with no rank change" );
864ee4bf 2222$c->add_relationship( 'n13', 'n23', { type => 'repetition' } );
2223is( $c->alignment_table, $table, "Alignment table unchanged after non-colo relationship add" );
2224$c->add_relationship( 'n24', 'n23', { type => 'spelling' } );
2225isnt( $c->alignment_table, $table, "Alignment table changed after colo relationship add" );
b365fbae 2226
2227=end testing
2228
4e5a7b2c 2229=cut
2230
910a0a6d 2231sub calculate_ranks {
2232 my $self = shift;
b365fbae 2233 # Save the existing ranks, in case we need to invalidate the cached SVG.
8d9494a8 2234 throw( "Cannot calculate ranks on a non-linear graph" )
2235 unless $self->linear;
b365fbae 2236 my %existing_ranks;
ac4d7ac2 2237 map { $existing_ranks{$_} = $_->rank } $self->readings;
359944f7 2238
2239 # Do the rankings based on the relationship equivalence graph, starting
2240 # with the start node.
56772e8c 2241 my ( $node_ranks, $rank_nodes ) = $self->relations->equivalence_ranks();
2242
910a0a6d 2243 # Transfer our rankings from the topological graph to the real one.
2244 foreach my $r ( $self->readings ) {
cecbe56d 2245 if( defined $node_ranks->{$self->equivalence( $r->id )} ) {
359944f7 2246 $r->rank( $node_ranks->{$self->equivalence( $r->id )} );
67da8d6c 2247 } else {
63778331 2248 # Die. Find the last rank we calculated.
359944f7 2249 my @all_defined = sort { ( $node_ranks->{$self->equivalence( $a->id )}||-1 )
2250 <=> ( $node_ranks->{$self->equivalence( $b->id )}||-1 ) }
63778331 2251 $self->readings;
2252 my $last = pop @all_defined;
2253 throw( "Ranks not calculated after $last - do you have a cycle in the graph?" );
67da8d6c 2254 }
de51424a 2255 }
bfcbcecb 2256 # Do we need to invalidate the cached data?
be3af600 2257 if( $self->has_cached_table ) {
b365fbae 2258 foreach my $r ( $self->readings ) {
7c293912 2259 next if defined( $existing_ranks{$r} )
2260 && $existing_ranks{$r} == $r->rank;
c1915ab9 2261 # Something has changed, so clear the cache
bfcbcecb 2262 $self->_clear_cache;
c1915ab9 2263 # ...and recalculate the common readings.
2264 $self->calculate_common_readings();
b365fbae 2265 last;
2266 }
2267 }
c1915ab9 2268 # The graph calculation information is now up to date.
2269 $self->_graphcalc_done(1);
8e1394aa 2270}
3a1f2523 2271
c1915ab9 2272sub _clear_cache {
2273 my $self = shift;
c1915ab9 2274 $self->wipe_table if $self->has_cached_table;
2275}
2276
2277
4e5a7b2c 2278=head2 flatten_ranks
2279
2280A convenience method for parsing collation data. Searches the graph for readings
2281with the same text at the same rank, and merges any that are found.
2282
2283=cut
2284
0e476982 2285sub flatten_ranks {
4ef65ab4 2286 my ( $self, %args ) = shift;
0e476982 2287 my %unique_rank_rdg;
bf6e338d 2288 my $changed;
4ef65ab4 2289 foreach my $p ( $self->identical_readings( %args ) ) {
2290 # say STDERR "Combining readings at same rank: @$p";
2291 $changed = 1;
2292 $self->merge_readings( @$p );
2293 # TODO see if this now makes a common point.
7a0956c1 2294 }
2295 # If we merged readings, the ranks are still fine but the alignment
2296 # table is wrong. Wipe it.
2297 $self->wipe_table() if $changed;
2298}
2299
2300=head2 identical_readings
2301=head2 identical_readings( start => $startnode, end => $endnode )
2302=head2 identical_readings( startrank => $startrank, endrank => $endrank )
2303
2304Goes through the graph identifying all pairs of readings that appear to be
2305identical, and therefore able to be merged into a single reading. Returns the
2306relevant identical pairs. Can be restricted to run over only a part of the
2307graph, specified either by node or by rank.
2308
2309=cut
2310
2311sub identical_readings {
2312 my ( $self, %args ) = @_;
2313 # Find where we should start and end.
2314 my $startrank = $args{startrank} || 0;
2315 if( $args{start} ) {
2316 throw( "Starting reading has no rank" ) unless $self->reading( $args{start} )
2317 && $self->reading( $args{start} )->has_rank;
2318 $startrank = $self->reading( $args{start} )->rank;
2319 }
2320 my $endrank = $args{endrank} || $self->end->rank;
2321 if( $args{end} ) {
2322 throw( "Ending reading has no rank" ) unless $self->reading( $args{end} )
2323 && $self->reading( $args{end} )->has_rank;
3c234eb6 2324 $endrank = $self->reading( $args{end} )->rank;
7a0956c1 2325 }
2326
2327 # Make sure the ranks are correct.
2328 unless( $self->_graphcalc_done ) {
2329 $self->calculate_ranks;
2330 }
2331 # Go through the readings looking for duplicates.
2332 my %unique_rank_rdg;
2333 my @pairs;
0e476982 2334 foreach my $rdg ( $self->readings ) {
2335 next unless $rdg->has_rank;
7a0956c1 2336 my $rk = $rdg->rank;
2337 next if $rk > $endrank || $rk < $startrank;
2338 my $key = $rk . "||" . $rdg->text;
0e476982 2339 if( exists $unique_rank_rdg{$key} ) {
07e6765f 2340 # Make sure they don't have different grammatical forms
2341 my $ur = $unique_rank_rdg{$key};
a445ce40 2342 if( $rdg->is_identical( $ur ) ) {
7a0956c1 2343 push( @pairs, [ $ur, $rdg ] );
07e6765f 2344 }
0e476982 2345 } else {
2346 $unique_rank_rdg{$key} = $rdg;
2347 }
7a0956c1 2348 }
2349
2350 return @pairs;
0e476982 2351}
4633f9e4 2352
2353
d4b75f44 2354=head2 calculate_common_readings
2355
2356Goes through the graph identifying the readings that appear in every witness
2357(apart from those with lacunae at that spot.) Marks them as common and returns
2358the list.
2359
2360=begin testing
2361
2362use Text::Tradition;
2363
2364my $cxfile = 't/data/Collatex-16.xml';
2365my $t = Text::Tradition->new(
2366 'name' => 'inline',
2367 'input' => 'CollateX',
2368 'file' => $cxfile,
2369 );
2370my $c = $t->collation;
2371
2372my @common = $c->calculate_common_readings();
2373is( scalar @common, 8, "Found correct number of common readings" );
2374my @marked = sort $c->common_readings();
2375is( scalar @common, 8, "All common readings got marked as such" );
679f17e1 2376my @expected = qw/ n1 n11 n16 n19 n20 n5 n6 n7 /;
d4b75f44 2377is_deeply( \@marked, \@expected, "Found correct list of common readings" );
2378
2379=end testing
2380
2381=cut
2382
2383sub calculate_common_readings {
2384 my $self = shift;
2385 my @common;
c1915ab9 2386 map { $_->is_common( 0 ) } $self->readings;
2387 # Implicitly calls calculate_ranks
1dd07bda 2388 my $table = $self->alignment_table;
d4b75f44 2389 foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
7f52eac8 2390 my @row = map { $_->{'tokens'}->[$idx]
2391 ? $_->{'tokens'}->[$idx]->{'t'} : '' }
2392 @{$table->{'alignment'}};
d4b75f44 2393 my %hash;
2394 foreach my $r ( @row ) {
2395 if( $r ) {
2396 $hash{$r->id} = $r unless $r->is_meta;
2397 } else {
2398 $hash{'UNDEF'} = $r;
2399 }
2400 }
2401 if( keys %hash == 1 && !exists $hash{'UNDEF'} ) {
2402 my( $r ) = values %hash;
2403 $r->is_common( 1 );
2404 push( @common, $r );
2405 }
2406 }
2407 return @common;
2408}
2409
861c3e27 2410=head2 text_from_paths
2411
2412Calculate the text array for all witnesses from the path, for later consistency
2413checking. Only to be used if there is no non-graph-based way to know the
2414original texts.
2415
2416=cut
2417
2418sub text_from_paths {
2419 my $self = shift;
2420 foreach my $wit ( $self->tradition->witnesses ) {
5164a6f0 2421 my @readings = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
2422 my @text;
2423 foreach my $r ( @readings ) {
2424 next if $r->is_meta;
2425 push( @text, $r->text );
2426 }
861c3e27 2427 $wit->text( \@text );
2428 if( $wit->is_layered ) {
5164a6f0 2429 my @ucrdgs = $self->reading_sequence( $self->start, $self->end,
2430 $wit->sigil.$self->ac_label );
2431 my @uctext;
2432 foreach my $r ( @ucrdgs ) {
2433 next if $r->is_meta;
2434 push( @uctext, $r->text );
2435 }
2436 $wit->layertext( \@uctext );
861c3e27 2437 }
2438 }
2439}
0e476982 2440
4e5a7b2c 2441=head1 UTILITY FUNCTIONS
2442
2443=head2 common_predecessor( $reading_a, $reading_b )
8e1394aa 2444
4e5a7b2c 2445Find the last reading that occurs in sequence before both the given readings.
414cc046 2446At the very least this should be $self->start.
4e5a7b2c 2447
2448=head2 common_successor( $reading_a, $reading_b )
2449
2450Find the first reading that occurs in sequence after both the given readings.
414cc046 2451At the very least this should be $self->end.
4e5a7b2c 2452
22222af9 2453=begin testing
2454
2455use Text::Tradition;
2456
2457my $cxfile = 't/data/Collatex-16.xml';
2458my $t = Text::Tradition->new(
2459 'name' => 'inline',
2460 'input' => 'CollateX',
2461 'file' => $cxfile,
2462 );
2463my $c = $t->collation;
2464
679f17e1 2465is( $c->common_predecessor( 'n24', 'n23' )->id,
22222af9 2466 'n20', "Found correct common predecessor" );
679f17e1 2467is( $c->common_successor( 'n24', 'n23' )->id,
10e4b1ac 2468 '__END__', "Found correct common successor" );
22222af9 2469
4e5a7b2c 2470is( $c->common_predecessor( 'n19', 'n17' )->id,
22222af9 2471 'n16', "Found correct common predecessor for readings on same path" );
679f17e1 2472is( $c->common_successor( 'n21', 'n10' )->id,
10e4b1ac 2473 '__END__', "Found correct common successor for readings on same path" );
22222af9 2474
2475=end testing
2476
2477=cut
2478
2479## Return the closest reading that is a predecessor of both the given readings.
2480sub common_predecessor {
2481 my $self = shift;
4e5a7b2c 2482 my( $r1, $r2 ) = $self->_objectify_args( @_ );
027d819c 2483 return $self->_common_in_path( $r1, $r2, 'predecessors' );
22222af9 2484}
2485
2486sub common_successor {
2487 my $self = shift;
4e5a7b2c 2488 my( $r1, $r2 ) = $self->_objectify_args( @_ );
027d819c 2489 return $self->_common_in_path( $r1, $r2, 'successors' );
22222af9 2490}
2491
414cc046 2492
2493# TODO think about how to do this without ranks...
027d819c 2494sub _common_in_path {
22222af9 2495 my( $self, $r1, $r2, $dir ) = @_;
414cc046 2496 my $iter = $self->end->rank;
22222af9 2497 my @candidates;
414cc046 2498 my @last_r1 = ( $r1 );
2499 my @last_r2 = ( $r2 );
2500 # my %all_seen = ( $r1 => 'r1', $r2 => 'r2' );
22222af9 2501 my %all_seen;
6771a1b1 2502 # say STDERR "Finding common $dir for $r1, $r2";
22222af9 2503 while( !@candidates ) {
414cc046 2504 last unless $iter--; # Avoid looping infinitely
2505 # Iterate separately down the graph from r1 and r2
2506 my( @new_lc1, @new_lc2 );
2507 foreach my $lc ( @last_r1 ) {
2508 foreach my $p ( $lc->$dir ) {
2509 if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r1' ) {
6771a1b1 2510 # say STDERR "Path candidate $p from $lc";
414cc046 2511 push( @candidates, $p );
002e3600 2512 } elsif( !$all_seen{$p->id} ) {
414cc046 2513 $all_seen{$p->id} = 'r1';
2514 push( @new_lc1, $p );
2515 }
2516 }
2517 }
2518 foreach my $lc ( @last_r2 ) {
22222af9 2519 foreach my $p ( $lc->$dir ) {
414cc046 2520 if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r2' ) {
6771a1b1 2521 # say STDERR "Path candidate $p from $lc";
22222af9 2522 push( @candidates, $p );
002e3600 2523 } elsif( !$all_seen{$p->id} ) {
414cc046 2524 $all_seen{$p->id} = 'r2';
2525 push( @new_lc2, $p );
22222af9 2526 }
2527 }
2528 }
414cc046 2529 @last_r1 = @new_lc1;
2530 @last_r2 = @new_lc2;
22222af9 2531 }
2532 my @answer = sort { $a->rank <=> $b->rank } @candidates;
2533 return $dir eq 'predecessors' ? pop( @answer ) : shift ( @answer );
2534}
2535
63778331 2536sub throw {
2537 Text::Tradition::Error->throw(
2538 'ident' => 'Collation error',
2539 'message' => $_[0],
2540 );
2541}
2542
dd3b58b0 2543no Moose;
2544__PACKAGE__->meta->make_immutable;
e867486f 2545
a445ce40 2546=head1 BUGS/TODO
2547
2548=over
2549
2550=item * Rework XML serialization in a more modular way
2551
2552=back
2553
027d819c 2554=head1 LICENSE
e867486f 2555
027d819c 2556This package is free software and is provided "as is" without express
2557or implied warranty. You can redistribute it and/or modify it under
2558the same terms as Perl itself.
e867486f 2559
027d819c 2560=head1 AUTHOR
e867486f 2561
027d819c 2562Tara L Andrews E<lt>aurum@cpan.orgE<gt>