Check text direction when merging readings
[scpubgit/stemmatology.git] / base / lib / Text / Tradition / Collation.pm
CommitLineData
dd3b58b0 1package Text::Tradition::Collation;
d047cd52 2
6771a1b1 3use feature 'say';
910a0a6d 4use Encode qw( decode_utf8 );
5use File::Temp;
bfcbcecb 6use File::Which;
c9bf3dbf 7use Graph;
8e1394aa 8use IPC::Run qw( run binary );
522c1314 9use JSON qw/ to_json /;
82fa4d57 10use Text::CSV;
5c0072ef 11use Text::Tradition::Collation::Data;
b15511bf 12use Text::Tradition::Collation::Reading;
22222af9 13use Text::Tradition::Collation::RelationshipStore;
63778331 14use Text::Tradition::Error;
cc31ebaa 15use XML::Easy::Syntax qw( $xml10_namestartchar_rx $xml10_namechar_rx );
5c0072ef 16use XML::LibXML;
17use XML::LibXML::XPathContext;
dd3b58b0 18use Moose;
19
5c0072ef 20has _data => (
21 isa => 'Text::Tradition::Collation::Data',
22 is => 'ro',
23 required => 1,
24 handles => [ qw(
25 sequence
26 paths
27 _set_relations
28 relations
29 _set_start
30 _set_end
31 ac_label
32 has_cached_table
33 relationships
0bbea47d 34 relationship_types
5c0072ef 35 related_readings
36 get_relationship
37 del_relationship
38 equivalence
39 equivalence_graph
40 readings
41 reading
42 _add_reading
43 del_reading
44 has_reading
45 wit_list_separator
46 baselabel
47 linear
48 wordsep
d1afb4b5 49 direction
5c0072ef 50 start
51 end
52 cached_table
53 _graphcalc_done
54 has_cached_svg
55 wipe_table
56 )]
57);
dd3b58b0 58
3a2ebbf4 59has 'tradition' => (
60 is => 'ro',
d047cd52 61 isa => 'Text::Tradition',
8cfd99c4 62 writer => '_set_tradition',
8d9a1cd8 63 weak_ref => 1,
d047cd52 64 );
dd3b58b0 65
6b6c7098 66=encoding utf8
67
4e5a7b2c 68=head1 NAME
69
70Text::Tradition::Collation - a software model for a text collation
71
72=head1 SYNOPSIS
73
74 use Text::Tradition;
75 my $t = Text::Tradition->new(
76 'name' => 'this is a text',
77 'input' => 'TEI',
78 'file' => '/path/to/tei_parallel_seg_file.xml' );
79
80 my $c = $t->collation;
81 my @readings = $c->readings;
82 my @paths = $c->paths;
83 my @relationships = $c->relationships;
84
85 my $svg_variant_graph = $t->collation->as_svg();
86
87=head1 DESCRIPTION
88
89Text::Tradition is a library for representation and analysis of collated
90texts, particularly medieval ones. The Collation is the central feature of
91a Tradition, where the text, its sequence of readings, and its relationships
92between readings are actually kept.
93
94=head1 CONSTRUCTOR
95
96=head2 new
97
98The constructor. Takes a hash or hashref of the following arguments:
99
100=over
101
102=item * tradition - The Text::Tradition object to which the collation
103belongs. Required.
104
105=item * linear - Whether the collation should be linear; that is, whether
106transposed readings should be treated as two linked readings rather than one,
107and therefore whether the collation graph is acyclic. Defaults to true.
108
4e5a7b2c 109=item * baselabel - The default label for the path taken by a base text
110(if any). Defaults to 'base text'.
111
112=item * wit_list_separator - The string to join a list of witnesses for
113purposes of making labels in display graphs. Defaults to ', '.
114
115=item * ac_label - The extra label to tack onto a witness sigil when
116representing another layer of path for the given witness - that is, when
117a text has more than one possible reading due to scribal corrections or
118the like. Defaults to ' (a.c.)'.
119
4e483aa5 120=item * wordsep - The string used to separate words in the original text.
121Defaults to ' '.
122
4e5a7b2c 123=back
124
125=head1 ACCESSORS
126
127=head2 tradition
128
129=head2 linear
130
4e5a7b2c 131=head2 wit_list_separator
132
133=head2 baselabel
134
135=head2 ac_label
136
4e483aa5 137=head2 wordsep
138
4e5a7b2c 139Simple accessors for collation attributes.
140
141=head2 start
142
143The meta-reading at the start of every witness path.
144
145=head2 end
146
147The meta-reading at the end of every witness path.
148
149=head2 readings
150
151Returns all Reading objects in the graph.
152
153=head2 reading( $id )
154
155Returns the Reading object corresponding to the given ID.
156
157=head2 add_reading( $reading_args )
158
159Adds a new reading object to the collation.
160See L<Text::Tradition::Collation::Reading> for the available arguments.
161
162=head2 del_reading( $object_or_id )
163
164Removes the given reading from the collation, implicitly removing its
165paths and relationships.
166
4e5a7b2c 167=head2 has_reading( $id )
168
169Predicate to see whether a given reading ID is in the graph.
170
171=head2 reading_witnesses( $object_or_id )
172
173Returns a list of sigils whose witnesses contain the reading.
174
175=head2 paths
176
177Returns all reading paths within the document - that is, all edges in the
178collation graph. Each path is an arrayref of [ $source, $target ] reading IDs.
179
180=head2 add_path( $source, $target, $sigil )
181
182Links the given readings in the collation in sequence, under the given witness
183sigil. The readings may be specified by object or ID.
184
185=head2 del_path( $source, $target, $sigil )
186
187Links the given readings in the collation in sequence, under the given witness
188sigil. The readings may be specified by object or ID.
189
190=head2 has_path( $source, $target );
191
192Returns true if the two readings are linked in sequence in any witness.
193The readings may be specified by object or ID.
194
195=head2 relationships
196
197Returns all Relationship objects in the collation.
198
5fd7be0f 199=head2 add_relationship( $reading, $other_reading, $options, $changed_readings )
4e5a7b2c 200
201Adds a new relationship of the type given in $options between the two readings,
202which may be specified by object or ID. Returns a value of ( $status, @vectors)
203where $status is true on success, and @vectors is a list of relationship edges
5fd7be0f 204that were ultimately added. If an array reference is passed in as $changed_readings,
205then any readings that were altered due to the relationship creation are added to
206the array.
207
4e5a7b2c 208See L<Text::Tradition::Collation::Relationship> for the available options.
209
210=cut
dd3b58b0 211
5c0072ef 212sub BUILDARGS {
213 my ( $class, @args ) = @_;
214 my %args = @args == 1 ? %{ $args[0] } : @args;
215 # TODO determine these from the Moose::Meta object
216 my @delegate_attrs = qw(sequence relations readings wit_list_separator baselabel
d1afb4b5 217 linear wordsep direction start end cached_table _graphcalc_done);
5c0072ef 218 my %data_args;
219 for my $attr (@delegate_attrs) {
220 $data_args{$attr} = delete $args{$attr} if exists $args{$attr};
221 }
222 $args{_data} = Text::Tradition::Collation::Data->new(%data_args);
223 return \%args;
224}
225
d047cd52 226sub BUILD {
3a2ebbf4 227 my $self = shift;
22222af9 228 $self->_set_relations( Text::Tradition::Collation::RelationshipStore->new( 'collation' => $self ) );
e4b73942 229 $self->_set_start( $self->add_reading(
230 { 'collation' => $self, 'is_start' => 1, 'init' => 1 } ) );
231 $self->_set_end( $self->add_reading(
232 { 'collation' => $self, 'is_end' => 1, 'init' => 1 } ) );
d047cd52 233}
784877d9 234
b34fd19d 235=head2 register_relationship_type( %relationship_definition )
236
237Add a relationship type definition to this collation. The argument can be either a
238hash or a hashref, defining the properties of the relationship. For relationship types
239and their properties, see L<Text::Tradition::Collation::RelationshipType>.
240
241=head2 get_relationship_type( $relationship_name )
242
243Retrieve the RelationshipType object for the relationship with the given name.
244
245=cut
246
24efa55d 247sub register_relationship_type {
248 my $self = shift;
249 my %args = @_ == 1 ? %{$_[0]} : @_;
250 if( $self->relations->has_type( $args{name} ) ) {
251 throw( 'Relationship type ' . $args{name} . ' already registered' );
252 }
253 $self->relations->add_type( %args );
254}
255
39d1ddc7 256sub get_relationship_type {
257 my( $self, $name ) = @_;
258 return $self->relations->has_type( $name )
259 ? $self->relations->type( $name ) : undef;
260}
261
3a2ebbf4 262### Reading construct/destruct functions
263
264sub add_reading {
265 my( $self, $reading ) = @_;
266 unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
267 my %args = %$reading;
e4b73942 268 if( $args{'init'} ) {
269 # If we are initializing an empty collation, don't assume that we
270 # have set a tradition.
271 delete $args{'init'};
8943ff68 272 } elsif( $self->tradition->can('language') && $self->tradition->has_language
273 && !exists $args{'language'} ) {
6ad2ce78 274 $args{'language'} = $self->tradition->language;
275 }
3a2ebbf4 276 $reading = Text::Tradition::Collation::Reading->new(
277 'collation' => $self,
278 %args );
279 }
280 # First check to see if a reading with this ID exists.
281 if( $self->reading( $reading->id ) ) {
63778331 282 throw( "Collation already has a reading with id " . $reading->id );
3a2ebbf4 283 }
c1915ab9 284 $self->_graphcalc_done(0);
3a2ebbf4 285 $self->_add_reading( $reading->id => $reading );
286 # Once the reading has been added, put it in both graphs.
287 $self->sequence->add_vertex( $reading->id );
cecbe56d 288 $self->relations->add_reading( $reading->id );
3a2ebbf4 289 return $reading;
eca16057 290};
291
3a2ebbf4 292around del_reading => sub {
293 my $orig = shift;
294 my $self = shift;
295 my $arg = shift;
296
cecbe56d 297 if( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
298 $arg = $arg->id;
3a2ebbf4 299 }
3a2ebbf4 300 # Remove the reading from the graphs.
c1915ab9 301 $self->_graphcalc_done(0);
4e483aa5 302 $self->_clear_cache; # Explicitly clear caches to GC the reading
cecbe56d 303 $self->sequence->delete_vertex( $arg );
304 $self->relations->delete_reading( $arg );
3a2ebbf4 305
306 # Carry on.
cecbe56d 307 $self->$orig( $arg );
3a2ebbf4 308};
7854e12e 309
3c234eb6 310=head2 merge_readings( $main, $second, $concatenate, $with_str )
311
312Merges the $second reading into the $main one. If $concatenate is true, then
313the merged node will carry the text of both readings, concatenated with either
314$with_str (if specified) or a sensible default (the empty string if the
315appropriate 'join_*' flag is set on either reading, or else $self->wordsep.)
316
317The first two arguments may be either readings or reading IDs.
318
4e483aa5 319=begin testing
320
321use Text::Tradition;
58568d5c 322use TryCatch;
4e483aa5 323
324my $cxfile = 't/data/Collatex-16.xml';
325my $t = Text::Tradition->new(
326 'name' => 'inline',
327 'input' => 'CollateX',
328 'file' => $cxfile,
329 );
330my $c = $t->collation;
331
332my $rno = scalar $c->readings;
4ef65ab4 333# Split n21 ('unto') for testing purposes
4e483aa5 334my $new_r = $c->add_reading( { 'id' => 'n21p0', 'text' => 'un', 'join_next' => 1 } );
335my $old_r = $c->reading( 'n21' );
336$old_r->alter_text( 'to' );
337$c->del_path( 'n20', 'n21', 'A' );
338$c->add_path( 'n20', 'n21p0', 'A' );
339$c->add_path( 'n21p0', 'n21', 'A' );
7a0956c1 340$c->add_relationship( 'n21', 'n22', { type => 'collated', scope => 'local' } );
4e483aa5 341$c->flatten_ranks();
342ok( $c->reading( 'n21p0' ), "New reading exists" );
343is( scalar $c->readings, $rno, "Reading add offset by flatten_ranks" );
344
679f17e1 345# Combine n3 and n4 ( with his )
4e483aa5 346$c->merge_readings( 'n3', 'n4', 1 );
347ok( !$c->reading('n4'), "Reading n4 is gone" );
348is( $c->reading('n3')->text, 'with his', "Reading n3 has both words" );
349
679f17e1 350# Collapse n9 and n10 ( rood / root )
351$c->merge_readings( 'n9', 'n10' );
352ok( !$c->reading('n10'), "Reading n10 is gone" );
353is( $c->reading('n9')->text, 'rood', "Reading n9 has an unchanged word" );
4e483aa5 354
58568d5c 355# Try to combine n21 and n21p0. This should break.
4e483aa5 356my $remaining = $c->reading('n21');
357$remaining ||= $c->reading('n22'); # one of these should still exist
58568d5c 358try {
359 $c->merge_readings( 'n21p0', $remaining, 1 );
360 ok( 0, "Bad reading merge changed the graph" );
361} catch( Text::Tradition::Error $e ) {
362 like( $e->message, qr/neither concatenated nor collated/, "Expected exception from bad concatenation" );
363} catch {
364 ok( 0, "Unexpected error on bad reading merge: $@" );
365}
366
367try {
368 $c->calculate_ranks();
369 ok( 1, "Graph is still evidently whole" );
370} catch( Text::Tradition::Error $e ) {
371 ok( 0, "Caught a rank exception: " . $e->message );
372}
4e483aa5 373
374=end testing
375
376=cut
7854e12e 377
3a2ebbf4 378sub merge_readings {
379 my $self = shift;
380
56772e8c 381 # Sanity check
382 my( $kept_obj, $del_obj, $combine, $combine_char ) = $self->_objectify_args( @_ );
383 my $mergemeta = $kept_obj->is_meta;
384 throw( "Cannot merge meta and non-meta reading" )
385 unless ( $mergemeta && $del_obj->is_meta )
386 || ( !$mergemeta && !$del_obj->is_meta );
387 if( $mergemeta ) {
388 throw( "Cannot merge with start or end node" )
389 if( $kept_obj eq $self->start || $kept_obj eq $self->end
390 || $del_obj eq $self->start || $del_obj eq $self->end );
a445ce40 391 throw( "Cannot combine text of meta readings" ) if $combine;
56772e8c 392 }
e4f00901 393 # We can only merge readings in a linear graph if:
394 # - they are contiguous with only one edge between them, OR
395 # - they are at equivalent ranks in the graph.
396 if( $self->linear ) {
397 my @delpred = $del_obj->predecessors;
398 my @keptsuc = $kept_obj->successors;
399 unless ( @delpred == 1 && $delpred[0] eq $kept_obj
400 && @keptsuc == 1 && $keptsuc[0] eq $del_obj ) {
401 my( $is_ok, $msg ) = $self->relations->relationship_valid(
402 $kept_obj, $del_obj, 'collated' );
403 unless( $is_ok ) {
404 throw( "Readings $kept_obj and $del_obj can be neither concatenated nor collated" );
405 }
406 }
407 }
408
3a2ebbf4 409 # We only need the IDs for adding paths to the graph, not the reading
410 # objects themselves.
56772e8c 411 my $kept = $kept_obj->id;
412 my $deleted = $del_obj->id;
c1915ab9 413 $self->_graphcalc_done(0);
10943ab0 414
3a2ebbf4 415 # The kept reading should inherit the paths and the relationships
416 # of the deleted reading.
417 foreach my $path ( $self->sequence->edges_at( $deleted ) ) {
418 my @vector = ( $kept );
419 push( @vector, $path->[1] ) if $path->[0] eq $deleted;
420 unshift( @vector, $path->[0] ) if $path->[1] eq $deleted;
49d4f2ac 421 next if $vector[0] eq $vector[1]; # Don't add a self loop
3a2ebbf4 422 my %wits = %{$self->sequence->get_edge_attributes( @$path )};
423 $self->sequence->add_edge( @vector );
424 my $fwits = $self->sequence->get_edge_attributes( @vector );
425 @wits{keys %$fwits} = values %$fwits;
426 $self->sequence->set_edge_attributes( @vector, \%wits );
427 }
cecbe56d 428 $self->relations->merge_readings( $kept, $deleted, $combine );
3a2ebbf4 429
430 # Do the deletion deed.
4e483aa5 431 if( $combine ) {
869a1ada 432 # Combine the text of the readings
4e483aa5 433 my $joinstr = $combine_char;
434 unless( defined $joinstr ) {
435 $joinstr = '' if $kept_obj->join_next || $del_obj->join_prior;
436 $joinstr = $self->wordsep unless defined $joinstr;
437 }
a445ce40 438 $kept_obj->_combine( $del_obj, $joinstr );
49d4f2ac 439 }
3a2ebbf4 440 $self->del_reading( $deleted );
441}
7854e12e 442
8d9494a8 443=head2 merge_related( @relationship_types )
444
445Merge all readings linked with the relationship types given. If any of the selected type(s) is not a colocation, the graph will no longer be linear. The majority/plurality reading in each case will be the one kept.
446
447WARNING: This operation cannot be undone.
448
449=cut
450
451=begin testing
452
58568d5c 453use Test::Warn;
8d9494a8 454use Text::Tradition;
455use TryCatch;
456
58568d5c 457my $t;
458warnings_exist {
459 $t = Text::Tradition->new( 'input' => 'Self', 'file' => 't/data/legendfrag.xml' );
460} [qr/Cannot set relationship on a meta reading/],
461 "Got expected relationship drop warning on parse";
462
8d9494a8 463my $c = $t->collation;
a512d0e6 464# Force the transitive propagation of all existing relationships.
465$c->relations->propagate_all_relationships();
8d9494a8 466
467my %rdg_ids;
468map { $rdg_ids{$_} = 1 } $c->readings;
469$c->merge_related( 'orthographic' );
751ee528 470is( scalar( $c->readings ), keys( %rdg_ids ) - 9,
8d9494a8 471 "Successfully collapsed orthographic variation" );
751ee528 472map { $rdg_ids{$_} = undef } qw/ r13.3 r11.4 r8.5 r8.2 r7.7 r7.5 r7.4 r7.3 r7.1 /;
8d9494a8 473foreach my $rid ( keys %rdg_ids ) {
474 my $exp = $rdg_ids{$rid};
475 is( !$c->reading( $rid ), !$exp, "Reading $rid correctly " .
476 ( $exp ? "retained" : "removed" ) );
477}
478ok( $c->linear, "Graph is still linear" );
479try {
480 $c->calculate_ranks; # This should succeed
481 ok( 1, "Can still calculate ranks on the new graph" );
482} catch {
483 ok( 0, "Rank calculation on merged graph failed: $@" );
484}
485
486# Now add some transpositions
487$c->add_relationship( 'r8.4', 'r10.4', { type => 'transposition' } );
488$c->merge_related( 'transposition' );
751ee528 489is( scalar( $c->readings ), keys( %rdg_ids ) - 10,
8d9494a8 490 "Transposed relationship is merged away" );
491ok( !$c->reading('r8.4'), "Correct transposed reading removed" );
492ok( !$c->linear, "Graph is no longer linear" );
493try {
494 $c->calculate_ranks; # This should fail
495 ok( 0, "Rank calculation happened on nonlinear graph?!" );
496} catch ( Text::Tradition::Error $e ) {
497 is( $e->message, 'Cannot calculate ranks on a non-linear graph',
498 "Rank calculation on merged graph threw an error" );
499}
500
8d9494a8 501=end testing
502
503=cut
504
505# TODO: there should be a way to display merged without affecting the underlying data!
506
507sub merge_related {
508 my $self = shift;
509 my %reltypehash;
510 map { $reltypehash{$_} = 1 } @_;
511
512 # Set up the filter for finding related readings
513 my $filter = sub {
514 exists $reltypehash{$_[0]->type};
515 };
516
8d9494a8 517 # Go through all readings looking for related ones
518 foreach my $r ( $self->readings ) {
519 next unless $self->reading( "$r" ); # might have been deleted meanwhile
751ee528 520 while( my @related = $self->related_readings( $r, $filter ) ) {
8d9494a8 521 push( @related, $r );
522 @related = sort {
523 scalar $b->witnesses <=> scalar $a->witnesses
524 } @related;
525 my $keep = shift @related;
526 foreach my $delr ( @related ) {
58568d5c 527 $self->linear( 0 )
8d9494a8 528 unless( $self->get_relationship( $keep, $delr )->colocated );
529 $self->merge_readings( $keep, $delr );
530 }
531 }
532 }
8d9494a8 533}
534
6771a1b1 535=head2 compress_readings
536
537Where possible in the graph, compresses plain sequences of readings into a
538single reading. The sequences must consist of readings with no
539relationships to other readings, with only a single witness path between
540them and no other witness paths from either that would skip the other. The
541readings must also not be marked as nonsense or bad grammar.
542
543WARNING: This operation cannot be undone.
544
c67190dd 545=begin testing
546
547use Text::Tradition;
548
2e818703 549my $t = Text::Tradition->new( input => 'CollateX', file => 't/data/Collatex-16.xml' );
c67190dd 550my $c = $t->collation;
551my $n = scalar $c->readings;
552$c->compress_readings();
553is( scalar $c->readings, $n - 6, "Compressing readings seems to work" );
554
555# Now put in a join-word and make sure the thing still works.
2e818703 556my $t2 = Text::Tradition->new( input => 'CollateX', file => 't/data/Collatex-16.xml' );
c67190dd 557my $c2 = $t2->collation;
558# Split n21 ('unto') for testing purposes
559my $new_r = $c2->add_reading( { 'id' => 'n21p0', 'text' => 'un', 'join_next' => 1 } );
560my $old_r = $c2->reading( 'n21' );
561$old_r->alter_text( 'to' );
562$c2->del_path( 'n20', 'n21', 'A' );
563$c2->add_path( 'n20', 'n21p0', 'A' );
564$c2->add_path( 'n21p0', 'n21', 'A' );
565$c2->calculate_ranks();
566is( scalar $c2->readings, $n + 1, "We have our extra test reading" );
567$c2->compress_readings();
568is( scalar $c2->readings, $n - 6, "Compressing readings also works with join_next" );
569is( $c2->reading( 'n21p0' )->text, 'unto', "The joined word has no space" );
570
571
572=end testing
573
6771a1b1 574=cut
575
576sub compress_readings {
577 my $self = shift;
f5e442f9 578 # Sanity check: first save the original text of each witness.
579 my %origtext;
580 foreach my $wit ( $self->tradition->witnesses ) {
581 $origtext{$wit->sigil} = $self->path_text( $wit->sigil );
582 if( $wit->is_layered ) {
583 my $acsig = $wit->sigil . $self->ac_label;
584 $origtext{$acsig} = $self->path_text( $acsig );
585 }
586 }
587
588 # Now do the deed.
6771a1b1 589 # Anywhere in the graph that there is a reading that joins only to a single
590 # successor, and neither of these have any relationships, just join the two
591 # readings.
6771a1b1 592 foreach my $rdg ( sort { $a->rank <=> $b->rank } $self->readings ) {
428bcf0b 593 # Now look for readings that can be joined to their successors.
a445ce40 594 next unless $rdg->is_combinable;
6771a1b1 595 my %seen;
596 while( $self->sequence->successors( $rdg ) == 1 ) {
597 my( $next ) = $self->reading( $self->sequence->successors( $rdg ) );
598 throw( "Infinite loop" ) if $seen{$next->id};
599 $seen{$next->id} = 1;
600 last if $self->sequence->predecessors( $next ) > 1;
a445ce40 601 last unless $next->is_combinable;
6771a1b1 602 say "Joining readings $rdg and $next";
603 $self->merge_readings( $rdg, $next, 1 );
604 }
605 }
f5e442f9 606
607 # Finally, make sure we haven't screwed anything up.
6771a1b1 608 foreach my $wit ( $self->tradition->witnesses ) {
609 my $pathtext = $self->path_text( $wit->sigil );
6771a1b1 610 throw( "Text differs for witness " . $wit->sigil )
f5e442f9 611 unless $pathtext eq $origtext{$wit->sigil};
6771a1b1 612 if( $wit->is_layered ) {
f5e442f9 613 my $acsig = $wit->sigil . $self->ac_label;
614 $pathtext = $self->path_text( $acsig );
615 throw( "Layered text differs for witness " . $wit->sigil )
616 unless $pathtext eq $origtext{$acsig};
6771a1b1 617 }
618 }
619
620 $self->relations->rebuild_equivalence();
621 $self->calculate_ranks();
622}
3265b0ce 623
3a2ebbf4 624# Helper function for manipulating the graph.
625sub _stringify_args {
4e483aa5 626 my( $self, $first, $second, @args ) = @_;
3a2ebbf4 627 $first = $first->id
628 if ref( $first ) eq 'Text::Tradition::Collation::Reading';
629 $second = $second->id
630 if ref( $second ) eq 'Text::Tradition::Collation::Reading';
4e483aa5 631 return( $first, $second, @args );
3a2ebbf4 632}
df6d9812 633
4e5a7b2c 634# Helper function for manipulating the graph.
635sub _objectify_args {
636 my( $self, $first, $second, $arg ) = @_;
637 $first = $self->reading( $first )
638 unless ref( $first ) eq 'Text::Tradition::Collation::Reading';
639 $second = $self->reading( $second )
640 unless ref( $second ) eq 'Text::Tradition::Collation::Reading';
641 return( $first, $second, $arg );
642}
f97ef19e 643
644=head2 duplicate_reading( $reading, @witlist )
645
646Split the given reading into two, so that the new reading is in the path for
ef73c20a 647the witnesses given in @witlist. If the result is that certain non-colocated
648relationships (e.g. transpositions) are no longer valid, these will be removed.
649Returns the newly-created reading.
f97ef19e 650
651=begin testing
652
68e48c06 653use Test::More::UTF8;
f97ef19e 654use Text::Tradition;
68e48c06 655use TryCatch;
f97ef19e 656
657my $st = Text::Tradition->new( 'input' => 'Self', 'file' => 't/data/collatecorr.xml' );
658is( ref( $st ), 'Text::Tradition', "Got a tradition from test file" );
659ok( $st->has_witness('Ba96'), "Tradition has the affected witness" );
660
661my $sc = $st->collation;
662my $numr = 17;
663ok( $sc->reading('n131'), "Tradition has the affected reading" );
664is( scalar( $sc->readings ), $numr, "There are $numr readings in the graph" );
665is( $sc->end->rank, 14, "There are fourteen ranks in the graph" );
666
667# Detach the erroneously collated reading
2dcb5d11 668my( $newr, @del_rdgs ) = $sc->duplicate_reading( 'n131', 'Ba96' );
ef73c20a 669ok( $newr, "New reading was created" );
f97ef19e 670ok( $sc->reading('n131_0'), "Detached the bad collation with a new reading" );
671is( scalar( $sc->readings ), $numr + 1, "A reading was added to the graph" );
672is( $sc->end->rank, 10, "There are now only ten ranks in the graph" );
3c234eb6 673my $csucc = $sc->common_successor( 'n131', 'n131_0' );
674is( $csucc->id, 'n136', "Found correct common successor to duped reading" );
f97ef19e 675
676# Check that the bad transposition is gone
2dcb5d11 677is( scalar @del_rdgs, 1, "Deleted reading was returned by API call" );
f97ef19e 678is( $sc->get_relationship( 'n130', 'n135' ), undef, "Bad transposition relationship is gone" );
679
e19635f8 680# The collation should not be fixed
681my @pairs = $sc->identical_readings();
682is( scalar @pairs, 0, "Not re-collated yet" );
f97ef19e 683# Fix the collation
3c234eb6 684ok( $sc->merge_readings( 'n124', 'n131_0' ), "Collated the readings correctly" );
e19635f8 685@pairs = $sc->identical_readings( start => 'n124', end => $csucc->id );
3c234eb6 686is( scalar @pairs, 3, "Found three more identical readings" );
e19635f8 687is( $sc->end->rank, 11, "The ranks shifted appropriately" );
3c234eb6 688$sc->flatten_ranks();
f97ef19e 689is( scalar( $sc->readings ), $numr - 3, "Now we are collated correctly" );
690
68e48c06 691# Check that we can't "duplicate" a reading with no wits or with all wits
692try {
693 my( $badr, @del_rdgs ) = $sc->duplicate_reading( 'n124' );
694 ok( 0, "Reading duplication without witnesses throws an error" );
695} catch( Text::Tradition::Error $e ) {
696 like( $e->message, qr/Must specify one or more witnesses/,
697 "Reading duplication without witnesses throws the expected error" );
698} catch {
699 ok( 0, "Reading duplication without witnesses threw the wrong error" );
700}
701
702try {
703 my( $badr, @del_rdgs ) = $sc->duplicate_reading( 'n124', 'Ba96', 'Mü11475' );
704 ok( 0, "Reading duplication with all witnesses throws an error" );
705} catch( Text::Tradition::Error $e ) {
706 like( $e->message, qr/Cannot join all witnesses/,
707 "Reading duplication with all witnesses throws the expected error" );
708} catch {
709 ok( 0, "Reading duplication with all witnesses threw the wrong error" );
710}
711
58568d5c 712try {
713 $sc->calculate_ranks();
714 ok( 1, "Graph is still evidently whole" );
715} catch( Text::Tradition::Error $e ) {
716 ok( 0, "Caught a rank exception: " . $e->message );
717}
718
f97ef19e 719=end testing
720
721=cut
722
723sub duplicate_reading {
724 my( $self, $r, @wits ) = @_;
68e48c06 725 # Check that we are not doing anything unwise.
726 throw( "Must specify one or more witnesses for the duplicated reading" )
727 unless @wits;
f97ef19e 728 unless( ref( $r ) eq 'Text::Tradition::Collation::Reading' ) {
729 $r = $self->reading( $r );
730 }
731 throw( "Cannot duplicate a meta-reading" )
732 if $r->is_meta;
68e48c06 733 throw( "Cannot join all witnesses to the new reading" )
47cd4d16 734 if scalar( @wits ) == scalar( $r->witnesses );
68e48c06 735
f97ef19e 736 # Get all the reading attributes and duplicate them.
737 my $rmeta = Text::Tradition::Collation::Reading->meta;
738 my %args;
739 foreach my $attr( $rmeta->get_all_attributes ) {
740 next if $attr->name =~ /^_/;
741 my $acc = $attr->get_read_method;
742 if( !$acc && $attr->has_applied_traits ) {
743 my $tr = $attr->applied_traits;
744 if( $tr->[0] =~ /::(Array|Hash)$/ ) {
745 my $which = $1;
746 my %methods = reverse %{$attr->handles};
747 $acc = $methods{elements};
748 $args{$attr->name} = $which eq 'Array'
749 ? [ $r->$acc ] : { $r->$acc };
750 }
9611c649 751 } elsif( $acc ) {
752 my $attrval = $r->$acc;
753 if( defined $attrval ) {
754 $args{$attr->name} = $attrval;
755 }
f97ef19e 756 }
757 }
758 # By definition the new reading will no longer be common.
759 $args{is_common} = 0;
760 # The new reading also needs its own ID.
761 $args{id} = $self->_generate_dup_id( $r->id );
762
763 # Try to make the new reading.
764 my $newr = $self->add_reading( \%args );
765 # The old reading is also no longer common.
766 $r->is_common( 0 );
767
768 # For each of the witnesses, dissociate from the old reading and
769 # associate with the new.
770 foreach my $wit ( @wits ) {
771 my $prior = $self->prior_reading( $r, $wit );
772 my $next = $self->next_reading( $r, $wit );
773 $self->del_path( $prior, $r, $wit );
774 $self->add_path( $prior, $newr, $wit );
775 $self->del_path( $r, $next, $wit );
776 $self->add_path( $newr, $next, $wit );
777 }
778
3c234eb6 779 # If the graph is ranked, we need to look for relationships that are now
780 # invalid (i.e. 'non-colocation' types that might now be colocated) and
781 # remove them. If not, we can skip it.
f97ef19e 782 my $succ;
3c234eb6 783 my %rrk;
2dcb5d11 784 my @deleted_relations;
f97ef19e 785 if( $self->end->has_rank ) {
3c234eb6 786 # Find the point where we can stop checking
f97ef19e 787 $succ = $self->common_successor( $r, $newr );
3c234eb6 788
789 # Hash the existing ranks
f97ef19e 790 foreach my $rdg ( $self->readings ) {
791 $rrk{$rdg->id} = $rdg->rank;
792 }
3c234eb6 793 # Calculate the new ranks
794 $self->calculate_ranks();
f97ef19e 795
3c234eb6 796 # Check for invalid non-colocated relationships among changed-rank readings
797 # from where the ranks start changing up to $succ
f97ef19e 798 my $lastrank = $succ->rank;
799 foreach my $rdg ( $self->readings ) {
800 next if $rdg->rank > $lastrank;
801 next if $rdg->rank == $rrk{$rdg->id};
802 my @noncolo = $rdg->related_readings( sub { !$_[0]->colocated } );
803 next unless @noncolo;
804 foreach my $nc ( @noncolo ) {
2dcb5d11 805 unless( $self->relations->verify_or_delete( $rdg, $nc ) ) {
f6a1d5f0 806 push( @deleted_relations, [ $rdg->id, $nc->id ] );
2dcb5d11 807 }
f97ef19e 808 }
809 }
ef73c20a 810 }
2dcb5d11 811 return ( $newr, @deleted_relations );
f97ef19e 812}
813
814sub _generate_dup_id {
815 my( $self, $rid ) = @_;
816 my $newid;
817 my $i = 0;
818 while( !$newid ) {
819 $newid = $rid."_$i";
820 if( $self->has_reading( $newid ) ) {
821 $newid = '';
822 $i++;
823 }
824 }
825 return $newid;
826}
827
3a2ebbf4 828### Path logic
829
830sub add_path {
831 my $self = shift;
832
833 # We only need the IDs for adding paths to the graph, not the reading
834 # objects themselves.
cecbe56d 835 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
3a2ebbf4 836
c1915ab9 837 $self->_graphcalc_done(0);
3a2ebbf4 838 # Connect the readings
359944f7 839 unless( $self->sequence->has_edge( $source, $target ) ) {
840 $self->sequence->add_edge( $source, $target );
841 $self->relations->add_equivalence_edge( $source, $target );
842 }
3a2ebbf4 843 # Note the witness in question
844 $self->sequence->set_edge_attribute( $source, $target, $wit, 1 );
359944f7 845}
b15511bf 846
3a2ebbf4 847sub del_path {
848 my $self = shift;
49d4f2ac 849 my @args;
850 if( ref( $_[0] ) eq 'ARRAY' ) {
851 my $e = shift @_;
852 @args = ( @$e, @_ );
853 } else {
854 @args = @_;
855 }
3a2ebbf4 856
f97ef19e 857 # We only need the IDs for removing paths from the graph, not the reading
3a2ebbf4 858 # objects themselves.
49d4f2ac 859 my( $source, $target, $wit ) = $self->_stringify_args( @args );
3a2ebbf4 860
c1915ab9 861 $self->_graphcalc_done(0);
3a2ebbf4 862 if( $self->sequence->has_edge_attribute( $source, $target, $wit ) ) {
49d4f2ac 863 $self->sequence->delete_edge_attribute( $source, $target, $wit );
3a2ebbf4 864 }
30897024 865 unless( $self->sequence->has_edge_attributes( $source, $target ) ) {
3a2ebbf4 866 $self->sequence->delete_edge( $source, $target );
359944f7 867 $self->relations->delete_equivalence_edge( $source, $target );
3a2ebbf4 868 }
784877d9 869}
870
3a2ebbf4 871
15d2d3df 872# Extra graph-alike utility
873sub has_path {
3a2ebbf4 874 my $self = shift;
875 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
876 return undef unless $self->sequence->has_edge( $source, $target );
877 return $self->sequence->has_edge_attribute( $source, $target, $wit );
b15511bf 878}
879
4e5a7b2c 880=head2 clear_witness( @sigil_list )
3a2ebbf4 881
4e5a7b2c 882Clear the given witnesses out of the collation entirely, removing references
883to them in paths, and removing readings that belong only to them. Should only
884be called via $tradition->del_witness.
3a2ebbf4 885
886=cut
887
4e5a7b2c 888sub clear_witness {
889 my( $self, @sigils ) = @_;
890
c1915ab9 891 $self->_graphcalc_done(0);
4e5a7b2c 892 # Clear the witness(es) out of the paths
893 foreach my $e ( $self->paths ) {
894 foreach my $sig ( @sigils ) {
895 $self->del_path( $e, $sig );
896 }
897 }
898
899 # Clear out the newly unused readings
900 foreach my $r ( $self->readings ) {
901 unless( $self->reading_witnesses( $r ) ) {
902 $self->del_reading( $r );
903 }
904 }
905}
3a2ebbf4 906
907sub add_relationship {
908 my $self = shift;
5fd7be0f 909 my( $source, $target, $opts, $altered_readings ) = $self->_stringify_args( @_ );
414cc046 910 my( @vectors ) = $self->relations->add_relationship( $source, $target, $opts );
45095bee 911 my $did_recalc;
864ee4bf 912 foreach my $v ( @vectors ) {
45095bee 913 my $rel = $self->get_relationship( $v );
914 next unless $rel->colocated;
915 my $r1 = $self->reading( $v->[0] );
916 my $r2 = $self->reading( $v->[1] );
917 # If it's a spelling or orthographic relationship, and one is marked
918 # as a lemma, set the normal form on the non-lemma to reflect that.
919 if( $r1->does( 'Text::Tradition::Morphology' ) ) {
5fd7be0f 920 my @changed = $r1->relationship_added( $r2, $rel );
921 if( ref( $altered_readings ) eq 'ARRAY' ) {
922 push( @$altered_readings, @changed );
923 }
45095bee 924 }
925 next if $did_recalc;
926 if( $r1->has_rank && $r2->has_rank && $r1->rank ne $r2->rank ) {
864ee4bf 927 $self->_graphcalc_done(0);
928 $self->_clear_cache;
45095bee 929 $did_recalc = 1;
864ee4bf 930 }
931 }
63778331 932 return @vectors;
22222af9 933}
ef9d481f 934
ca6e6095 935around qw/ get_relationship del_relationship / => sub {
936 my $orig = shift;
937 my $self = shift;
938 my @args = @_;
939 if( @args == 1 && ref( $args[0] ) eq 'ARRAY' ) {
940 @args = @{$_[0]};
941 }
e19635f8 942 my @stringargs = $self->_stringify_args( @args );
943 $self->$orig( @stringargs );
ca6e6095 944};
945
22222af9 946=head2 reading_witnesses( $reading )
910a0a6d 947
22222af9 948Return a list of sigils corresponding to the witnesses in which the reading appears.
3265b0ce 949
22222af9 950=cut
1d310495 951
1d310495 952sub reading_witnesses {
953 my( $self, $reading ) = @_;
954 # We need only check either the incoming or the outgoing edges; I have
96dc90ec 955 # arbitrarily chosen "incoming". Thus, special-case the start node.
956 if( $reading eq $self->start ) {
bed6ce83 957 return map { $_->sigil } grep { $_->is_collated } $self->tradition->witnesses;
96dc90ec 958 }
1d310495 959 my %all_witnesses;
960 foreach my $e ( $self->sequence->edges_to( $reading ) ) {
961 my $wits = $self->sequence->get_edge_attributes( @$e );
962 @all_witnesses{ keys %$wits } = 1;
963 }
c12bb878 964 my $acstr = $self->ac_label;
965 foreach my $acwit ( grep { $_ =~ s/^(.*)\Q$acstr\E$/$1/ } keys %all_witnesses ) {
966 delete $all_witnesses{$acwit.$acstr} if exists $all_witnesses{$acwit};
967 }
1d310495 968 return keys %all_witnesses;
910a0a6d 969}
970
4e5a7b2c 971=head1 OUTPUT METHODS
8e1394aa 972
0ecb975c 973=head2 as_svg( \%options )
8e1394aa 974
0068967c 975Returns an SVG string that represents the graph, via as_dot and graphviz.
bfcbcecb 976See as_dot for a list of options. Must have GraphViz (dot) installed to run.
8e1394aa 977
497ca664 978=begin testing
979
980use File::Which;
981use Text::Tradition;
982use XML::LibXML;
983use XML::LibXML::XPathContext;
984
985
986SKIP: {
987 skip( 'Need Graphviz installed to test graphs', 16 )
988 unless File::Which::which( 'dot' );
989
990 my $datafile = 't/data/Collatex-16.xml';
991
992 my $tradition = Text::Tradition->new(
993 'name' => 'inline',
994 'input' => 'CollateX',
995 'file' => $datafile,
996 );
997 my $collation = $tradition->collation;
998
999 # Test the svg creation
1000 my $parser = XML::LibXML->new();
1001 $parser->load_ext_dtd( 0 );
1002 my $svg = $parser->parse_string( $collation->as_svg() );
1003 is( $svg->documentElement->nodeName(), 'svg', 'Got an svg document' );
1004
1005 # Test for the correct number of nodes in the SVG
1006 my $svg_xpc = XML::LibXML::XPathContext->new( $svg->documentElement() );
1007 $svg_xpc->registerNs( 'svg', 'http://www.w3.org/2000/svg' );
1008 my @svg_nodes = $svg_xpc->findnodes( '//svg:g[@class="node"]' );
1009 is( scalar @svg_nodes, 26, "Correct number of nodes in the graph" );
1010
1011 # Test for the correct number of edges
1012 my @svg_edges = $svg_xpc->findnodes( '//svg:g[@class="edge"]' );
1013 is( scalar @svg_edges, 32, "Correct number of edges in the graph" );
1014
1015 # Test svg creation for a subgraph
1016 my $part_svg = $parser->parse_string( $collation->as_svg( { from => 15 } ) ); # start, no end
1017 is( $part_svg->documentElement->nodeName(), 'svg', "Got an svg subgraph to end" );
1018 my $part_xpc = XML::LibXML::XPathContext->new( $part_svg->documentElement() );
1019 $part_xpc->registerNs( 'svg', 'http://www.w3.org/2000/svg' );
1020 @svg_nodes = $part_xpc->findnodes( '//svg:g[@class="node"]' );
1021 is( scalar( @svg_nodes ), 9,
1022 "Correct number of nodes in the subgraph" );
1023 @svg_edges = $part_xpc->findnodes( '//svg:g[@class="edge"]' );
1024 is( scalar( @svg_edges ), 10,
1025 "Correct number of edges in the subgraph" );
1026
1027 $part_svg = $parser->parse_string( $collation->as_svg( { from => 10, to => 13 } ) ); # start, no end
1028 is( $part_svg->documentElement->nodeName(), 'svg', "Got an svg subgraph in the middle" );
1029 $part_xpc = XML::LibXML::XPathContext->new( $part_svg->documentElement() );
1030 $part_xpc->registerNs( 'svg', 'http://www.w3.org/2000/svg' );
1031 @svg_nodes = $part_xpc->findnodes( '//svg:g[@class="node"]' );
1032 is( scalar( @svg_nodes ), 9,
1033 "Correct number of nodes in the subgraph" );
1034 @svg_edges = $part_xpc->findnodes( '//svg:g[@class="edge"]' );
1035 is( scalar( @svg_edges ), 11,
1036 "Correct number of edges in the subgraph" );
1037
1038
1039 $part_svg = $parser->parse_string( $collation->as_svg( { to => 5 } ) ); # start, no end
1040 is( $part_svg->documentElement->nodeName(), 'svg', "Got an svg subgraph from start" );
1041 $part_xpc = XML::LibXML::XPathContext->new( $part_svg->documentElement() );
1042 $part_xpc->registerNs( 'svg', 'http://www.w3.org/2000/svg' );
1043 @svg_nodes = $part_xpc->findnodes( '//svg:g[@class="node"]' );
1044 is( scalar( @svg_nodes ), 7,
1045 "Correct number of nodes in the subgraph" );
1046 @svg_edges = $part_xpc->findnodes( '//svg:g[@class="edge"]' );
1047 is( scalar( @svg_edges ), 7,
1048 "Correct number of edges in the subgraph" );
1049
1050 # Test a right-to-left graph
1051 my $arabic = Text::Tradition->new(
1052 input => 'Tabular',
1053 sep_char => ',',
1054 name => 'arabic',
1055 direction => 'RL',
1056 file => 't/data/arabic_snippet.csv' );
1057 my $rl_svg = $parser->parse_string( $arabic->collation->as_svg() );
1058 is( $rl_svg->documentElement->nodeName(), 'svg', "Got an svg subgraph from start" );
1059 my $rl_xpc = XML::LibXML::XPathContext->new( $rl_svg->documentElement() );
1060 $rl_xpc->registerNs( 'svg', 'http://www.w3.org/2000/svg' );
1061 my %node_cx;
1062 foreach my $node ( $rl_xpc->findnodes( '//svg:g[@class="node"]' ) ) {
1063 my $nid = $node->getAttribute('id');
1064 $node_cx{$nid} = $rl_xpc->findvalue( './svg:ellipse/@cx', $node );
1065 }
1066 my @sorted = sort { $node_cx{$a} <=> $node_cx{$b} } keys( %node_cx );
1067 is( $sorted[0], '__END__', "End node is the leftmost" );
1068 is( $sorted[$#sorted], '__START__', "Start node is the rightmost" );
1069
1070} #SKIP
1071
1072=end testing
1073
8e1394aa 1074=cut
1075
1076sub as_svg {
0ecb975c 1077 my( $self, $opts ) = @_;
bfcbcecb 1078 throw( "Need GraphViz installed to output SVG" )
1079 unless File::Which::which( 'dot' );
e247aad1 1080 my $want_subgraph = exists $opts->{'from'} || exists $opts->{'to'};
1ff82d4f 1081 $self->calculate_ranks()
1082 unless( $self->_graphcalc_done || $opts->{'nocalc'} || !$self->linear );
be3af600 1083 my @cmd = qw/dot -Tsvg/;
1084 my( $svg, $err );
1085 my $dotfile = File::Temp->new();
1086 ## USE FOR DEBUGGING
1087 # $dotfile->unlink_on_destroy(0);
1088 binmode $dotfile, ':utf8';
1089 print $dotfile $self->as_dot( $opts );
1090 push( @cmd, $dotfile->filename );
1091 run( \@cmd, ">", binary(), \$svg );
1092 $svg = decode_utf8( $svg );
1093 return $svg;
8e1394aa 1094}
1095
b22576c6 1096
0ecb975c 1097=head2 as_dot( \%options )
b22576c6 1098
0ecb975c 1099Returns a string that is the collation graph expressed in dot
1100(i.e. GraphViz) format. Options include:
b22576c6 1101
0ecb975c 1102=over 4
b22576c6 1103
0ecb975c 1104=item * from
b22576c6 1105
0ecb975c 1106=item * to
df6d9812 1107
0ecb975c 1108=item * color_common
1109
1110=back
df6d9812 1111
1112=cut
1113
1114sub as_dot {
0ecb975c 1115 my( $self, $opts ) = @_;
1116 my $startrank = $opts->{'from'} if $opts;
1117 my $endrank = $opts->{'to'} if $opts;
1118 my $color_common = $opts->{'color_common'} if $opts;
b365fbae 1119 my $STRAIGHTENHACK = !$startrank && !$endrank && $self->end->rank
1120 && $self->end->rank > 100;
6648ee3d 1121 $STRAIGHTENHACK = 1 if $opts->{'straight'}; # even for subgraphs or small graphs
b365fbae 1122
b22576c6 1123 # Check the arguments
1124 if( $startrank ) {
1125 return if $endrank && $startrank > $endrank;
1126 return if $startrank > $self->end->rank;
1127 }
1128 if( defined $endrank ) {
1129 return if $endrank < 0;
f1b3b33a 1130 $endrank = undef if $endrank == $self->end->rank;
b22576c6 1131 }
1132
67da8d6c 1133 my $graph_name = $self->tradition->name;
1134 $graph_name =~ s/[^\w\s]//g;
1135 $graph_name = join( '_', split( /\s+/, $graph_name ) );
f13b5582 1136
1137 my %graph_attrs = (
f13b5582 1138 'bgcolor' => 'none',
1139 );
d1afb4b5 1140 unless( $self->direction eq 'BI' ) {
1141 $graph_attrs{rankdir} = $self->direction;
1142 }
f13b5582 1143 my %node_attrs = (
b8990398 1144 'fontsize' => 14,
f13b5582 1145 'fillcolor' => 'white',
1146 'style' => 'filled',
1147 'shape' => 'ellipse'
1148 );
1149 my %edge_attrs = (
1150 'arrowhead' => 'open',
1151 'color' => '#000000',
1152 'fontcolor' => '#000000',
1153 );
1154
67da8d6c 1155 my $dot = sprintf( "digraph %s {\n", $graph_name );
f13b5582 1156 $dot .= "\tgraph " . _dot_attr_string( \%graph_attrs ) . ";\n";
1157 $dot .= "\tnode " . _dot_attr_string( \%node_attrs ) . ";\n";
df6d9812 1158
b22576c6 1159 # Output substitute start/end readings if necessary
1160 if( $startrank ) {
43c94341 1161 $dot .= "\t\"__SUBSTART__\" [ label=\"...\",id=\"__START__\" ];\n";
b22576c6 1162 }
1163 if( $endrank ) {
43c94341 1164 $dot .= "\t\"__SUBEND__\" [ label=\"...\",id=\"__END__\" ];\n";
b22576c6 1165 }
b365fbae 1166 if( $STRAIGHTENHACK ) {
1167 ## HACK part 1
43c94341 1168 my $startlabel = $startrank ? '__SUBSTART__' : '__START__';
1169 $dot .= "\tsubgraph { rank=same \"$startlabel\" \"#SILENT#\" }\n";
b365fbae 1170 $dot .= "\t\"#SILENT#\" [ shape=diamond,color=white,penwidth=0,label=\"\" ];"
1171 }
b22576c6 1172 my %used; # Keep track of the readings that actually appear in the graph
30ddc24c 1173 # Sort the readings by rank if we have ranks; this speeds layout.
1174 my @all_readings = $self->end->has_rank
1175 ? sort { $a->rank <=> $b->rank } $self->readings
1176 : $self->readings;
4633f9e4 1177 # TODO Refrain from outputting lacuna nodes - just grey out the edges.
30ddc24c 1178 foreach my $reading ( @all_readings ) {
b22576c6 1179 # Only output readings within our rank range.
1180 next if $startrank && $reading->rank < $startrank;
1181 next if $endrank && $reading->rank > $endrank;
1182 $used{$reading->id} = 1;
910a0a6d 1183 # Need not output nodes without separate labels
3a2ebbf4 1184 next if $reading->id eq $reading->text;
d4b75f44 1185 my $rattrs;
30f0df34 1186 my $label = $reading->text;
6b6c7098 1187 unless( $label =~ /^[[:punct:]]+$/ ) {
1188 $label .= '-' if $reading->join_next;
1189 $label = "-$label" if $reading->join_prior;
1190 }
8f9cab7b 1191 $label =~ s/\"/\\\"/g;
d4b75f44 1192 $rattrs->{'label'} = $label;
10e4b1ac 1193 $rattrs->{'id'} = $reading->id;
0ecb975c 1194 $rattrs->{'fillcolor'} = '#b3f36d' if $reading->is_common && $color_common;
d4b75f44 1195 $dot .= sprintf( "\t\"%s\" %s;\n", $reading->id, _dot_attr_string( $rattrs ) );
df6d9812 1196 }
3a2ebbf4 1197
522c1314 1198 # Add the real edges.
b22576c6 1199 my @edges = $self->paths;
3bdec618 1200 my( %substart, %subend );
b22576c6 1201 foreach my $edge ( @edges ) {
1202 # Do we need to output this edge?
508fd430 1203 if( $used{$edge->[0]} && $used{$edge->[1]} ) {
57560672 1204 my $label = $self->_path_display_label( $opts,
1205 $self->path_witnesses( $edge ) );
f13b5582 1206 my $variables = { %edge_attrs, 'label' => $label };
30ddc24c 1207
b22576c6 1208 # Account for the rank gap if necessary
30ddc24c 1209 my $rank0 = $self->reading( $edge->[0] )->rank
1210 if $self->reading( $edge->[0] )->has_rank;
1211 my $rank1 = $self->reading( $edge->[1] )->rank
1212 if $self->reading( $edge->[1] )->has_rank;
1213 if( defined $rank0 && defined $rank1 && $rank1 - $rank0 > 1 ) {
1214 $variables->{'minlen'} = $rank1 - $rank0;
1215 }
1216
508fd430 1217 # EXPERIMENTAL: make edge width reflect no. of witnesses
1218 my $extrawidth = scalar( $self->path_witnesses( $edge ) ) * 0.2;
1219 $variables->{'penwidth'} = $extrawidth + 0.8; # gives 1 for a single wit
1220
f13b5582 1221 my $varopts = _dot_attr_string( $variables );
1222 $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
1223 $edge->[0], $edge->[1], $varopts );
3bdec618 1224 } elsif( $used{$edge->[0]} ) {
96ba0418 1225 $subend{$edge->[0]} = $edge->[1];
3bdec618 1226 } elsif( $used{$edge->[1]} ) {
96ba0418 1227 $substart{$edge->[1]} = $edge->[0];
b22576c6 1228 }
df6d9812 1229 }
bed6ce83 1230
1231 # If we are asked to, add relationship links
1232 if( exists $opts->{show_relations} ) {
1233 my $filter = $opts->{show_relations}; # can be 'transposition' or 'all'
1234 if( $filter eq 'transposition' ) {
1235 $filter =~ qr/^transposition$/;
1236 }
9e54e11b 1237 my %typecolors;
1238 my @types = sort( map { $_->name } $self->relations->types );
1239 if( exists $opts->{graphcolors} ) {
1240 foreach my $tdx ( 0 .. $#types ) {
1241 $typecolors{$types[$tdx]} = $opts->{graphcolors}->[$tdx];
1242 }
1243 } else {
1244 map { $typecolors{$_} = '#FFA14F' } @types;
1245 }
bed6ce83 1246 foreach my $redge ( $self->relationships ) {
1247 if( $used{$redge->[0]} && $used{$redge->[1]} ) {
9e54e11b 1248 my $rel = $self->get_relationship( $redge );
1249 next unless $filter eq 'all' || $rel->type =~ /$filter/;
1250 my $variables = {
1251 arrowhead => 'none',
1252 color => $typecolors{$rel->type},
1253 constraint => 'false',
1254 penwidth => '3',
1255 };
1256 unless( exists $opts->{graphcolors} ) {
1257 $variables->{label} = uc( substr( $rel->type, 0, 4 ) ),
bed6ce83 1258 }
9e54e11b 1259 $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
1260 $redge->[0], $redge->[1], _dot_attr_string( $variables ) );
bed6ce83 1261 }
1262 }
1263 }
1264
3bdec618 1265 # Add substitute start and end edges if necessary
1266 foreach my $node ( keys %substart ) {
57560672 1267 my $witstr = $self->_path_display_label( $opts,
1268 $self->path_witnesses( $substart{$node}, $node ) );
f13b5582 1269 my $variables = { %edge_attrs, 'label' => $witstr };
96ba0418 1270 my $nrdg = $self->reading( $node );
1271 if( $nrdg->has_rank && $nrdg->rank > $startrank ) {
1272 # Substart is actually one lower than $startrank
1273 $variables->{'minlen'} = $nrdg->rank - ( $startrank - 1 );
1274 }
f13b5582 1275 my $varopts = _dot_attr_string( $variables );
96ba0418 1276 $dot .= "\t\"__SUBSTART__\" -> \"$node\" $varopts;\n";
3bdec618 1277 }
1278 foreach my $node ( keys %subend ) {
57560672 1279 my $witstr = $self->_path_display_label( $opts,
1280 $self->path_witnesses( $node, $subend{$node} ) );
f13b5582 1281 my $variables = { %edge_attrs, 'label' => $witstr };
1282 my $varopts = _dot_attr_string( $variables );
96ba0418 1283 $dot .= "\t\"$node\" -> \"__SUBEND__\" $varopts;\n";
3bdec618 1284 }
b365fbae 1285 # HACK part 2
1286 if( $STRAIGHTENHACK ) {
43c94341 1287 my $endlabel = $endrank ? '__SUBEND__' : '__END__';
1288 $dot .= "\t\"$endlabel\" -> \"#SILENT#\" [ color=white,penwidth=0 ];\n";
b365fbae 1289 }
30ddc24c 1290
df6d9812 1291 $dot .= "}\n";
1292 return $dot;
1293}
1294
f13b5582 1295sub _dot_attr_string {
1296 my( $hash ) = @_;
1297 my @attrs;
1298 foreach my $k ( sort keys %$hash ) {
1299 my $v = $hash->{$k};
1300 push( @attrs, $k.'="'.$v.'"' );
1301 }
1302 return( '[ ' . join( ', ', @attrs ) . ' ]' );
1303}
1304
027d819c 1305=head2 path_witnesses( $edge )
1306
1307Returns the list of sigils whose witnesses are associated with the given edge.
1308The edge can be passed as either an array or an arrayref of ( $source, $target ).
1309
1310=cut
1311
3a2ebbf4 1312sub path_witnesses {
1313 my( $self, @edge ) = @_;
1314 # If edge is an arrayref, cope.
1315 if( @edge == 1 && ref( $edge[0] ) eq 'ARRAY' ) {
1316 my $e = shift @edge;
1317 @edge = @$e;
1318 }
1319 my @wits = keys %{$self->sequence->get_edge_attributes( @edge )};
508fd430 1320 return @wits;
3a2ebbf4 1321}
1322
7f9f05e8 1323# Helper function. Make a display label for the given witnesses, showing a.c.
1324# witnesses only where the main witness is not also in the list.
027d819c 1325sub _path_display_label {
508fd430 1326 my $self = shift;
57560672 1327 my $opts = shift;
7f9f05e8 1328 my %wits;
1329 map { $wits{$_} = 1 } @_;
1330
1331 # If an a.c. wit is listed, remove it if the main wit is also listed.
1332 # Otherwise keep it for explicit listing.
1333 my $aclabel = $self->ac_label;
1334 my @disp_ac;
1335 foreach my $w ( sort keys %wits ) {
1336 if( $w =~ /^(.*)\Q$aclabel\E$/ ) {
1337 if( exists $wits{$1} ) {
1338 delete $wits{$w};
1339 } else {
1340 push( @disp_ac, $w );
1341 }
1342 }
1343 }
1344
57560672 1345 if( $opts->{'explicit_wits'} ) {
7f9f05e8 1346 return join( ', ', sort keys %wits );
57560672 1347 } else {
1348 # See if we are in a majority situation.
1349 my $maj = scalar( $self->tradition->witnesses ) * 0.6;
1350 $maj = $maj > 5 ? $maj : 5;
1351 if( scalar keys %wits > $maj ) {
1352 unshift( @disp_ac, 'majority' );
1353 return join( ', ', @disp_ac );
1354 } else {
1355 return join( ', ', sort keys %wits );
1356 }
8f9cab7b 1357 }
1358}
1dd07bda 1359
522c1314 1360=head2 as_adjacency_list
1dd07bda 1361
522c1314 1362Returns a JSON structure that represents the collation sequence graph.
1dd07bda 1363
8a9a8200 1364=begin testing
1365
1366use JSON qw/ from_json /;
1367use Text::Tradition;
1368
1369my $t = Text::Tradition->new(
1370 'input' => 'Self',
1371 'file' => 't/data/florilegium_graphml.xml' );
1372my $c = $t->collation;
1373
1374# Make a connection so we can test rank preservation
1375$c->add_relationship( 'w91', 'w92', { type => 'grammatical' } );
1376
1377# Create an adjacency list of the whole thing; test the output.
1378my $adj_whole = from_json( $c->as_adjacency_list() );
1379is( scalar @$adj_whole, scalar $c->readings(),
1380 "Same number of nodes in graph and adjacency list" );
1381my @adj_whole_edges;
1382map { push( @adj_whole_edges, @{$_->{adjacent}} ) } @$adj_whole;
1383is( scalar @adj_whole_edges, scalar $c->sequence->edges,
1384 "Same number of edges in graph and adjacency list" );
1385# Find the reading whose rank should be preserved
1386my( $test_rdg ) = grep { $_->{id} eq 'w89' } @$adj_whole;
1387my( $test_edge ) = grep { $_->{id} eq 'w92' } @{$test_rdg->{adjacent}};
1388is( $test_edge->{minlen}, 2, "Rank of test reading is preserved" );
1389
1390# Now create an adjacency list of just a portion. w76 to w122
1391my $adj_part = from_json( $c->as_adjacency_list(
1392 { from => $c->reading('w76')->rank,
1393 to => $c->reading('w122')->rank }));
1394is( scalar @$adj_part, 48, "Correct number of nodes in partial graph" );
1395my @adj_part_edges;
1396map { push( @adj_part_edges, @{$_->{adjacent}} ) } @$adj_part;
1397is( scalar @adj_part_edges, 58,
1398 "Same number of edges in partial graph and adjacency list" );
1399# Check for consistency
1400my %part_nodes;
1401map { $part_nodes{$_->{id}} = 1 } @$adj_part;
1402foreach my $edge ( @adj_part_edges ) {
1403 my $testid = $edge->{id};
1404 ok( $part_nodes{$testid}, "ID $testid referenced in edge is given as node" );
1405}
1406
1407=end testing
1408
1dd07bda 1409=cut
1410
522c1314 1411sub as_adjacency_list {
1412 my( $self, $opts ) = @_;
1413 # Make a structure that contains all the nodes, the nodes they point to,
1414 # and the attributes of the edges that connect them.
1415 # [ { id: 'n0', label: 'Gallia', adjacent: [
1416 # { id: 'n1', label: 'P Q' } ,
1417 # { id: 'n2', label: 'R S', minlen: 2 } ] },
1418 # { id: 'n1', label: 'est', adjacent: [ ... ] },
1419 # ... ]
1420 my $startrank = $opts->{'from'} || 0;
1421 my $endrank = $opts->{'to'} || $self->end->rank;
1422
8a9a8200 1423 $self->calculate_ranks()
1424 unless( $self->_graphcalc_done || $opts->{'nocalc'} || !$self->linear );
522c1314 1425 my $list = [];
1426 foreach my $rdg ( $self->readings ) {
1427 my @successors;
1428 my $phony = '';
8a9a8200 1429 # Figure out what the node's successors should be.
522c1314 1430 if( $rdg eq $self->start && $startrank > 0 ) {
1431 # Connect the start node with all the nodes at startrank.
8a9a8200 1432 # Lacunas should be included only if the node really has that rank.
1433 @successors = $self->readings_at_rank( $startrank, 1 );
522c1314 1434 $phony = 'start';
1435 } elsif( $rdg->rank < $startrank
1436 || $rdg->rank > $endrank && $rdg ne $self->end ) {
1437 next;
522c1314 1438 } else {
1439 @successors = $rdg->successors;
1440 }
8a9a8200 1441 # Make sure that the end node is at the end of the successors
1442 # list if it is needed.
1443 if( grep { $_ eq $self->end } @successors ) {
1444 my @ts = grep { $_ ne $self->end } @successors;
1445 @successors = ( @ts, $self->end );
1446 } elsif ( grep { $_->rank > $endrank } @successors ) {
1447 push( @successors, $self->end );
1448 }
522c1314 1449
1450 my $listitem = { id => $rdg->id, label => $rdg->text };
1451 my $adjacent = [];
8a9a8200 1452 my @endwits;
522c1314 1453 foreach my $succ ( @successors ) {
1454 my @edgewits;
1455 if( $phony eq 'start' ) {
1456 @edgewits = $succ->witnesses;
8a9a8200 1457 } elsif( $self->sequence->has_edge( $rdg->id, $succ->id ) ) {
522c1314 1458 @edgewits = $self->path_witnesses( $rdg->id, $succ->id );
1459 }
8a9a8200 1460
1461 if( $succ eq $self->end ) {
1462 @edgewits = @endwits;
1463 } elsif( $succ->rank > $endrank ) {
1464 # These witnesses will point to 'end' instead, not to the
1465 # actual successor.
1466 push( @endwits, @edgewits );
1467 next;
1468 }
522c1314 1469 my $edgelabel = $self->_path_display_label( $opts, @edgewits );
1470 my $edgedef = { id => $succ->id, label => $edgelabel };
1471 my $rankoffset = $succ->rank - $rdg->rank;
1472 if( $rankoffset > 1 and $succ ne $self->end ) {
1473 $edgedef->{minlen} = $rankoffset;
1474 }
1475 push( @$adjacent, $edgedef );
1476 }
1477 $listitem->{adjacent} = $adjacent;
1478 push( @$list, $listitem );
bf6e338d 1479 }
522c1314 1480 return to_json( $list );
1481}
8f9cab7b 1482
4e5a7b2c 1483=head2 as_graphml
8e1394aa 1484
4e5a7b2c 1485Returns a GraphML representation of the collation. The GraphML will contain
1486two graphs. The first expresses the attributes of the readings and the witness
1487paths that link them; the second expresses the relationships that link the
1488readings. This is the native transfer format for a tradition.
8e1394aa 1489
56eefa04 1490=begin testing
1491
1492use Text::Tradition;
951ddfe8 1493use TryCatch;
56eefa04 1494
1495my $READINGS = 311;
1496my $PATHS = 361;
1497
1498my $datafile = 't/data/florilegium_tei_ps.xml';
1499my $tradition = Text::Tradition->new( 'input' => 'TEI',
1500 'name' => 'test0',
1501 'file' => $datafile,
1502 'linear' => 1 );
1503
1504ok( $tradition, "Got a tradition object" );
1505is( scalar $tradition->witnesses, 13, "Found all witnesses" );
1506ok( $tradition->collation, "Tradition has a collation" );
1507
1508my $c = $tradition->collation;
1509is( scalar $c->readings, $READINGS, "Collation has all readings" );
1510is( scalar $c->paths, $PATHS, "Collation has all paths" );
1511is( scalar $c->relationships, 0, "Collation has all relationships" );
1512
1513# Add a few relationships
1514$c->add_relationship( 'w123', 'w125', { 'type' => 'collated' } );
1515$c->add_relationship( 'w193', 'w196', { 'type' => 'collated' } );
b71e7ea8 1516$c->add_relationship( 'w257', 'w262', { 'type' => 'transposition',
1517 'is_significant' => 'yes' } );
56eefa04 1518
1519# Now write it to GraphML and parse it again.
1520
1521my $graphml = $c->as_graphml;
1522my $st = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml );
1523is( scalar $st->collation->readings, $READINGS, "Reparsed collation has all readings" );
1524is( scalar $st->collation->paths, $PATHS, "Reparsed collation has all paths" );
1525is( scalar $st->collation->relationships, 3, "Reparsed collation has new relationships" );
b71e7ea8 1526my $sigrel = $st->collation->get_relationship( 'w257', 'w262' );
1527is( $sigrel->is_significant, 'yes', "Ternary attribute value was restored" );
56eefa04 1528
9fef629b 1529# Now add a stemma, write to GraphML, and look at the output.
951ddfe8 1530SKIP: {
37bf09f4 1531 skip "Analysis module not present", 3 unless $tradition->can( 'add_stemma' );
951ddfe8 1532 my $stemma = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
1533 is( ref( $stemma ), 'Text::Tradition::Stemma', "Parsed dotfile into stemma" );
1534 is( $tradition->stemmata, 1, "Tradition now has the stemma" );
1535 $graphml = $c->as_graphml;
1536 like( $graphml, qr/digraph/, "Digraph declaration exists in GraphML" );
1537}
2a812726 1538
56eefa04 1539=end testing
1540
8e1394aa 1541=cut
1542
a445ce40 1543## TODO MOVE this to Tradition.pm and modularize it better
8e1394aa 1544sub as_graphml {
a30ca502 1545 my( $self, $options ) = @_;
3d14b48e 1546 $self->calculate_ranks unless $self->_graphcalc_done;
1547
a30ca502 1548 my $start = $options->{'from'}
1549 ? $self->reading( $options->{'from'} ) : $self->start;
1550 my $end = $options->{'to'}
1551 ? $self->reading( $options->{'to'} ) : $self->end;
1552 if( $start->has_rank && $end->has_rank && $end->rank < $start->rank ) {
1553 throw( 'Start node must be before end node' );
1554 }
1555 # The readings need to be ranked for this to work.
1556 $start = $self->start unless $start->has_rank;
1557 $end = $self->end unless $end->has_rank;
414cc046 1558 my $rankoffset = 0;
1559 unless( $start eq $self->start ) {
1560 $rankoffset = $start->rank - 1;
1561 }
a30ca502 1562 my %use_readings;
1563
8e1394aa 1564 # Some namespaces
1565 my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns';
1566 my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance';
1567 my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' .
910a0a6d 1568 'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
8e1394aa 1569
1570 # Create the document and root node
428bcf0b 1571 require XML::LibXML;
8e1394aa 1572 my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
1573 my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
1574 $graphml->setDocumentElement( $root );
1575 $root->setNamespace( $xsi_ns, 'xsi', 0 );
1576 $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
bbd064a9 1577
1578 # List of attribute types to save on our objects and their corresponding
1579 # GraphML types
1580 my %save_types = (
1581 'Str' => 'string',
1582 'Int' => 'int',
1583 'Bool' => 'boolean',
10e4b1ac 1584 'ReadingID' => 'string',
bbd064a9 1585 'RelationshipType' => 'string',
1586 'RelationshipScope' => 'string',
b71e7ea8 1587 'Ternary' => 'string',
bbd064a9 1588 );
1589
bbd064a9 1590 # Add the data keys for the graph. Include an extra key 'version' for the
1591 # GraphML output version.
e309421a 1592 my %graph_data_keys;
1593 my $gdi = 0;
bbd064a9 1594 my %graph_attributes = ( 'version' => 'string' );
1595 # Graph attributes include those of Tradition and those of Collation.
1596 my %gattr_from;
f97ef19e 1597 # TODO Use meta introspection method from duplicate_reading to do this
1598 # instead of naming custom keys.
bbd064a9 1599 my $tmeta = $self->tradition->meta;
1600 my $cmeta = $self->meta;
1601 map { $gattr_from{$_->name} = 'Tradition' } $tmeta->get_all_attributes;
1602 map { $gattr_from{$_->name} = 'Collation' } $cmeta->get_all_attributes;
1603 foreach my $attr ( ( $tmeta->get_all_attributes, $cmeta->get_all_attributes ) ) {
1604 next if $attr->name =~ /^_/;
bbd064a9 1605 next unless $save_types{$attr->type_constraint->name};
1606 $graph_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1607 }
9fef629b 1608 # Extra custom keys for complex objects that should be saved in some form.
1609 # The subroutine should return a string, or undef/empty.
951ddfe8 1610 if( $tmeta->has_method('stemmata') ) {
1611 $graph_attributes{'stemmata'} = sub {
1612 my @stemstrs;
1613 map { push( @stemstrs, $_->editable( {linesep => ''} ) ) }
1614 $self->tradition->stemmata;
1615 join( "\n", @stemstrs );
1616 };
1617 }
1618
8943ff68 1619 if( $tmeta->has_method('user') ) {
1620 $graph_attributes{'user'} = sub {
1621 $self->tradition->user ? $self->tradition->user->id : undef
1622 };
1623 }
bbd064a9 1624
1625 foreach my $datum ( sort keys %graph_attributes ) {
e309421a 1626 $graph_data_keys{$datum} = 'dg'.$gdi++;
1627 my $key = $root->addNewChild( $graphml_ns, 'key' );
9fef629b 1628 my $dtype = ref( $graph_attributes{$datum} ) ? 'string'
1629 : $graph_attributes{$datum};
e309421a 1630 $key->setAttribute( 'attr.name', $datum );
9fef629b 1631 $key->setAttribute( 'attr.type', $dtype );
e309421a 1632 $key->setAttribute( 'for', 'graph' );
1633 $key->setAttribute( 'id', $graph_data_keys{$datum} );
1634 }
f6066bac 1635
bbd064a9 1636 # Add the data keys for reading nodes
1637 my %reading_attributes;
1638 my $rmeta = Text::Tradition::Collation::Reading->meta;
1639 foreach my $attr( $rmeta->get_all_attributes ) {
1640 next if $attr->name =~ /^_/;
bbd064a9 1641 next unless $save_types{$attr->type_constraint->name};
1642 $reading_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1643 }
a445ce40 1644 if( $self->start->does('Text::Tradition::Morphology' ) ) {
1645 # Extra custom key for the reading morphology
1646 $reading_attributes{'lexemes'} = 'string';
1647 }
7cd9f181 1648
ef9d481f 1649 my %node_data_keys;
1650 my $ndi = 0;
bbd064a9 1651 foreach my $datum ( sort keys %reading_attributes ) {
910a0a6d 1652 $node_data_keys{$datum} = 'dn'.$ndi++;
1653 my $key = $root->addNewChild( $graphml_ns, 'key' );
1654 $key->setAttribute( 'attr.name', $datum );
bbd064a9 1655 $key->setAttribute( 'attr.type', $reading_attributes{$datum} );
910a0a6d 1656 $key->setAttribute( 'for', 'node' );
1657 $key->setAttribute( 'id', $node_data_keys{$datum} );
8e1394aa 1658 }
1659
bbd064a9 1660 # Add the data keys for edges, that is, paths and relationships. Path
1661 # data does not come from a Moose class so is here manually.
ef9d481f 1662 my $edi = 0;
1663 my %edge_data_keys;
bbd064a9 1664 my %edge_attributes = (
3a2ebbf4 1665 witness => 'string', # ID/label for a path
3a2ebbf4 1666 extra => 'boolean', # Path key
3a2ebbf4 1667 );
bbd064a9 1668 my @path_attributes = keys %edge_attributes; # track our manual additions
1669 my $pmeta = Text::Tradition::Collation::Relationship->meta;
1670 foreach my $attr( $pmeta->get_all_attributes ) {
1671 next if $attr->name =~ /^_/;
bbd064a9 1672 next unless $save_types{$attr->type_constraint->name};
1673 $edge_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1674 }
1675 foreach my $datum ( sort keys %edge_attributes ) {
3a2ebbf4 1676 $edge_data_keys{$datum} = 'de'.$edi++;
910a0a6d 1677 my $key = $root->addNewChild( $graphml_ns, 'key' );
3a2ebbf4 1678 $key->setAttribute( 'attr.name', $datum );
bbd064a9 1679 $key->setAttribute( 'attr.type', $edge_attributes{$datum} );
910a0a6d 1680 $key->setAttribute( 'for', 'edge' );
3a2ebbf4 1681 $key->setAttribute( 'id', $edge_data_keys{$datum} );
8e1394aa 1682 }
3a2ebbf4 1683
cc31ebaa 1684 # Add the collation graph itself. First, sanitize the name to a valid XML ID.
1685 my $xmlidname = $self->tradition->name;
1686 $xmlidname =~ s/(?!$xml10_namechar_rx)./_/g;
1687 if( $xmlidname !~ /^$xml10_namestartchar_rx/ ) {
1688 $xmlidname = '_'.$xmlidname;
1689 }
2c669bca 1690 my $sgraph = $root->addNewChild( $graphml_ns, 'graph' );
1691 $sgraph->setAttribute( 'edgedefault', 'directed' );
cc31ebaa 1692 $sgraph->setAttribute( 'id', $xmlidname );
2c669bca 1693 $sgraph->setAttribute( 'parse.edgeids', 'canonical' );
cc31ebaa 1694 $sgraph->setAttribute( 'parse.edges', 0 ); # fill in later
2c669bca 1695 $sgraph->setAttribute( 'parse.nodeids', 'canonical' );
cc31ebaa 1696 $sgraph->setAttribute( 'parse.nodes', 0 ); # fill in later
2c669bca 1697 $sgraph->setAttribute( 'parse.order', 'nodesfirst' );
22222af9 1698
2a812726 1699 # Tradition/collation attribute data
bbd064a9 1700 foreach my $datum ( keys %graph_attributes ) {
1701 my $value;
1702 if( $datum eq 'version' ) {
2a812726 1703 $value = '3.2';
9fef629b 1704 } elsif( ref( $graph_attributes{$datum} ) ) {
1705 my $sub = $graph_attributes{$datum};
1706 $value = &$sub();
bbd064a9 1707 } elsif( $gattr_from{$datum} eq 'Tradition' ) {
1708 $value = $self->tradition->$datum;
1709 } else {
1710 $value = $self->$datum;
1711 }
2c669bca 1712 _add_graphml_data( $sgraph, $graph_data_keys{$datum}, $value );
e309421a 1713 }
8e1394aa 1714
1715 my $node_ctr = 0;
1716 my %node_hash;
22222af9 1717 # Add our readings to the graph
3a2ebbf4 1718 foreach my $n ( sort { $a->id cmp $b->id } $self->readings ) {
a30ca502 1719 next if $n->has_rank && $n ne $self->start && $n ne $self->end &&
1720 ( $n->rank < $start->rank || $n->rank > $end->rank );
1721 $use_readings{$n->id} = 1;
2c669bca 1722 # Add to the main graph
1723 my $node_el = $sgraph->addNewChild( $graphml_ns, 'node' );
910a0a6d 1724 my $node_xmlid = 'n' . $node_ctr++;
3a2ebbf4 1725 $node_hash{ $n->id } = $node_xmlid;
910a0a6d 1726 $node_el->setAttribute( 'id', $node_xmlid );
bbd064a9 1727 foreach my $d ( keys %reading_attributes ) {
255875b8 1728 my $nval = $n->$d;
7cd9f181 1729 # Custom serialization
1730 if( $d eq 'lexemes' ) {
1731 # If nval is a true value, we have lexemes so we need to
1732 # serialize them. Otherwise set nval to undef so that the
1733 # key is excluded from this reading.
1734 $nval = $nval ? $n->_serialize_lexemes : undef;
18c64d55 1735 } elsif( $d eq 'normal_form' && $n->normal_form eq $n->text ) {
1736 $nval = undef;
7cd9f181 1737 }
cc31ebaa 1738 if( $rankoffset && $d eq 'rank' && $n ne $self->start ) {
414cc046 1739 # Adjust the ranks within the subgraph.
cc31ebaa 1740 $nval = $n eq $self->end ? $end->rank - $rankoffset + 1
1741 : $nval - $rankoffset;
414cc046 1742 }
255875b8 1743 _add_graphml_data( $node_el, $node_data_keys{$d}, $nval )
1744 if defined $nval;
1745 }
b15511bf 1746 }
1747
2c669bca 1748 # Add the path edges to the sequence graph
df6d9812 1749 my $edge_ctr = 0;
3a2ebbf4 1750 foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->sequence->edges() ) {
1751 # We add an edge in the graphml for every witness in $e.
a30ca502 1752 next unless( $use_readings{$e->[0]} || $use_readings{$e->[1]} );
1753 my @edge_wits = sort $self->path_witnesses( $e );
cc31ebaa 1754 $e->[0] = $self->start->id unless $use_readings{$e->[0]};
1755 $e->[1] = $self->end->id unless $use_readings{$e->[1]};
1756 # Skip any path from start to end; that witness is not in the subgraph.
1757 next if ( $e->[0] eq $self->start->id && $e->[1] eq $self->end->id );
a30ca502 1758 foreach my $wit ( @edge_wits ) {
3a2ebbf4 1759 my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
1760 $node_hash{ $e->[0] },
1761 $node_hash{ $e->[1] } );
2c669bca 1762 my $edge_el = $sgraph->addNewChild( $graphml_ns, 'edge' );
3a2ebbf4 1763 $edge_el->setAttribute( 'source', $from );
1764 $edge_el->setAttribute( 'target', $to );
1765 $edge_el->setAttribute( 'id', $id );
3a2ebbf4 1766
1767 # It's a witness path, so add the witness
1768 my $base = $wit;
1769 my $key = $edge_data_keys{'witness'};
1770 # Is this an ante-corr witness?
1771 my $aclabel = $self->ac_label;
1772 if( $wit =~ /^(.*)\Q$aclabel\E$/ ) {
1773 # Keep the base witness
1774 $base = $1;
1775 # ...and record that this is an 'extra' reading path
1776 _add_graphml_data( $edge_el, $edge_data_keys{'extra'}, $aclabel );
1777 }
1778 _add_graphml_data( $edge_el, $edge_data_keys{'witness'}, $base );
1779 }
1780 }
1781
cc31ebaa 1782 # Report the actual number of nodes and edges that went in
1783 $sgraph->setAttribute( 'parse.edges', $edge_ctr );
1784 $sgraph->setAttribute( 'parse.nodes', $node_ctr );
1785
22222af9 1786 # Add the relationship graph to the XML
bbd064a9 1787 map { delete $edge_data_keys{$_} } @path_attributes;
826d8773 1788 $self->relations->_as_graphml( $graphml_ns, $root, \%node_hash,
1789 $node_data_keys{'id'}, \%edge_data_keys );
8e1394aa 1790
94c00c71 1791 # Save and return the thing
1792 my $result = decode_utf8( $graphml->toString(1) );
94c00c71 1793 return $result;
df6d9812 1794}
1795
b15511bf 1796sub _add_graphml_data {
1797 my( $el, $key, $value ) = @_;
b15511bf 1798 return unless defined $value;
c9bf3dbf 1799 my $data_el = $el->addNewChild( $el->namespaceURI, 'data' );
b15511bf 1800 $data_el->setAttribute( 'key', $key );
1801 $data_el->appendText( $value );
8e1394aa 1802}
1803
4e5a7b2c 1804=head2 as_csv
910a0a6d 1805
1806Returns a CSV alignment table representation of the collation graph, one
2c669bca 1807row per witness (or witness uncorrected.)
910a0a6d 1808
ce5966fb 1809=head2 as_tsv
1810
1811Returns a tab-separated alignment table representation of the collation graph,
1812one row per witness (or witness uncorrected.)
1813
16203db5 1814=begin testing
1815
1816use Text::Tradition;
34ca808b 1817use Text::CSV;
16203db5 1818
1819my $READINGS = 311;
1820my $PATHS = 361;
34ca808b 1821my $WITS = 13;
1822my $WITAC = 4;
16203db5 1823
1824my $datafile = 't/data/florilegium_tei_ps.xml';
1825my $tradition = Text::Tradition->new( 'input' => 'TEI',
1826 'name' => 'test0',
1827 'file' => $datafile,
1828 'linear' => 1 );
1829
1830my $c = $tradition->collation;
1831# Export the thing to CSV
1832my $csvstr = $c->as_csv();
34ca808b 1833# Count the columns
1834my $csv = Text::CSV->new({ sep_char => ',', binary => 1 });
1835my @lines = split(/\n/, $csvstr );
1836ok( $csv->parse( $lines[0] ), "Successfully parsed first line of CSV" );
1837is( scalar( $csv->fields ), $WITS + $WITAC, "CSV has correct number of witness columns" );
cbc8e08f 1838my @q_ac = grep { $_ eq 'Q'.$c->ac_label } $csv->fields;
1839ok( @q_ac, "Found a layered witness" );
1840
16203db5 1841my $t2 = Text::Tradition->new( input => 'Tabular',
1842 name => 'test2',
1843 string => $csvstr,
1844 sep_char => ',' );
1845is( scalar $t2->collation->readings, $READINGS, "Reparsed CSV collation has all readings" );
1846is( scalar $t2->collation->paths, $PATHS, "Reparsed CSV collation has all paths" );
1847
1848# Now do it with TSV
1849my $tsvstr = $c->as_tsv();
1850my $t3 = Text::Tradition->new( input => 'Tabular',
1851 name => 'test3',
1852 string => $tsvstr,
1853 sep_char => "\t" );
1854is( scalar $t3->collation->readings, $READINGS, "Reparsed TSV collation has all readings" );
1855is( scalar $t3->collation->paths, $PATHS, "Reparsed TSV collation has all paths" );
1856
4e64b669 1857my $table = $c->alignment_table;
34ca808b 1858my $noaccsv = $c->as_csv({ noac => 1 });
1859my @noaclines = split(/\n/, $noaccsv );
1860ok( $csv->parse( $noaclines[0] ), "Successfully parsed first line of no-ac CSV" );
1861is( scalar( $csv->fields ), $WITS, "CSV has correct number of witness columns" );
4e64b669 1862is( $c->alignment_table, $table, "Request for CSV did not alter the alignment table" );
34ca808b 1863
cbc8e08f 1864my $safecsv = $c->as_csv({ safe_ac => 1});
1865my @safelines = split(/\n/, $safecsv );
1866ok( $csv->parse( $safelines[0] ), "Successfully parsed first line of safe CSV" );
1867is( scalar( $csv->fields ), $WITS + $WITAC, "CSV has correct number of witness columns" );
1868@q_ac = grep { $_ eq 'Q__L' } $csv->fields;
1869ok( @q_ac, "Found a sanitized layered witness" );
1870is( $c->alignment_table, $table, "Request for CSV did not alter the alignment table" );
16203db5 1871
de20588d 1872# Test relationship collapse
1873$c->add_relationship( $c->readings_at_rank( 37 ), { type => 'spelling' } );
1874$c->add_relationship( $c->readings_at_rank( 60 ), { type => 'spelling' } );
1875
1876my $mergedtsv = $c->as_tsv({mergetypes => [ 'spelling', 'orthographic' ] });
1877my $t4 = Text::Tradition->new( input => 'Tabular',
1878 name => 'test4',
1879 string => $mergedtsv,
1880 sep_char => "\t" );
1881is( scalar $t4->collation->readings, $READINGS - 2, "Reparsed TSV merge collation has fewer readings" );
1882is( scalar $t4->collation->paths, $PATHS - 4, "Reparsed TSV merge collation has fewer paths" );
1883
53ec2b6d 1884# Test non-ASCII sigla
1885my $t5 = Text::Tradition->new( input => 'Tabular',
1886 name => 'nonascii',
1887 file => 't/data/armexample.xlsx',
1888 excel => 'xlsx' );
1889my $awittsv = $t5->collation->as_tsv({ noac => 1, ascii => 1 });
1890my @awitlines = split( /\n/, $awittsv );
1891like( $awitlines[0], qr/_A_5315622/, "Found ASCII sigil variant in TSV" );
1892
16203db5 1893=end testing
1894
910a0a6d 1895=cut
1896
ce5966fb 1897sub _tabular {
34ca808b 1898 my( $self, $opts ) = @_;
1899 my $table = $self->alignment_table( $opts );
16203db5 1900 my $csv_options = { binary => 1, quote_null => 0 };
34ca808b 1901 $csv_options->{'sep_char'} = $opts->{fieldsep};
1902 if( $opts->{fieldsep} eq "\t" ) {
16203db5 1903 # If it is really tab separated, nothing is an escape char.
1904 $csv_options->{'quote_char'} = undef;
1905 $csv_options->{'escape_char'} = '';
1906 }
1907 my $csv = Text::CSV->new( $csv_options );
910a0a6d 1908 my @result;
53ec2b6d 1909
2c669bca 1910 # Make the header row
53ec2b6d 1911 my @witnesses = map { $_->{'witness'} } @{$table->{'alignment'}};
1912 if( $opts->{ascii} ) {
1913 # TODO think of a fix for this
1914 throw( "Cannot currently produce ASCII sigla with witness layers" )
1915 unless $opts->{noac};
1916 my @awits = map { $self->tradition->witness( $_ )->ascii_sigil } @witnesses;
1917 @witnesses = @awits;
1918 }
1919 $csv->combine( @witnesses );
43800a64 1920 push( @result, $csv->string );
53ec2b6d 1921
2c669bca 1922 # Make the rest of the rows
1923 foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
566f4595 1924 my @rowobjs = map { $_->{'tokens'}->[$idx] } @{$table->{'alignment'}};
1dd07bda 1925 my @row = map { $_ ? $_->{'t'}->text : $_ } @rowobjs;
de20588d 1926 # Quick and dirty collapse of requested relationship types
1927 if( ref( $opts->{mergetypes} ) eq 'ARRAY' ) {
1928 # Now substitute the reading in the relevant index of @row
1929 # for its merge-related reading
1930 my %substitutes;
1931 while( @rowobjs ) {
1932 my $thisr = shift @rowobjs;
1933 next unless $thisr;
1934 next if exists $substitutes{$thisr->{t}->text};
1935 # Make sure we don't have A <-> B substitutions.
1936 $substitutes{$thisr->{t}->text} = $thisr->{t}->text;
1937 foreach my $thatr ( @rowobjs ) {
1938 next unless $thatr;
1939 next if exists $substitutes{$thatr->{t}->text};
1940 my $ttrel = $self->get_relationship( $thisr->{t}, $thatr->{t} );
1941 next unless $ttrel;
1942 next unless grep { $ttrel->type eq $_ } @{$opts->{mergetypes}};
1943 # If we have got this far then we need to merge them.
1944 $substitutes{$thatr->{t}->text} = $thisr->{t}->text;
1945 }
1946 }
1947 @row = map { $_ && exists $substitutes{$_} ? $substitutes{$_} : $_ } @row;
1948 }
2c669bca 1949 $csv->combine( @row );
43800a64 1950 push( @result, $csv->string );
910a0a6d 1951 }
3a2ebbf4 1952 return join( "\n", @result );
910a0a6d 1953}
1954
ce5966fb 1955sub as_csv {
1956 my $self = shift;
34ca808b 1957 my $opts = shift || {};
1958 $opts->{fieldsep} = ',';
1959 return $self->_tabular( $opts );
ce5966fb 1960}
1961
1962sub as_tsv {
1963 my $self = shift;
34ca808b 1964 my $opts = shift || {};
1965 $opts->{fieldsep} = "\t";
1966 return $self->_tabular( $opts );
ce5966fb 1967}
1968
248276a2 1969=head2 alignment_table
2c669bca 1970
566f4595 1971Return a reference to an alignment table, in a slightly enhanced CollateX
1972format which looks like this:
1973
1974 $table = { alignment => [ { witness => "SIGIL",
4e5a7b2c 1975 tokens => [ { t => "TEXT" }, ... ] },
566f4595 1976 { witness => "SIG2",
4e5a7b2c 1977 tokens => [ { t => "TEXT" }, ... ] },
566f4595 1978 ... ],
1979 length => TEXTLEN };
1980
2c669bca 1981=cut
9f3ba6f7 1982
1dd07bda 1983sub alignment_table {
34ca808b 1984 my( $self, $opts ) = @_;
4e64b669 1985 if( $self->has_cached_table ) {
cbc8e08f 1986 return $self->cached_table
1987 unless $opts->{noac} || $opts->{safe_ac};
4e64b669 1988 }
1dd07bda 1989
0ecb975c 1990 # Make sure we can do this
1991 throw( "Need a linear graph in order to make an alignment table" )
1992 unless $self->linear;
b39e7cb5 1993 $self->calculate_ranks()
1994 unless $self->_graphcalc_done && $self->end->has_rank;
1995
2c669bca 1996 my $table = { 'alignment' => [], 'length' => $self->end->rank - 1 };
3a2ebbf4 1997 my @all_pos = ( 1 .. $self->end->rank - 1 );
68454b71 1998 foreach my $wit ( sort { $a->sigil cmp $b->sigil } $self->tradition->witnesses ) {
6771a1b1 1999 # say STDERR "Making witness row(s) for " . $wit->sigil;
1f7aa795 2000 my @wit_path = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
1dd07bda 2001 my @row = _make_witness_row( \@wit_path, \@all_pos );
bed6ce83 2002 my $witobj = { 'witness' => $wit->sigil, 'tokens' => \@row };
2003 $witobj->{'identifier'} = $wit->identifier if $wit->identifier;
2004 push( @{$table->{'alignment'}}, $witobj );
34ca808b 2005 if( $wit->is_layered && !$opts->{noac} ) {
1f7aa795 2006 my @wit_ac_path = $self->reading_sequence( $self->start, $self->end,
861c3e27 2007 $wit->sigil.$self->ac_label );
1dd07bda 2008 my @ac_row = _make_witness_row( \@wit_ac_path, \@all_pos );
cbc8e08f 2009 my $witlabel = $opts->{safe_ac}
2010 ? $wit->sigil . '__L' : $wit->sigil.$self->ac_label;
2011 my $witacobj = { 'witness' => $witlabel,
bed6ce83 2012 'tokens' => \@ac_row };
2013 $witacobj->{'identifier'} = $wit->identifier if $wit->identifier;
2014 push( @{$table->{'alignment'}}, $witacobj );
910a0a6d 2015 }
2016 }
cbc8e08f 2017 unless( $opts->{noac} || $opts->{safe_ac} ) {
4e64b669 2018 $self->cached_table( $table );
2019 }
1dd07bda 2020 return $table;
910a0a6d 2021}
2022
2023sub _make_witness_row {
1dd07bda 2024 my( $path, $positions ) = @_;
910a0a6d 2025 my %char_hash;
2026 map { $char_hash{$_} = undef } @$positions;
2c669bca 2027 my $debug = 0;
910a0a6d 2028 foreach my $rdg ( @$path ) {
6771a1b1 2029 say STDERR "rank " . $rdg->rank if $debug;
2030 # say STDERR "No rank for " . $rdg->id unless defined $rdg->rank;
1dd07bda 2031 $char_hash{$rdg->rank} = { 't' => $rdg };
910a0a6d 2032 }
2033 my @row = map { $char_hash{$_} } @$positions;
eca16057 2034 # Fill in lacuna markers for undef spots in the row
2035 my $last_el = shift @row;
2036 my @filled_row = ( $last_el );
2037 foreach my $el ( @row ) {
0e476982 2038 # If we are using node reference, make the lacuna node appear many times
2039 # in the table. If not, use the lacuna tag.
1dd07bda 2040 if( $last_el && $last_el->{'t'}->is_lacuna && !defined $el ) {
2041 $el = $last_el;
eca16057 2042 }
2043 push( @filled_row, $el );
2044 $last_el = $el;
2045 }
2046 return @filled_row;
910a0a6d 2047}
2048
248276a2 2049
4e5a7b2c 2050=head1 NAVIGATION METHODS
910a0a6d 2051
4e5a7b2c 2052=head2 reading_sequence( $first, $last, $sigil, $backup )
e2902068 2053
2054Returns the ordered list of readings, starting with $first and ending
4e5a7b2c 2055with $last, for the witness given in $sigil. If a $backup sigil is
2056specified (e.g. when walking a layered witness), it will be used wherever
2057no $sigil path exists. If there is a base text reading, that will be
2058used wherever no path exists for $sigil or $backup.
e2902068 2059
2060=cut
2061
910a0a6d 2062# TODO Think about returning some lazy-eval iterator.
b0b4421a 2063# TODO Get rid of backup; we should know from what witness is whether we need it.
910a0a6d 2064
e2902068 2065sub reading_sequence {
861c3e27 2066 my( $self, $start, $end, $witness ) = @_;
e2902068 2067
930ff666 2068 $witness = $self->baselabel unless $witness;
e2902068 2069 my @readings = ( $start );
2070 my %seen;
2071 my $n = $start;
3a2ebbf4 2072 while( $n && $n->id ne $end->id ) {
2073 if( exists( $seen{$n->id} ) ) {
63778331 2074 throw( "Detected loop for $witness at " . $n->id );
910a0a6d 2075 }
3a2ebbf4 2076 $seen{$n->id} = 1;
910a0a6d 2077
861c3e27 2078 my $next = $self->next_reading( $n, $witness );
44771cf2 2079 unless( $next ) {
63778331 2080 throw( "Did not find any path for $witness from reading " . $n->id );
44771cf2 2081 }
910a0a6d 2082 push( @readings, $next );
2083 $n = $next;
e2902068 2084 }
2085 # Check that the last reading is our end reading.
2086 my $last = $readings[$#readings];
63778331 2087 throw( "Last reading found from " . $start->text .
2088 " for witness $witness is not the end!" ) # TODO do we get this far?
3a2ebbf4 2089 unless $last->id eq $end->id;
e2902068 2090
2091 return @readings;
2092}
2093
522c1314 2094=head2 readings_at_rank( $rank )
2095
2096Returns a list of readings at a given rank, taken from the alignment table.
2097
2098=cut
2099
2100sub readings_at_rank {
8a9a8200 2101 my( $self, $rank, $nolacuna ) = @_;
522c1314 2102 my $table = $self->alignment_table;
2103 # Table rank is real rank - 1.
2104 my @elements = map { $_->{'tokens'}->[$rank-1] } @{$table->{'alignment'}};
2105 my %readings;
2106 foreach my $e ( @elements ) {
2107 next unless ref( $e ) eq 'HASH';
2108 next unless exists $e->{'t'};
8a9a8200 2109 my $rdg = $e->{'t'};
2110 next if $nolacuna && $rdg->is_lacuna && $rdg->rank ne $rank;
522c1314 2111 $readings{$e->{'t'}->id} = $e->{'t'};
2112 }
2113 return values %readings;
2114}
2115
4e5a7b2c 2116=head2 next_reading( $reading, $sigil );
8e1394aa 2117
4a8828f0 2118Returns the reading that follows the given reading along the given witness
930ff666 2119path.
8e1394aa 2120
2121=cut
2122
4a8828f0 2123sub next_reading {
e2902068 2124 # Return the successor via the corresponding path.
8e1394aa 2125 my $self = shift;
3a2ebbf4 2126 my $answer = $self->_find_linked_reading( 'next', @_ );
2c669bca 2127 return undef unless $answer;
3a2ebbf4 2128 return $self->reading( $answer );
8e1394aa 2129}
2130
4e5a7b2c 2131=head2 prior_reading( $reading, $sigil )
8e1394aa 2132
4a8828f0 2133Returns the reading that precedes the given reading along the given witness
930ff666 2134path.
8e1394aa 2135
2136=cut
2137
4a8828f0 2138sub prior_reading {
e2902068 2139 # Return the predecessor via the corresponding path.
8e1394aa 2140 my $self = shift;
3a2ebbf4 2141 my $answer = $self->_find_linked_reading( 'prior', @_ );
2142 return $self->reading( $answer );
8e1394aa 2143}
2144
4a8828f0 2145sub _find_linked_reading {
861c3e27 2146 my( $self, $direction, $node, $path ) = @_;
2147
2148 # Get a backup if we are dealing with a layered witness
2149 my $alt_path;
2150 my $aclabel = $self->ac_label;
2151 if( $path && $path =~ /^(.*)\Q$aclabel\E$/ ) {
2152 $alt_path = $1;
2153 }
2154
e2902068 2155 my @linked_paths = $direction eq 'next'
3a2ebbf4 2156 ? $self->sequence->edges_from( $node )
2157 : $self->sequence->edges_to( $node );
e2902068 2158 return undef unless scalar( @linked_paths );
8e1394aa 2159
e2902068 2160 # We have to find the linked path that contains all of the
2161 # witnesses supplied in $path.
2162 my( @path_wits, @alt_path_wits );
4e5a7b2c 2163 @path_wits = sort( $self->_witnesses_of_label( $path ) ) if $path;
2164 @alt_path_wits = sort( $self->_witnesses_of_label( $alt_path ) ) if $alt_path;
e2902068 2165 my $base_le;
2166 my $alt_le;
2167 foreach my $le ( @linked_paths ) {
3a2ebbf4 2168 if( $self->sequence->has_edge_attribute( @$le, $self->baselabel ) ) {
910a0a6d 2169 $base_le = $le;
910a0a6d 2170 }
508fd430 2171 my @le_wits = sort $self->path_witnesses( $le );
3a2ebbf4 2172 if( _is_within( \@path_wits, \@le_wits ) ) {
2173 # This is the right path.
2174 return $direction eq 'next' ? $le->[1] : $le->[0];
2175 } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) {
2176 $alt_le = $le;
2177 }
8e1394aa 2178 }
e2902068 2179 # Got this far? Return the alternate path if it exists.
3a2ebbf4 2180 return $direction eq 'next' ? $alt_le->[1] : $alt_le->[0]
910a0a6d 2181 if $alt_le;
e2902068 2182
2183 # Got this far? Return the base path if it exists.
3a2ebbf4 2184 return $direction eq 'next' ? $base_le->[1] : $base_le->[0]
910a0a6d 2185 if $base_le;
e2902068 2186
2187 # Got this far? We have no appropriate path.
2c669bca 2188 warn "Could not find $direction node from " . $node->id
910a0a6d 2189 . " along path $path";
8e1394aa 2190 return undef;
2191}
2192
4a8828f0 2193# Some set logic.
2194sub _is_within {
2195 my( $set1, $set2 ) = @_;
7854e12e 2196 my $ret = @$set1; # will be 0, i.e. false, if set1 is empty
4a8828f0 2197 foreach my $el ( @$set1 ) {
910a0a6d 2198 $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
4a8828f0 2199 }
2200 return $ret;
2201}
2202
4e5a7b2c 2203# Return the string that joins together a list of witnesses for
2204# display on a single path.
2205sub _witnesses_of_label {
2206 my( $self, $label ) = @_;
2207 my $regex = $self->wit_list_separator;
2208 my @answer = split( /\Q$regex\E/, $label );
2209 return @answer;
b0b4421a 2210}
2211
d4b75f44 2212=head2 common_readings
2213
2214Returns the list of common readings in the graph (i.e. those readings that are
2215shared by all non-lacunose witnesses.)
2216
2217=cut
2218
2219sub common_readings {
2220 my $self = shift;
2221 my @common = grep { $_->is_common } $self->readings;
2222 return @common;
2223}
2224
fae52efd 2225=head2 path_text( $sigil, [, $start, $end ] )
b0b4421a 2226
2227Returns the text of a witness (plus its backup, if we are using a layer)
2228as stored in the collation. The text is returned as a string, where the
2229individual readings are joined with spaces and the meta-readings (e.g.
2230lacunae) are omitted. Optional specification of $start and $end allows
2231the generation of a subset of the witness text.
4e5a7b2c 2232
b0b4421a 2233=cut
2234
2235sub path_text {
861c3e27 2236 my( $self, $wit, $start, $end ) = @_;
b0b4421a 2237 $start = $self->start unless $start;
2238 $end = $self->end unless $end;
861c3e27 2239 my @path = grep { !$_->is_meta } $self->reading_sequence( $start, $end, $wit );
629e27b0 2240 my $pathtext = '';
2241 my $last;
fd613f7e 2242
2243 if ($self->direction eq 'RL') {
2244 @path = reverse @path;
2245 }
2246
629e27b0 2247 foreach my $r ( @path ) {
6ad2ce78 2248 unless ( $r->join_prior || !$last || $last->join_next ) {
2249 $pathtext .= ' ';
2250 }
2251 $pathtext .= $r->text;
629e27b0 2252 $last = $r;
2253 }
2254 return $pathtext;
b0b4421a 2255}
4e5a7b2c 2256
2257=head1 INITIALIZATION METHODS
2258
2259These are mostly for use by parsers.
2260
2261=head2 make_witness_path( $witness )
2262
2263Link the array of readings contained in $witness->path (and in
2264$witness->uncorrected_path if it exists) into collation paths.
2265Clear out the arrays when finished.
de51424a 2266
4e5a7b2c 2267=head2 make_witness_paths
2268
2269Call make_witness_path for all witnesses in the tradition.
2270
2271=cut
930ff666 2272
7e450e44 2273# For use when a collation is constructed from a base text and an apparatus.
2274# We have the sequences of readings and just need to add path edges.
1f7aa795 2275# When we are done, clear out the witness path attributes, as they are no
2276# longer needed.
2277# TODO Find a way to replace the witness path attributes with encapsulated functions?
e2902068 2278
6a222840 2279sub make_witness_paths {
2280 my( $self ) = @_;
910a0a6d 2281 foreach my $wit ( $self->tradition->witnesses ) {
6771a1b1 2282 # say STDERR "Making path for " . $wit->sigil;
910a0a6d 2283 $self->make_witness_path( $wit );
7854e12e 2284 }
7854e12e 2285}
2286
6a222840 2287sub make_witness_path {
7854e12e 2288 my( $self, $wit ) = @_;
2289 my @chain = @{$wit->path};
15d2d3df 2290 my $sig = $wit->sigil;
fae52efd 2291 # Add start and end if necessary
2292 unshift( @chain, $self->start ) unless $chain[0] eq $self->start;
2293 push( @chain, $self->end ) unless $chain[-1] eq $self->end;
7854e12e 2294 foreach my $idx ( 0 .. $#chain-1 ) {
910a0a6d 2295 $self->add_path( $chain[$idx], $chain[$idx+1], $sig );
7854e12e 2296 }
1f7aa795 2297 if( $wit->is_layered ) {
d9e873d0 2298 @chain = @{$wit->uncorrected_path};
fae52efd 2299 unshift( @chain, $self->start ) unless $chain[0] eq $self->start;
2300 push( @chain, $self->end ) unless $chain[-1] eq $self->end;
d9e873d0 2301 foreach my $idx( 0 .. $#chain-1 ) {
2302 my $source = $chain[$idx];
2303 my $target = $chain[$idx+1];
2304 $self->add_path( $source, $target, $sig.$self->ac_label )
2305 unless $self->has_path( $source, $target, $sig );
2306 }
15d2d3df 2307 }
1f7aa795 2308 $wit->clear_path;
2309 $wit->clear_uncorrected_path;
e2902068 2310}
2311
4e5a7b2c 2312=head2 calculate_ranks
2313
2314Calculate the reading ranks (that is, their aligned positions relative
2315to each other) for the graph. This can only be called on linear collations.
2316
b365fbae 2317=begin testing
2318
2319use Text::Tradition;
2320
2321my $cxfile = 't/data/Collatex-16.xml';
2322my $t = Text::Tradition->new(
2323 'name' => 'inline',
2324 'input' => 'CollateX',
2325 'file' => $cxfile,
2326 );
2327my $c = $t->collation;
2328
2329# Make an svg
bfcbcecb 2330my $table = $c->alignment_table;
2331ok( $c->has_cached_table, "Alignment table was cached" );
2332is( $c->alignment_table, $table, "Cached table returned upon second call" );
b365fbae 2333$c->calculate_ranks;
bfcbcecb 2334is( $c->alignment_table, $table, "Cached table retained with no rank change" );
864ee4bf 2335$c->add_relationship( 'n13', 'n23', { type => 'repetition' } );
2336is( $c->alignment_table, $table, "Alignment table unchanged after non-colo relationship add" );
2337$c->add_relationship( 'n24', 'n23', { type => 'spelling' } );
2338isnt( $c->alignment_table, $table, "Alignment table changed after colo relationship add" );
b365fbae 2339
2340=end testing
2341
4e5a7b2c 2342=cut
2343
910a0a6d 2344sub calculate_ranks {
2345 my $self = shift;
b365fbae 2346 # Save the existing ranks, in case we need to invalidate the cached SVG.
8d9494a8 2347 throw( "Cannot calculate ranks on a non-linear graph" )
2348 unless $self->linear;
b365fbae 2349 my %existing_ranks;
ac4d7ac2 2350 map { $existing_ranks{$_} = $_->rank } $self->readings;
359944f7 2351
2352 # Do the rankings based on the relationship equivalence graph, starting
2353 # with the start node.
56772e8c 2354 my ( $node_ranks, $rank_nodes ) = $self->relations->equivalence_ranks();
2355
910a0a6d 2356 # Transfer our rankings from the topological graph to the real one.
2357 foreach my $r ( $self->readings ) {
cecbe56d 2358 if( defined $node_ranks->{$self->equivalence( $r->id )} ) {
359944f7 2359 $r->rank( $node_ranks->{$self->equivalence( $r->id )} );
67da8d6c 2360 } else {
63778331 2361 # Die. Find the last rank we calculated.
359944f7 2362 my @all_defined = sort { ( $node_ranks->{$self->equivalence( $a->id )}||-1 )
2363 <=> ( $node_ranks->{$self->equivalence( $b->id )}||-1 ) }
63778331 2364 $self->readings;
2365 my $last = pop @all_defined;
2366 throw( "Ranks not calculated after $last - do you have a cycle in the graph?" );
67da8d6c 2367 }
de51424a 2368 }
bfcbcecb 2369 # Do we need to invalidate the cached data?
be3af600 2370 if( $self->has_cached_table ) {
b365fbae 2371 foreach my $r ( $self->readings ) {
7c293912 2372 next if defined( $existing_ranks{$r} )
2373 && $existing_ranks{$r} == $r->rank;
c1915ab9 2374 # Something has changed, so clear the cache
bfcbcecb 2375 $self->_clear_cache;
c1915ab9 2376 # ...and recalculate the common readings.
2377 $self->calculate_common_readings();
b365fbae 2378 last;
2379 }
2380 }
c1915ab9 2381 # The graph calculation information is now up to date.
2382 $self->_graphcalc_done(1);
8e1394aa 2383}
3a1f2523 2384
c1915ab9 2385sub _clear_cache {
2386 my $self = shift;
c1915ab9 2387 $self->wipe_table if $self->has_cached_table;
2388}
2389
2390
4e5a7b2c 2391=head2 flatten_ranks
2392
2393A convenience method for parsing collation data. Searches the graph for readings
2394with the same text at the same rank, and merges any that are found.
2395
2396=cut
2397
0e476982 2398sub flatten_ranks {
4ef65ab4 2399 my ( $self, %args ) = shift;
0e476982 2400 my %unique_rank_rdg;
bf6e338d 2401 my $changed;
4ef65ab4 2402 foreach my $p ( $self->identical_readings( %args ) ) {
2403 # say STDERR "Combining readings at same rank: @$p";
2404 $changed = 1;
2405 $self->merge_readings( @$p );
2406 # TODO see if this now makes a common point.
7a0956c1 2407 }
2408 # If we merged readings, the ranks are still fine but the alignment
2409 # table is wrong. Wipe it.
2410 $self->wipe_table() if $changed;
2411}
2412
2413=head2 identical_readings
2414=head2 identical_readings( start => $startnode, end => $endnode )
2415=head2 identical_readings( startrank => $startrank, endrank => $endrank )
2416
2417Goes through the graph identifying all pairs of readings that appear to be
2418identical, and therefore able to be merged into a single reading. Returns the
2419relevant identical pairs. Can be restricted to run over only a part of the
2420graph, specified either by node or by rank.
2421
2422=cut
2423
2424sub identical_readings {
2425 my ( $self, %args ) = @_;
2426 # Find where we should start and end.
2427 my $startrank = $args{startrank} || 0;
2428 if( $args{start} ) {
2429 throw( "Starting reading has no rank" ) unless $self->reading( $args{start} )
2430 && $self->reading( $args{start} )->has_rank;
2431 $startrank = $self->reading( $args{start} )->rank;
2432 }
2433 my $endrank = $args{endrank} || $self->end->rank;
2434 if( $args{end} ) {
2435 throw( "Ending reading has no rank" ) unless $self->reading( $args{end} )
2436 && $self->reading( $args{end} )->has_rank;
3c234eb6 2437 $endrank = $self->reading( $args{end} )->rank;
7a0956c1 2438 }
2439
2440 # Make sure the ranks are correct.
2441 unless( $self->_graphcalc_done ) {
2442 $self->calculate_ranks;
2443 }
2444 # Go through the readings looking for duplicates.
2445 my %unique_rank_rdg;
2446 my @pairs;
0e476982 2447 foreach my $rdg ( $self->readings ) {
2448 next unless $rdg->has_rank;
7a0956c1 2449 my $rk = $rdg->rank;
2450 next if $rk > $endrank || $rk < $startrank;
2451 my $key = $rk . "||" . $rdg->text;
0e476982 2452 if( exists $unique_rank_rdg{$key} ) {
07e6765f 2453 # Make sure they don't have different grammatical forms
2454 my $ur = $unique_rank_rdg{$key};
a445ce40 2455 if( $rdg->is_identical( $ur ) ) {
7a0956c1 2456 push( @pairs, [ $ur, $rdg ] );
07e6765f 2457 }
0e476982 2458 } else {
2459 $unique_rank_rdg{$key} = $rdg;
2460 }
7a0956c1 2461 }
2462
2463 return @pairs;
0e476982 2464}
4633f9e4 2465
2466
d4b75f44 2467=head2 calculate_common_readings
2468
2469Goes through the graph identifying the readings that appear in every witness
2470(apart from those with lacunae at that spot.) Marks them as common and returns
2471the list.
2472
2473=begin testing
2474
2475use Text::Tradition;
2476
2477my $cxfile = 't/data/Collatex-16.xml';
2478my $t = Text::Tradition->new(
2479 'name' => 'inline',
2480 'input' => 'CollateX',
2481 'file' => $cxfile,
2482 );
2483my $c = $t->collation;
2484
2485my @common = $c->calculate_common_readings();
2486is( scalar @common, 8, "Found correct number of common readings" );
2487my @marked = sort $c->common_readings();
2488is( scalar @common, 8, "All common readings got marked as such" );
679f17e1 2489my @expected = qw/ n1 n11 n16 n19 n20 n5 n6 n7 /;
d4b75f44 2490is_deeply( \@marked, \@expected, "Found correct list of common readings" );
2491
2492=end testing
2493
2494=cut
2495
2496sub calculate_common_readings {
2497 my $self = shift;
2498 my @common;
c1915ab9 2499 map { $_->is_common( 0 ) } $self->readings;
2500 # Implicitly calls calculate_ranks
1dd07bda 2501 my $table = $self->alignment_table;
d4b75f44 2502 foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
7f52eac8 2503 my @row = map { $_->{'tokens'}->[$idx]
2504 ? $_->{'tokens'}->[$idx]->{'t'} : '' }
2505 @{$table->{'alignment'}};
d4b75f44 2506 my %hash;
2507 foreach my $r ( @row ) {
2508 if( $r ) {
2509 $hash{$r->id} = $r unless $r->is_meta;
2510 } else {
2511 $hash{'UNDEF'} = $r;
2512 }
2513 }
2514 if( keys %hash == 1 && !exists $hash{'UNDEF'} ) {
2515 my( $r ) = values %hash;
2516 $r->is_common( 1 );
2517 push( @common, $r );
2518 }
2519 }
2520 return @common;
2521}
2522
861c3e27 2523=head2 text_from_paths
2524
2525Calculate the text array for all witnesses from the path, for later consistency
2526checking. Only to be used if there is no non-graph-based way to know the
2527original texts.
2528
2529=cut
2530
2531sub text_from_paths {
2532 my $self = shift;
2533 foreach my $wit ( $self->tradition->witnesses ) {
5164a6f0 2534 my @readings = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
2535 my @text;
2536 foreach my $r ( @readings ) {
2537 next if $r->is_meta;
2538 push( @text, $r->text );
2539 }
861c3e27 2540 $wit->text( \@text );
2541 if( $wit->is_layered ) {
5164a6f0 2542 my @ucrdgs = $self->reading_sequence( $self->start, $self->end,
2543 $wit->sigil.$self->ac_label );
2544 my @uctext;
2545 foreach my $r ( @ucrdgs ) {
2546 next if $r->is_meta;
2547 push( @uctext, $r->text );
2548 }
2549 $wit->layertext( \@uctext );
861c3e27 2550 }
2551 }
2552}
0e476982 2553
4e5a7b2c 2554=head1 UTILITY FUNCTIONS
2555
2556=head2 common_predecessor( $reading_a, $reading_b )
8e1394aa 2557
4e5a7b2c 2558Find the last reading that occurs in sequence before both the given readings.
414cc046 2559At the very least this should be $self->start.
4e5a7b2c 2560
2561=head2 common_successor( $reading_a, $reading_b )
2562
2563Find the first reading that occurs in sequence after both the given readings.
414cc046 2564At the very least this should be $self->end.
4e5a7b2c 2565
22222af9 2566=begin testing
2567
2568use Text::Tradition;
2569
2570my $cxfile = 't/data/Collatex-16.xml';
2571my $t = Text::Tradition->new(
2572 'name' => 'inline',
2573 'input' => 'CollateX',
2574 'file' => $cxfile,
2575 );
2576my $c = $t->collation;
2577
679f17e1 2578is( $c->common_predecessor( 'n24', 'n23' )->id,
22222af9 2579 'n20', "Found correct common predecessor" );
679f17e1 2580is( $c->common_successor( 'n24', 'n23' )->id,
10e4b1ac 2581 '__END__', "Found correct common successor" );
22222af9 2582
4e5a7b2c 2583is( $c->common_predecessor( 'n19', 'n17' )->id,
22222af9 2584 'n16', "Found correct common predecessor for readings on same path" );
679f17e1 2585is( $c->common_successor( 'n21', 'n10' )->id,
10e4b1ac 2586 '__END__', "Found correct common successor for readings on same path" );
22222af9 2587
2588=end testing
2589
2590=cut
2591
2592## Return the closest reading that is a predecessor of both the given readings.
2593sub common_predecessor {
2594 my $self = shift;
4e5a7b2c 2595 my( $r1, $r2 ) = $self->_objectify_args( @_ );
027d819c 2596 return $self->_common_in_path( $r1, $r2, 'predecessors' );
22222af9 2597}
2598
2599sub common_successor {
2600 my $self = shift;
4e5a7b2c 2601 my( $r1, $r2 ) = $self->_objectify_args( @_ );
027d819c 2602 return $self->_common_in_path( $r1, $r2, 'successors' );
22222af9 2603}
2604
414cc046 2605
2606# TODO think about how to do this without ranks...
027d819c 2607sub _common_in_path {
22222af9 2608 my( $self, $r1, $r2, $dir ) = @_;
414cc046 2609 my $iter = $self->end->rank;
22222af9 2610 my @candidates;
414cc046 2611 my @last_r1 = ( $r1 );
2612 my @last_r2 = ( $r2 );
2613 # my %all_seen = ( $r1 => 'r1', $r2 => 'r2' );
22222af9 2614 my %all_seen;
6771a1b1 2615 # say STDERR "Finding common $dir for $r1, $r2";
22222af9 2616 while( !@candidates ) {
414cc046 2617 last unless $iter--; # Avoid looping infinitely
2618 # Iterate separately down the graph from r1 and r2
2619 my( @new_lc1, @new_lc2 );
2620 foreach my $lc ( @last_r1 ) {
2621 foreach my $p ( $lc->$dir ) {
2622 if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r1' ) {
6771a1b1 2623 # say STDERR "Path candidate $p from $lc";
414cc046 2624 push( @candidates, $p );
002e3600 2625 } elsif( !$all_seen{$p->id} ) {
414cc046 2626 $all_seen{$p->id} = 'r1';
2627 push( @new_lc1, $p );
2628 }
2629 }
2630 }
2631 foreach my $lc ( @last_r2 ) {
22222af9 2632 foreach my $p ( $lc->$dir ) {
414cc046 2633 if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r2' ) {
6771a1b1 2634 # say STDERR "Path candidate $p from $lc";
22222af9 2635 push( @candidates, $p );
002e3600 2636 } elsif( !$all_seen{$p->id} ) {
414cc046 2637 $all_seen{$p->id} = 'r2';
2638 push( @new_lc2, $p );
22222af9 2639 }
2640 }
2641 }
414cc046 2642 @last_r1 = @new_lc1;
2643 @last_r2 = @new_lc2;
22222af9 2644 }
2645 my @answer = sort { $a->rank <=> $b->rank } @candidates;
2646 return $dir eq 'predecessors' ? pop( @answer ) : shift ( @answer );
2647}
2648
63778331 2649sub throw {
2650 Text::Tradition::Error->throw(
2651 'ident' => 'Collation error',
2652 'message' => $_[0],
2653 );
2654}
2655
dd3b58b0 2656no Moose;
2657__PACKAGE__->meta->make_immutable;
e867486f 2658
a445ce40 2659=head1 BUGS/TODO
2660
2661=over
2662
2663=item * Rework XML serialization in a more modular way
2664
2665=back
2666
027d819c 2667=head1 LICENSE
e867486f 2668
027d819c 2669This package is free software and is provided "as is" without express
2670or implied warranty. You can redistribute it and/or modify it under
2671the same terms as Perl itself.
e867486f 2672
027d819c 2673=head1 AUTHOR
e867486f 2674
027d819c 2675Tara L Andrews E<lt>aurum@cpan.orgE<gt>