1 package Text::Tradition::Collation;
4 use Encode qw( decode_utf8 );
8 use IPC::Run qw( run binary );
9 use JSON qw/ to_json /;
11 use Text::Tradition::Collation::Data;
12 use Text::Tradition::Collation::Reading;
13 use Text::Tradition::Collation::RelationshipStore;
14 use Text::Tradition::Error;
15 use XML::Easy::Syntax qw( $xml10_namestartchar_rx $xml10_namechar_rx );
17 use XML::LibXML::XPathContext;
21 isa => 'Text::Tradition::Collation::Data',
61 isa => 'Text::Tradition',
62 writer => '_set_tradition',
70 Text::Tradition::Collation - a software model for a text collation
75 my $t = Text::Tradition->new(
76 'name' => 'this is a text',
78 'file' => '/path/to/tei_parallel_seg_file.xml' );
80 my $c = $t->collation;
81 my @readings = $c->readings;
82 my @paths = $c->paths;
83 my @relationships = $c->relationships;
85 my $svg_variant_graph = $t->collation->as_svg();
89 Text::Tradition is a library for representation and analysis of collated
90 texts, particularly medieval ones. The Collation is the central feature of
91 a Tradition, where the text, its sequence of readings, and its relationships
92 between readings are actually kept.
98 The constructor. Takes a hash or hashref of the following arguments:
102 =item * tradition - The Text::Tradition object to which the collation
105 =item * linear - Whether the collation should be linear; that is, whether
106 transposed readings should be treated as two linked readings rather than one,
107 and therefore whether the collation graph is acyclic. Defaults to true.
109 =item * baselabel - The default label for the path taken by a base text
110 (if any). Defaults to 'base text'.
112 =item * wit_list_separator - The string to join a list of witnesses for
113 purposes of making labels in display graphs. Defaults to ', '.
115 =item * ac_label - The extra label to tack onto a witness sigil when
116 representing another layer of path for the given witness - that is, when
117 a text has more than one possible reading due to scribal corrections or
118 the like. Defaults to ' (a.c.)'.
120 =item * wordsep - The string used to separate words in the original text.
131 =head2 wit_list_separator
139 Simple accessors for collation attributes.
143 The meta-reading at the start of every witness path.
147 The meta-reading at the end of every witness path.
151 Returns all Reading objects in the graph.
153 =head2 reading( $id )
155 Returns the Reading object corresponding to the given ID.
157 =head2 add_reading( $reading_args )
159 Adds a new reading object to the collation.
160 See L<Text::Tradition::Collation::Reading> for the available arguments.
162 =head2 del_reading( $object_or_id )
164 Removes the given reading from the collation, implicitly removing its
165 paths and relationships.
167 =head2 has_reading( $id )
169 Predicate to see whether a given reading ID is in the graph.
171 =head2 reading_witnesses( $object_or_id )
173 Returns a list of sigils whose witnesses contain the reading.
177 Returns all reading paths within the document - that is, all edges in the
178 collation graph. Each path is an arrayref of [ $source, $target ] reading IDs.
180 =head2 add_path( $source, $target, $sigil )
182 Links the given readings in the collation in sequence, under the given witness
183 sigil. The readings may be specified by object or ID.
185 =head2 del_path( $source, $target, $sigil )
187 Links the given readings in the collation in sequence, under the given witness
188 sigil. The readings may be specified by object or ID.
190 =head2 has_path( $source, $target );
192 Returns true if the two readings are linked in sequence in any witness.
193 The readings may be specified by object or ID.
197 Returns all Relationship objects in the collation.
199 =head2 add_relationship( $reading, $other_reading, $options, $changed_readings )
201 Adds a new relationship of the type given in $options between the two readings,
202 which may be specified by object or ID. Returns a value of ( $status, @vectors)
203 where $status is true on success, and @vectors is a list of relationship edges
204 that were ultimately added. If an array reference is passed in as $changed_readings,
205 then any readings that were altered due to the relationship creation are added to
208 See L<Text::Tradition::Collation::Relationship> for the available options.
213 my ( $class, @args ) = @_;
214 my %args = @args == 1 ? %{ $args[0] } : @args;
215 # TODO determine these from the Moose::Meta object
216 my @delegate_attrs = qw(sequence relations readings wit_list_separator baselabel
217 linear wordsep direction start end cached_table _graphcalc_done);
219 for my $attr (@delegate_attrs) {
220 $data_args{$attr} = delete $args{$attr} if exists $args{$attr};
222 $args{_data} = Text::Tradition::Collation::Data->new(%data_args);
228 $self->_set_relations( Text::Tradition::Collation::RelationshipStore->new( 'collation' => $self ) );
229 $self->_set_start( $self->add_reading(
230 { 'collation' => $self, 'is_start' => 1, 'init' => 1 } ) );
231 $self->_set_end( $self->add_reading(
232 { 'collation' => $self, 'is_end' => 1, 'init' => 1 } ) );
235 =head2 register_relationship_type( %relationship_definition )
237 Add a relationship type definition to this collation. The argument can be either a
238 hash or a hashref, defining the properties of the relationship. For relationship types
239 and their properties, see L<Text::Tradition::Collation::RelationshipType>.
241 =head2 get_relationship_type( $relationship_name )
243 Retrieve the RelationshipType object for the relationship with the given name.
247 sub register_relationship_type {
249 my %args = @_ == 1 ? %{$_[0]} : @_;
250 if( $self->relations->has_type( $args{name} ) ) {
251 throw( 'Relationship type ' . $args{name} . ' already registered' );
253 $self->relations->add_type( %args );
256 sub get_relationship_type {
257 my( $self, $name ) = @_;
258 return $self->relations->has_type( $name )
259 ? $self->relations->type( $name ) : undef;
262 ### Reading construct/destruct functions
265 my( $self, $reading ) = @_;
266 unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
267 my %args = %$reading;
268 if( $args{'init'} ) {
269 # If we are initializing an empty collation, don't assume that we
270 # have set a tradition.
271 delete $args{'init'};
272 } elsif( $self->tradition->can('language') && $self->tradition->has_language
273 && !exists $args{'language'} ) {
274 $args{'language'} = $self->tradition->language;
276 $reading = Text::Tradition::Collation::Reading->new(
277 'collation' => $self,
280 # First check to see if a reading with this ID exists.
281 if( $self->reading( $reading->id ) ) {
282 throw( "Collation already has a reading with id " . $reading->id );
284 $self->_graphcalc_done(0);
285 $self->_add_reading( $reading->id => $reading );
286 # Once the reading has been added, put it in both graphs.
287 $self->sequence->add_vertex( $reading->id );
288 $self->relations->add_reading( $reading->id );
292 around del_reading => sub {
297 if( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
300 # Remove the reading from the graphs.
301 $self->_graphcalc_done(0);
302 $self->_clear_cache; # Explicitly clear caches to GC the reading
303 $self->sequence->delete_vertex( $arg );
304 $self->relations->delete_reading( $arg );
307 $self->$orig( $arg );
310 =head2 merge_readings( $main, $second, $concatenate, $with_str )
312 Merges the $second reading into the $main one. If $concatenate is true, then
313 the merged node will carry the text of both readings, concatenated with either
314 $with_str (if specified) or a sensible default (the empty string if the
315 appropriate 'join_*' flag is set on either reading, or else $self->wordsep.)
317 The first two arguments may be either readings or reading IDs.
324 my $cxfile = 't/data/Collatex-16.xml';
325 my $t = Text::Tradition->new(
327 'input' => 'CollateX',
330 my $c = $t->collation;
332 my $rno = scalar $c->readings;
333 # Split n21 ('unto') for testing purposes
334 my $new_r = $c->add_reading( { 'id' => 'n21p0', 'text' => 'un', 'join_next' => 1 } );
335 my $old_r = $c->reading( 'n21' );
336 $old_r->alter_text( 'to' );
337 $c->del_path( 'n20', 'n21', 'A' );
338 $c->add_path( 'n20', 'n21p0', 'A' );
339 $c->add_path( 'n21p0', 'n21', 'A' );
340 $c->add_relationship( 'n21', 'n22', { type => 'collated', scope => 'local' } );
342 ok( $c->reading( 'n21p0' ), "New reading exists" );
343 is( scalar $c->readings, $rno, "Reading add offset by flatten_ranks" );
345 # Combine n3 and n4 ( with his )
346 $c->merge_readings( 'n3', 'n4', 1 );
347 ok( !$c->reading('n4'), "Reading n4 is gone" );
348 is( $c->reading('n3')->text, 'with his', "Reading n3 has both words" );
350 # Collapse n9 and n10 ( rood / root )
351 $c->merge_readings( 'n9', 'n10' );
352 ok( !$c->reading('n10'), "Reading n10 is gone" );
353 is( $c->reading('n9')->text, 'rood', "Reading n9 has an unchanged word" );
355 # Try to combine n21 and n21p0. This should break.
356 my $remaining = $c->reading('n21');
357 $remaining ||= $c->reading('n22'); # one of these should still exist
359 $c->merge_readings( 'n21p0', $remaining, 1 );
360 ok( 0, "Bad reading merge changed the graph" );
361 } catch( Text::Tradition::Error $e ) {
362 like( $e->message, qr/neither concatenated nor collated/, "Expected exception from bad concatenation" );
364 ok( 0, "Unexpected error on bad reading merge: $@" );
368 $c->calculate_ranks();
369 ok( 1, "Graph is still evidently whole" );
370 } catch( Text::Tradition::Error $e ) {
371 ok( 0, "Caught a rank exception: " . $e->message );
382 my( $kept_obj, $del_obj, $combine, $combine_char ) = $self->_objectify_args( @_ );
383 my $mergemeta = $kept_obj->is_meta;
384 throw( "Cannot merge meta and non-meta reading" )
385 unless ( $mergemeta && $del_obj->is_meta )
386 || ( !$mergemeta && !$del_obj->is_meta );
388 throw( "Cannot merge with start or end node" )
389 if( $kept_obj eq $self->start || $kept_obj eq $self->end
390 || $del_obj eq $self->start || $del_obj eq $self->end );
391 throw( "Cannot combine text of meta readings" ) if $combine;
393 # We can only merge readings in a linear graph if:
394 # - they are contiguous with only one edge between them, OR
395 # - they are at equivalent ranks in the graph.
396 if( $self->linear ) {
397 my @delpred = $del_obj->predecessors;
398 my @keptsuc = $kept_obj->successors;
399 unless ( @delpred == 1 && $delpred[0] eq $kept_obj
400 && @keptsuc == 1 && $keptsuc[0] eq $del_obj ) {
401 my( $is_ok, $msg ) = $self->relations->relationship_valid(
402 $kept_obj, $del_obj, 'collated' );
404 throw( "Readings $kept_obj and $del_obj can be neither concatenated nor collated" );
409 # We only need the IDs for adding paths to the graph, not the reading
410 # objects themselves.
411 my $kept = $kept_obj->id;
412 my $deleted = $del_obj->id;
413 $self->_graphcalc_done(0);
415 # The kept reading should inherit the paths and the relationships
416 # of the deleted reading.
417 foreach my $path ( $self->sequence->edges_at( $deleted ) ) {
418 my @vector = ( $kept );
419 push( @vector, $path->[1] ) if $path->[0] eq $deleted;
420 unshift( @vector, $path->[0] ) if $path->[1] eq $deleted;
421 next if $vector[0] eq $vector[1]; # Don't add a self loop
422 my %wits = %{$self->sequence->get_edge_attributes( @$path )};
423 $self->sequence->add_edge( @vector );
424 my $fwits = $self->sequence->get_edge_attributes( @vector );
425 @wits{keys %$fwits} = values %$fwits;
426 $self->sequence->set_edge_attributes( @vector, \%wits );
428 $self->relations->merge_readings( $kept, $deleted, $combine );
430 # Do the deletion deed.
432 # Combine the text of the readings
433 my $joinstr = $combine_char;
434 unless( defined $joinstr ) {
435 $joinstr = '' if $kept_obj->join_next || $del_obj->join_prior;
436 $joinstr = $self->wordsep unless defined $joinstr;
438 $kept_obj->_combine( $del_obj, $joinstr );
440 $self->del_reading( $deleted );
443 =head2 merge_related( @relationship_types )
445 Merge all readings linked with the relationship types given. If any of the selected type(s) is not a colocation, the graph will no longer be linear. The majority/plurality reading in each case will be the one kept.
447 WARNING: This operation cannot be undone.
459 $t = Text::Tradition->new( 'input' => 'Self', 'file' => 't/data/legendfrag.xml' );
460 } [qr/Cannot set relationship on a meta reading/],
461 "Got expected relationship drop warning on parse";
463 my $c = $t->collation;
464 # Force the transitive propagation of all existing relationships.
465 $c->relations->propagate_all_relationships();
468 map { $rdg_ids{$_} = 1 } $c->readings;
469 $c->merge_related( 'orthographic' );
470 is( scalar( $c->readings ), keys( %rdg_ids ) - 9,
471 "Successfully collapsed orthographic variation" );
472 map { $rdg_ids{$_} = undef } qw/ r13.3 r11.4 r8.5 r8.2 r7.7 r7.5 r7.4 r7.3 r7.1 /;
473 foreach my $rid ( keys %rdg_ids ) {
474 my $exp = $rdg_ids{$rid};
475 is( !$c->reading( $rid ), !$exp, "Reading $rid correctly " .
476 ( $exp ? "retained" : "removed" ) );
478 ok( $c->linear, "Graph is still linear" );
480 $c->calculate_ranks; # This should succeed
481 ok( 1, "Can still calculate ranks on the new graph" );
483 ok( 0, "Rank calculation on merged graph failed: $@" );
486 # Now add some transpositions
487 $c->add_relationship( 'r8.4', 'r10.4', { type => 'transposition' } );
488 $c->merge_related( 'transposition' );
489 is( scalar( $c->readings ), keys( %rdg_ids ) - 10,
490 "Transposed relationship is merged away" );
491 ok( !$c->reading('r8.4'), "Correct transposed reading removed" );
492 ok( !$c->linear, "Graph is no longer linear" );
494 $c->calculate_ranks; # This should fail
495 ok( 0, "Rank calculation happened on nonlinear graph?!" );
496 } catch ( Text::Tradition::Error $e ) {
497 is( $e->message, 'Cannot calculate ranks on a non-linear graph',
498 "Rank calculation on merged graph threw an error" );
505 # TODO: there should be a way to display merged without affecting the underlying data!
510 map { $reltypehash{$_} = 1 } @_;
512 # Set up the filter for finding related readings
514 exists $reltypehash{$_[0]->type};
517 # Go through all readings looking for related ones
518 foreach my $r ( $self->readings ) {
519 next unless $self->reading( "$r" ); # might have been deleted meanwhile
520 while( my @related = $self->related_readings( $r, $filter ) ) {
521 push( @related, $r );
523 scalar $b->witnesses <=> scalar $a->witnesses
525 my $keep = shift @related;
526 foreach my $delr ( @related ) {
528 unless( $self->get_relationship( $keep, $delr )->colocated );
529 $self->merge_readings( $keep, $delr );
535 =head2 compress_readings
537 Where possible in the graph, compresses plain sequences of readings into a
538 single reading. The sequences must consist of readings with no
539 relationships to other readings, with only a single witness path between
540 them and no other witness paths from either that would skip the other. The
541 readings must also not be marked as nonsense or bad grammar.
543 WARNING: This operation cannot be undone.
549 my $t = Text::Tradition->new( input => 'CollateX', file => 't/data/Collatex-16.xml' );
550 my $c = $t->collation;
551 my $n = scalar $c->readings;
552 $c->compress_readings();
553 is( scalar $c->readings, $n - 6, "Compressing readings seems to work" );
555 # Now put in a join-word and make sure the thing still works.
556 my $t2 = Text::Tradition->new( input => 'CollateX', file => 't/data/Collatex-16.xml' );
557 my $c2 = $t2->collation;
558 # Split n21 ('unto') for testing purposes
559 my $new_r = $c2->add_reading( { 'id' => 'n21p0', 'text' => 'un', 'join_next' => 1 } );
560 my $old_r = $c2->reading( 'n21' );
561 $old_r->alter_text( 'to' );
562 $c2->del_path( 'n20', 'n21', 'A' );
563 $c2->add_path( 'n20', 'n21p0', 'A' );
564 $c2->add_path( 'n21p0', 'n21', 'A' );
565 $c2->calculate_ranks();
566 is( scalar $c2->readings, $n + 1, "We have our extra test reading" );
567 $c2->compress_readings();
568 is( scalar $c2->readings, $n - 6, "Compressing readings also works with join_next" );
569 is( $c2->reading( 'n21p0' )->text, 'unto', "The joined word has no space" );
576 sub compress_readings {
578 # Sanity check: first save the original text of each witness.
580 foreach my $wit ( $self->tradition->witnesses ) {
581 $origtext{$wit->sigil} = $self->path_text( $wit->sigil );
582 if( $wit->is_layered ) {
583 my $acsig = $wit->sigil . $self->ac_label;
584 $origtext{$acsig} = $self->path_text( $acsig );
589 # Anywhere in the graph that there is a reading that joins only to a single
590 # successor, and neither of these have any relationships, just join the two
592 foreach my $rdg ( sort { $a->rank <=> $b->rank } $self->readings ) {
593 # Now look for readings that can be joined to their successors.
594 next unless $rdg->is_combinable;
596 while( $self->sequence->successors( $rdg ) == 1 ) {
597 my( $next ) = $self->reading( $self->sequence->successors( $rdg ) );
598 throw( "Infinite loop" ) if $seen{$next->id};
599 $seen{$next->id} = 1;
600 last if $self->sequence->predecessors( $next ) > 1;
601 last unless $next->is_combinable;
602 say "Joining readings $rdg and $next";
603 $self->merge_readings( $rdg, $next, 1 );
607 # Finally, make sure we haven't screwed anything up.
608 foreach my $wit ( $self->tradition->witnesses ) {
609 my $pathtext = $self->path_text( $wit->sigil );
610 throw( "Text differs for witness " . $wit->sigil )
611 unless $pathtext eq $origtext{$wit->sigil};
612 if( $wit->is_layered ) {
613 my $acsig = $wit->sigil . $self->ac_label;
614 $pathtext = $self->path_text( $acsig );
615 throw( "Layered text differs for witness " . $wit->sigil )
616 unless $pathtext eq $origtext{$acsig};
620 $self->relations->rebuild_equivalence();
621 $self->calculate_ranks();
624 # Helper function for manipulating the graph.
625 sub _stringify_args {
626 my( $self, $first, $second, @args ) = @_;
628 if ref( $first ) eq 'Text::Tradition::Collation::Reading';
629 $second = $second->id
630 if ref( $second ) eq 'Text::Tradition::Collation::Reading';
631 return( $first, $second, @args );
634 # Helper function for manipulating the graph.
635 sub _objectify_args {
636 my( $self, $first, $second, $arg ) = @_;
637 $first = $self->reading( $first )
638 unless ref( $first ) eq 'Text::Tradition::Collation::Reading';
639 $second = $self->reading( $second )
640 unless ref( $second ) eq 'Text::Tradition::Collation::Reading';
641 return( $first, $second, $arg );
644 =head2 duplicate_reading( $reading, @witlist )
646 Split the given reading into two, so that the new reading is in the path for
647 the witnesses given in @witlist. If the result is that certain non-colocated
648 relationships (e.g. transpositions) are no longer valid, these will be removed.
649 Returns the newly-created reading.
653 use Test::More::UTF8;
657 my $st = Text::Tradition->new( 'input' => 'Self', 'file' => 't/data/collatecorr.xml' );
658 is( ref( $st ), 'Text::Tradition', "Got a tradition from test file" );
659 ok( $st->has_witness('Ba96'), "Tradition has the affected witness" );
661 my $sc = $st->collation;
663 ok( $sc->reading('n131'), "Tradition has the affected reading" );
664 is( scalar( $sc->readings ), $numr, "There are $numr readings in the graph" );
665 is( $sc->end->rank, 14, "There are fourteen ranks in the graph" );
667 # Detach the erroneously collated reading
668 my( $newr, @del_rdgs ) = $sc->duplicate_reading( 'n131', 'Ba96' );
669 ok( $newr, "New reading was created" );
670 ok( $sc->reading('n131_0'), "Detached the bad collation with a new reading" );
671 is( scalar( $sc->readings ), $numr + 1, "A reading was added to the graph" );
672 is( $sc->end->rank, 10, "There are now only ten ranks in the graph" );
673 my $csucc = $sc->common_successor( 'n131', 'n131_0' );
674 is( $csucc->id, 'n136', "Found correct common successor to duped reading" );
676 # Check that the bad transposition is gone
677 is( scalar @del_rdgs, 1, "Deleted reading was returned by API call" );
678 is( $sc->get_relationship( 'n130', 'n135' ), undef, "Bad transposition relationship is gone" );
680 # The collation should not be fixed
681 my @pairs = $sc->identical_readings();
682 is( scalar @pairs, 0, "Not re-collated yet" );
684 ok( $sc->merge_readings( 'n124', 'n131_0' ), "Collated the readings correctly" );
685 @pairs = $sc->identical_readings( start => 'n124', end => $csucc->id );
686 is( scalar @pairs, 3, "Found three more identical readings" );
687 is( $sc->end->rank, 11, "The ranks shifted appropriately" );
688 $sc->flatten_ranks();
689 is( scalar( $sc->readings ), $numr - 3, "Now we are collated correctly" );
691 # Check that we can't "duplicate" a reading with no wits or with all wits
693 my( $badr, @del_rdgs ) = $sc->duplicate_reading( 'n124' );
694 ok( 0, "Reading duplication without witnesses throws an error" );
695 } catch( Text::Tradition::Error $e ) {
696 like( $e->message, qr/Must specify one or more witnesses/,
697 "Reading duplication without witnesses throws the expected error" );
699 ok( 0, "Reading duplication without witnesses threw the wrong error" );
703 my( $badr, @del_rdgs ) = $sc->duplicate_reading( 'n124', 'Ba96', 'Mü11475' );
704 ok( 0, "Reading duplication with all witnesses throws an error" );
705 } catch( Text::Tradition::Error $e ) {
706 like( $e->message, qr/Cannot join all witnesses/,
707 "Reading duplication with all witnesses throws the expected error" );
709 ok( 0, "Reading duplication with all witnesses threw the wrong error" );
713 $sc->calculate_ranks();
714 ok( 1, "Graph is still evidently whole" );
715 } catch( Text::Tradition::Error $e ) {
716 ok( 0, "Caught a rank exception: " . $e->message );
723 sub duplicate_reading {
724 my( $self, $r, @wits ) = @_;
725 # Check that we are not doing anything unwise.
726 throw( "Must specify one or more witnesses for the duplicated reading" )
728 unless( ref( $r ) eq 'Text::Tradition::Collation::Reading' ) {
729 $r = $self->reading( $r );
731 throw( "Cannot duplicate a meta-reading" )
733 throw( "Cannot join all witnesses to the new reading" )
734 if scalar( @wits ) == scalar( $r->witnesses );
736 # Get all the reading attributes and duplicate them.
737 my $rmeta = Text::Tradition::Collation::Reading->meta;
739 foreach my $attr( $rmeta->get_all_attributes ) {
740 next if $attr->name =~ /^_/;
741 my $acc = $attr->get_read_method;
742 if( !$acc && $attr->has_applied_traits ) {
743 my $tr = $attr->applied_traits;
744 if( $tr->[0] =~ /::(Array|Hash)$/ ) {
746 my %methods = reverse %{$attr->handles};
747 $acc = $methods{elements};
748 $args{$attr->name} = $which eq 'Array'
749 ? [ $r->$acc ] : { $r->$acc };
752 my $attrval = $r->$acc;
753 if( defined $attrval ) {
754 $args{$attr->name} = $attrval;
758 # By definition the new reading will no longer be common.
759 $args{is_common} = 0;
760 # The new reading also needs its own ID.
761 $args{id} = $self->_generate_dup_id( $r->id );
763 # Try to make the new reading.
764 my $newr = $self->add_reading( \%args );
765 # The old reading is also no longer common.
768 # For each of the witnesses, dissociate from the old reading and
769 # associate with the new.
770 foreach my $wit ( @wits ) {
771 my $prior = $self->prior_reading( $r, $wit );
772 my $next = $self->next_reading( $r, $wit );
773 $self->del_path( $prior, $r, $wit );
774 $self->add_path( $prior, $newr, $wit );
775 $self->del_path( $r, $next, $wit );
776 $self->add_path( $newr, $next, $wit );
779 # If the graph is ranked, we need to look for relationships that are now
780 # invalid (i.e. 'non-colocation' types that might now be colocated) and
781 # remove them. If not, we can skip it.
784 my @deleted_relations;
785 if( $self->end->has_rank ) {
786 # Find the point where we can stop checking
787 $succ = $self->common_successor( $r, $newr );
789 # Hash the existing ranks
790 foreach my $rdg ( $self->readings ) {
791 $rrk{$rdg->id} = $rdg->rank;
793 # Calculate the new ranks
794 $self->calculate_ranks();
796 # Check for invalid non-colocated relationships among changed-rank readings
797 # from where the ranks start changing up to $succ
798 my $lastrank = $succ->rank;
799 foreach my $rdg ( $self->readings ) {
800 next if $rdg->rank > $lastrank;
801 next if $rdg->rank == $rrk{$rdg->id};
802 my @noncolo = $rdg->related_readings( sub { !$_[0]->colocated } );
803 next unless @noncolo;
804 foreach my $nc ( @noncolo ) {
805 unless( $self->relations->verify_or_delete( $rdg, $nc ) ) {
806 push( @deleted_relations, [ $rdg->id, $nc->id ] );
811 return ( $newr, @deleted_relations );
814 sub _generate_dup_id {
815 my( $self, $rid ) = @_;
820 if( $self->has_reading( $newid ) ) {
833 # We only need the IDs for adding paths to the graph, not the reading
834 # objects themselves.
835 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
837 $self->_graphcalc_done(0);
838 # Connect the readings
839 unless( $self->sequence->has_edge( $source, $target ) ) {
840 $self->sequence->add_edge( $source, $target );
841 $self->relations->add_equivalence_edge( $source, $target );
843 # Note the witness in question
844 $self->sequence->set_edge_attribute( $source, $target, $wit, 1 );
850 if( ref( $_[0] ) eq 'ARRAY' ) {
857 # We only need the IDs for removing paths from the graph, not the reading
858 # objects themselves.
859 my( $source, $target, $wit ) = $self->_stringify_args( @args );
861 $self->_graphcalc_done(0);
862 if( $self->sequence->has_edge_attribute( $source, $target, $wit ) ) {
863 $self->sequence->delete_edge_attribute( $source, $target, $wit );
865 unless( $self->sequence->has_edge_attributes( $source, $target ) ) {
866 $self->sequence->delete_edge( $source, $target );
867 $self->relations->delete_equivalence_edge( $source, $target );
872 # Extra graph-alike utility
875 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
876 return undef unless $self->sequence->has_edge( $source, $target );
877 return $self->sequence->has_edge_attribute( $source, $target, $wit );
880 =head2 clear_witness( @sigil_list )
882 Clear the given witnesses out of the collation entirely, removing references
883 to them in paths, and removing readings that belong only to them. Should only
884 be called via $tradition->del_witness.
889 my( $self, @sigils ) = @_;
891 $self->_graphcalc_done(0);
892 # Clear the witness(es) out of the paths
893 foreach my $e ( $self->paths ) {
894 foreach my $sig ( @sigils ) {
895 $self->del_path( $e, $sig );
899 # Clear out the newly unused readings
900 foreach my $r ( $self->readings ) {
901 unless( $self->reading_witnesses( $r ) ) {
902 $self->del_reading( $r );
907 sub add_relationship {
909 my( $source, $target, $opts, $altered_readings ) = $self->_stringify_args( @_ );
910 my( @vectors ) = $self->relations->add_relationship( $source, $target, $opts );
912 foreach my $v ( @vectors ) {
913 my $rel = $self->get_relationship( $v );
914 next unless $rel->colocated;
915 my $r1 = $self->reading( $v->[0] );
916 my $r2 = $self->reading( $v->[1] );
917 # If it's a spelling or orthographic relationship, and one is marked
918 # as a lemma, set the normal form on the non-lemma to reflect that.
919 if( $r1->does( 'Text::Tradition::Morphology' ) ) {
920 my @changed = $r1->relationship_added( $r2, $rel );
921 if( ref( $altered_readings ) eq 'ARRAY' ) {
922 push( @$altered_readings, @changed );
926 if( $r1->has_rank && $r2->has_rank && $r1->rank ne $r2->rank ) {
927 $self->_graphcalc_done(0);
935 around qw/ get_relationship del_relationship / => sub {
939 if( @args == 1 && ref( $args[0] ) eq 'ARRAY' ) {
942 my @stringargs = $self->_stringify_args( @args );
943 $self->$orig( @stringargs );
946 =head2 reading_witnesses( $reading )
948 Return a list of sigils corresponding to the witnesses in which the reading appears.
952 sub reading_witnesses {
953 my( $self, $reading ) = @_;
954 # We need only check either the incoming or the outgoing edges; I have
955 # arbitrarily chosen "incoming". Thus, special-case the start node.
956 if( $reading eq $self->start ) {
957 return map { $_->sigil } grep { $_->is_collated } $self->tradition->witnesses;
960 foreach my $e ( $self->sequence->edges_to( $reading ) ) {
961 my $wits = $self->sequence->get_edge_attributes( @$e );
962 @all_witnesses{ keys %$wits } = 1;
964 my $acstr = $self->ac_label;
965 foreach my $acwit ( grep { $_ =~ s/^(.*)\Q$acstr\E$/$1/ } keys %all_witnesses ) {
966 delete $all_witnesses{$acwit.$acstr} if exists $all_witnesses{$acwit};
968 return keys %all_witnesses;
971 =head1 OUTPUT METHODS
973 =head2 as_svg( \%options )
975 Returns an SVG string that represents the graph, via as_dot and graphviz.
976 See as_dot for a list of options. Must have GraphViz (dot) installed to run.
983 use XML::LibXML::XPathContext;
987 skip( 'Need Graphviz installed to test graphs', 16 )
988 unless File::Which::which( 'dot' );
990 my $datafile = 't/data/Collatex-16.xml';
992 my $tradition = Text::Tradition->new(
994 'input' => 'CollateX',
997 my $collation = $tradition->collation;
999 # Test the svg creation
1000 my $parser = XML::LibXML->new();
1001 $parser->load_ext_dtd( 0 );
1002 my $svg = $parser->parse_string( $collation->as_svg() );
1003 is( $svg->documentElement->nodeName(), 'svg', 'Got an svg document' );
1005 # Test for the correct number of nodes in the SVG
1006 my $svg_xpc = XML::LibXML::XPathContext->new( $svg->documentElement() );
1007 $svg_xpc->registerNs( 'svg', 'http://www.w3.org/2000/svg' );
1008 my @svg_nodes = $svg_xpc->findnodes( '//svg:g[@class="node"]' );
1009 is( scalar @svg_nodes, 26, "Correct number of nodes in the graph" );
1011 # Test for the correct number of edges
1012 my @svg_edges = $svg_xpc->findnodes( '//svg:g[@class="edge"]' );
1013 is( scalar @svg_edges, 32, "Correct number of edges in the graph" );
1015 # Test svg creation for a subgraph
1016 my $part_svg = $parser->parse_string( $collation->as_svg( { from => 15 } ) ); # start, no end
1017 is( $part_svg->documentElement->nodeName(), 'svg', "Got an svg subgraph to end" );
1018 my $part_xpc = XML::LibXML::XPathContext->new( $part_svg->documentElement() );
1019 $part_xpc->registerNs( 'svg', 'http://www.w3.org/2000/svg' );
1020 @svg_nodes = $part_xpc->findnodes( '//svg:g[@class="node"]' );
1021 is( scalar( @svg_nodes ), 9,
1022 "Correct number of nodes in the subgraph" );
1023 @svg_edges = $part_xpc->findnodes( '//svg:g[@class="edge"]' );
1024 is( scalar( @svg_edges ), 10,
1025 "Correct number of edges in the subgraph" );
1027 $part_svg = $parser->parse_string( $collation->as_svg( { from => 10, to => 13 } ) ); # start, no end
1028 is( $part_svg->documentElement->nodeName(), 'svg', "Got an svg subgraph in the middle" );
1029 $part_xpc = XML::LibXML::XPathContext->new( $part_svg->documentElement() );
1030 $part_xpc->registerNs( 'svg', 'http://www.w3.org/2000/svg' );
1031 @svg_nodes = $part_xpc->findnodes( '//svg:g[@class="node"]' );
1032 is( scalar( @svg_nodes ), 9,
1033 "Correct number of nodes in the subgraph" );
1034 @svg_edges = $part_xpc->findnodes( '//svg:g[@class="edge"]' );
1035 is( scalar( @svg_edges ), 11,
1036 "Correct number of edges in the subgraph" );
1039 $part_svg = $parser->parse_string( $collation->as_svg( { to => 5 } ) ); # start, no end
1040 is( $part_svg->documentElement->nodeName(), 'svg', "Got an svg subgraph from start" );
1041 $part_xpc = XML::LibXML::XPathContext->new( $part_svg->documentElement() );
1042 $part_xpc->registerNs( 'svg', 'http://www.w3.org/2000/svg' );
1043 @svg_nodes = $part_xpc->findnodes( '//svg:g[@class="node"]' );
1044 is( scalar( @svg_nodes ), 7,
1045 "Correct number of nodes in the subgraph" );
1046 @svg_edges = $part_xpc->findnodes( '//svg:g[@class="edge"]' );
1047 is( scalar( @svg_edges ), 7,
1048 "Correct number of edges in the subgraph" );
1050 # Test a right-to-left graph
1051 my $arabic = Text::Tradition->new(
1056 file => 't/data/arabic_snippet.csv' );
1057 my $rl_svg = $parser->parse_string( $arabic->collation->as_svg() );
1058 is( $rl_svg->documentElement->nodeName(), 'svg', "Got an svg subgraph from start" );
1059 my $rl_xpc = XML::LibXML::XPathContext->new( $rl_svg->documentElement() );
1060 $rl_xpc->registerNs( 'svg', 'http://www.w3.org/2000/svg' );
1062 foreach my $node ( $rl_xpc->findnodes( '//svg:g[@class="node"]' ) ) {
1063 my $nid = $node->getAttribute('id');
1064 $node_cx{$nid} = $rl_xpc->findvalue( './svg:ellipse/@cx', $node );
1066 my @sorted = sort { $node_cx{$a} <=> $node_cx{$b} } keys( %node_cx );
1067 is( $sorted[0], '__END__', "End node is the leftmost" );
1068 is( $sorted[$#sorted], '__START__', "Start node is the rightmost" );
1077 my( $self, $opts ) = @_;
1078 throw( "Need GraphViz installed to output SVG" )
1079 unless File::Which::which( 'dot' );
1080 my $want_subgraph = exists $opts->{'from'} || exists $opts->{'to'};
1081 $self->calculate_ranks()
1082 unless( $self->_graphcalc_done || $opts->{'nocalc'} || !$self->linear );
1083 my @cmd = qw/dot -Tsvg/;
1085 my $dotfile = File::Temp->new();
1086 ## USE FOR DEBUGGING
1087 # $dotfile->unlink_on_destroy(0);
1088 binmode $dotfile, ':utf8';
1089 print $dotfile $self->as_dot( $opts );
1090 push( @cmd, $dotfile->filename );
1091 run( \@cmd, ">", binary(), \$svg );
1092 $svg = decode_utf8( $svg );
1097 =head2 as_dot( \%options )
1099 Returns a string that is the collation graph expressed in dot
1100 (i.e. GraphViz) format. Options include:
1108 =item * color_common
1115 my( $self, $opts ) = @_;
1116 my $startrank = $opts->{'from'} if $opts;
1117 my $endrank = $opts->{'to'} if $opts;
1118 my $color_common = $opts->{'color_common'} if $opts;
1119 my $STRAIGHTENHACK = !$startrank && !$endrank && $self->end->rank
1120 && $self->end->rank > 100;
1121 $STRAIGHTENHACK = 1 if $opts->{'straight'}; # even for subgraphs or small graphs
1123 # Check the arguments
1125 return if $endrank && $startrank > $endrank;
1126 return if $startrank > $self->end->rank;
1128 if( defined $endrank ) {
1129 return if $endrank < 0;
1130 $endrank = undef if $endrank == $self->end->rank;
1133 my $graph_name = $self->tradition->name;
1134 $graph_name =~ s/[^\w\s]//g;
1135 $graph_name = join( '_', split( /\s+/, $graph_name ) );
1138 'bgcolor' => 'none',
1140 unless( $self->direction eq 'BI' ) {
1141 $graph_attrs{rankdir} = $self->direction;
1145 'fillcolor' => 'white',
1146 'style' => 'filled',
1147 'shape' => 'ellipse'
1150 'arrowhead' => 'open',
1151 'color' => '#000000',
1152 'fontcolor' => '#000000',
1155 my $dot = sprintf( "digraph %s {\n", $graph_name );
1156 $dot .= "\tgraph " . _dot_attr_string( \%graph_attrs ) . ";\n";
1157 $dot .= "\tnode " . _dot_attr_string( \%node_attrs ) . ";\n";
1159 # Output substitute start/end readings if necessary
1161 $dot .= "\t\"__SUBSTART__\" [ label=\"...\",id=\"__START__\" ];\n";
1164 $dot .= "\t\"__SUBEND__\" [ label=\"...\",id=\"__END__\" ];\n";
1166 if( $STRAIGHTENHACK ) {
1168 my $startlabel = $startrank ? '__SUBSTART__' : '__START__';
1169 $dot .= "\tsubgraph { rank=same \"$startlabel\" \"#SILENT#\" }\n";
1170 $dot .= "\t\"#SILENT#\" [ shape=diamond,color=white,penwidth=0,label=\"\" ];"
1172 my %used; # Keep track of the readings that actually appear in the graph
1173 # Sort the readings by rank if we have ranks; this speeds layout.
1174 my @all_readings = $self->end->has_rank
1175 ? sort { $a->rank <=> $b->rank } $self->readings
1177 # TODO Refrain from outputting lacuna nodes - just grey out the edges.
1178 foreach my $reading ( @all_readings ) {
1179 # Only output readings within our rank range.
1180 next if $startrank && $reading->rank < $startrank;
1181 next if $endrank && $reading->rank > $endrank;
1182 $used{$reading->id} = 1;
1183 # Need not output nodes without separate labels
1184 next if $reading->id eq $reading->text;
1186 my $label = $reading->text;
1187 unless( $label =~ /^[[:punct:]]+$/ ) {
1188 $label .= '-' if $reading->join_next;
1189 $label = "-$label" if $reading->join_prior;
1191 $label =~ s/\"/\\\"/g;
1192 $rattrs->{'label'} = $label;
1193 $rattrs->{'id'} = $reading->id;
1194 $rattrs->{'fillcolor'} = '#b3f36d' if $reading->is_common && $color_common;
1195 $dot .= sprintf( "\t\"%s\" %s;\n", $reading->id, _dot_attr_string( $rattrs ) );
1198 # Add the real edges.
1199 my @edges = $self->paths;
1200 my( %substart, %subend );
1201 foreach my $edge ( @edges ) {
1202 # Do we need to output this edge?
1203 if( $used{$edge->[0]} && $used{$edge->[1]} ) {
1204 my $label = $self->_path_display_label( $opts,
1205 $self->path_witnesses( $edge ) );
1206 my $variables = { %edge_attrs, 'label' => $label };
1208 # Account for the rank gap if necessary
1209 my $rank0 = $self->reading( $edge->[0] )->rank
1210 if $self->reading( $edge->[0] )->has_rank;
1211 my $rank1 = $self->reading( $edge->[1] )->rank
1212 if $self->reading( $edge->[1] )->has_rank;
1213 if( defined $rank0 && defined $rank1 && $rank1 - $rank0 > 1 ) {
1214 $variables->{'minlen'} = $rank1 - $rank0;
1217 # EXPERIMENTAL: make edge width reflect no. of witnesses
1218 my $extrawidth = scalar( $self->path_witnesses( $edge ) ) * 0.2;
1219 $variables->{'penwidth'} = $extrawidth + 0.8; # gives 1 for a single wit
1221 my $varopts = _dot_attr_string( $variables );
1222 $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
1223 $edge->[0], $edge->[1], $varopts );
1224 } elsif( $used{$edge->[0]} ) {
1225 $subend{$edge->[0]} = $edge->[1];
1226 } elsif( $used{$edge->[1]} ) {
1227 $substart{$edge->[1]} = $edge->[0];
1231 # If we are asked to, add relationship links
1232 if( exists $opts->{show_relations} ) {
1233 my $filter = $opts->{show_relations}; # can be 'transposition' or 'all'
1234 if( $filter eq 'transposition' ) {
1235 $filter =~ qr/^transposition$/;
1238 my @types = sort( map { $_->name } $self->relations->types );
1239 if( exists $opts->{graphcolors} ) {
1240 foreach my $tdx ( 0 .. $#types ) {
1241 $typecolors{$types[$tdx]} = $opts->{graphcolors}->[$tdx];
1244 map { $typecolors{$_} = '#FFA14F' } @types;
1246 foreach my $redge ( $self->relationships ) {
1247 if( $used{$redge->[0]} && $used{$redge->[1]} ) {
1248 my $rel = $self->get_relationship( $redge );
1249 next unless $filter eq 'all' || $rel->type =~ /$filter/;
1251 arrowhead => 'none',
1252 color => $typecolors{$rel->type},
1253 constraint => 'false',
1256 unless( exists $opts->{graphcolors} ) {
1257 $variables->{label} = uc( substr( $rel->type, 0, 4 ) ),
1259 $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
1260 $redge->[0], $redge->[1], _dot_attr_string( $variables ) );
1265 # Add substitute start and end edges if necessary
1266 foreach my $node ( keys %substart ) {
1267 my $witstr = $self->_path_display_label( $opts,
1268 $self->path_witnesses( $substart{$node}, $node ) );
1269 my $variables = { %edge_attrs, 'label' => $witstr };
1270 my $nrdg = $self->reading( $node );
1271 if( $nrdg->has_rank && $nrdg->rank > $startrank ) {
1272 # Substart is actually one lower than $startrank
1273 $variables->{'minlen'} = $nrdg->rank - ( $startrank - 1 );
1275 my $varopts = _dot_attr_string( $variables );
1276 $dot .= "\t\"__SUBSTART__\" -> \"$node\" $varopts;\n";
1278 foreach my $node ( keys %subend ) {
1279 my $witstr = $self->_path_display_label( $opts,
1280 $self->path_witnesses( $node, $subend{$node} ) );
1281 my $variables = { %edge_attrs, 'label' => $witstr };
1282 my $varopts = _dot_attr_string( $variables );
1283 $dot .= "\t\"$node\" -> \"__SUBEND__\" $varopts;\n";
1286 if( $STRAIGHTENHACK ) {
1287 my $endlabel = $endrank ? '__SUBEND__' : '__END__';
1288 $dot .= "\t\"$endlabel\" -> \"#SILENT#\" [ color=white,penwidth=0 ];\n";
1295 sub _dot_attr_string {
1298 foreach my $k ( sort keys %$hash ) {
1299 my $v = $hash->{$k};
1300 push( @attrs, $k.'="'.$v.'"' );
1302 return( '[ ' . join( ', ', @attrs ) . ' ]' );
1305 =head2 path_witnesses( $edge )
1307 Returns the list of sigils whose witnesses are associated with the given edge.
1308 The edge can be passed as either an array or an arrayref of ( $source, $target ).
1312 sub path_witnesses {
1313 my( $self, @edge ) = @_;
1314 # If edge is an arrayref, cope.
1315 if( @edge == 1 && ref( $edge[0] ) eq 'ARRAY' ) {
1316 my $e = shift @edge;
1319 my @wits = keys %{$self->sequence->get_edge_attributes( @edge )};
1323 # Helper function. Make a display label for the given witnesses, showing a.c.
1324 # witnesses only where the main witness is not also in the list.
1325 sub _path_display_label {
1329 map { $wits{$_} = 1 } @_;
1331 # If an a.c. wit is listed, remove it if the main wit is also listed.
1332 # Otherwise keep it for explicit listing.
1333 my $aclabel = $self->ac_label;
1335 foreach my $w ( sort keys %wits ) {
1336 if( $w =~ /^(.*)\Q$aclabel\E$/ ) {
1337 if( exists $wits{$1} ) {
1340 push( @disp_ac, $w );
1345 if( $opts->{'explicit_wits'} ) {
1346 return join( ', ', sort keys %wits );
1348 # See if we are in a majority situation.
1349 my $maj = scalar( $self->tradition->witnesses ) * 0.6;
1350 $maj = $maj > 5 ? $maj : 5;
1351 if( scalar keys %wits > $maj ) {
1352 unshift( @disp_ac, 'majority' );
1353 return join( ', ', @disp_ac );
1355 return join( ', ', sort keys %wits );
1360 =head2 as_adjacency_list
1362 Returns a JSON structure that represents the collation sequence graph.
1366 use JSON qw/ from_json /;
1367 use Text::Tradition;
1369 my $t = Text::Tradition->new(
1371 'file' => 't/data/florilegium_graphml.xml' );
1372 my $c = $t->collation;
1374 # Make a connection so we can test rank preservation
1375 $c->add_relationship( 'w91', 'w92', { type => 'grammatical' } );
1377 # Create an adjacency list of the whole thing; test the output.
1378 my $adj_whole = from_json( $c->as_adjacency_list() );
1379 is( scalar @$adj_whole, scalar $c->readings(),
1380 "Same number of nodes in graph and adjacency list" );
1381 my @adj_whole_edges;
1382 map { push( @adj_whole_edges, @{$_->{adjacent}} ) } @$adj_whole;
1383 is( scalar @adj_whole_edges, scalar $c->sequence->edges,
1384 "Same number of edges in graph and adjacency list" );
1385 # Find the reading whose rank should be preserved
1386 my( $test_rdg ) = grep { $_->{id} eq 'w89' } @$adj_whole;
1387 my( $test_edge ) = grep { $_->{id} eq 'w92' } @{$test_rdg->{adjacent}};
1388 is( $test_edge->{minlen}, 2, "Rank of test reading is preserved" );
1390 # Now create an adjacency list of just a portion. w76 to w122
1391 my $adj_part = from_json( $c->as_adjacency_list(
1392 { from => $c->reading('w76')->rank,
1393 to => $c->reading('w122')->rank }));
1394 is( scalar @$adj_part, 48, "Correct number of nodes in partial graph" );
1396 map { push( @adj_part_edges, @{$_->{adjacent}} ) } @$adj_part;
1397 is( scalar @adj_part_edges, 58,
1398 "Same number of edges in partial graph and adjacency list" );
1399 # Check for consistency
1401 map { $part_nodes{$_->{id}} = 1 } @$adj_part;
1402 foreach my $edge ( @adj_part_edges ) {
1403 my $testid = $edge->{id};
1404 ok( $part_nodes{$testid}, "ID $testid referenced in edge is given as node" );
1411 sub as_adjacency_list {
1412 my( $self, $opts ) = @_;
1413 # Make a structure that contains all the nodes, the nodes they point to,
1414 # and the attributes of the edges that connect them.
1415 # [ { id: 'n0', label: 'Gallia', adjacent: [
1416 # { id: 'n1', label: 'P Q' } ,
1417 # { id: 'n2', label: 'R S', minlen: 2 } ] },
1418 # { id: 'n1', label: 'est', adjacent: [ ... ] },
1420 my $startrank = $opts->{'from'} || 0;
1421 my $endrank = $opts->{'to'} || $self->end->rank;
1423 $self->calculate_ranks()
1424 unless( $self->_graphcalc_done || $opts->{'nocalc'} || !$self->linear );
1426 foreach my $rdg ( $self->readings ) {
1429 # Figure out what the node's successors should be.
1430 if( $rdg eq $self->start && $startrank > 0 ) {
1431 # Connect the start node with all the nodes at startrank.
1432 # Lacunas should be included only if the node really has that rank.
1433 @successors = $self->readings_at_rank( $startrank, 1 );
1435 } elsif( $rdg->rank < $startrank
1436 || $rdg->rank > $endrank && $rdg ne $self->end ) {
1439 @successors = $rdg->successors;
1441 # Make sure that the end node is at the end of the successors
1442 # list if it is needed.
1443 if( grep { $_ eq $self->end } @successors ) {
1444 my @ts = grep { $_ ne $self->end } @successors;
1445 @successors = ( @ts, $self->end );
1446 } elsif ( grep { $_->rank > $endrank } @successors ) {
1447 push( @successors, $self->end );
1450 my $listitem = { id => $rdg->id, label => $rdg->text };
1453 foreach my $succ ( @successors ) {
1455 if( $phony eq 'start' ) {
1456 @edgewits = $succ->witnesses;
1457 } elsif( $self->sequence->has_edge( $rdg->id, $succ->id ) ) {
1458 @edgewits = $self->path_witnesses( $rdg->id, $succ->id );
1461 if( $succ eq $self->end ) {
1462 @edgewits = @endwits;
1463 } elsif( $succ->rank > $endrank ) {
1464 # These witnesses will point to 'end' instead, not to the
1466 push( @endwits, @edgewits );
1469 my $edgelabel = $self->_path_display_label( $opts, @edgewits );
1470 my $edgedef = { id => $succ->id, label => $edgelabel };
1471 my $rankoffset = $succ->rank - $rdg->rank;
1472 if( $rankoffset > 1 and $succ ne $self->end ) {
1473 $edgedef->{minlen} = $rankoffset;
1475 push( @$adjacent, $edgedef );
1477 $listitem->{adjacent} = $adjacent;
1478 push( @$list, $listitem );
1480 return to_json( $list );
1485 Returns a GraphML representation of the collation. The GraphML will contain
1486 two graphs. The first expresses the attributes of the readings and the witness
1487 paths that link them; the second expresses the relationships that link the
1488 readings. This is the native transfer format for a tradition.
1492 use Text::Tradition;
1498 my $datafile = 't/data/florilegium_tei_ps.xml';
1499 my $tradition = Text::Tradition->new( 'input' => 'TEI',
1501 'file' => $datafile,
1504 ok( $tradition, "Got a tradition object" );
1505 is( scalar $tradition->witnesses, 13, "Found all witnesses" );
1506 ok( $tradition->collation, "Tradition has a collation" );
1508 my $c = $tradition->collation;
1509 is( scalar $c->readings, $READINGS, "Collation has all readings" );
1510 is( scalar $c->paths, $PATHS, "Collation has all paths" );
1511 is( scalar $c->relationships, 0, "Collation has all relationships" );
1513 # Add a few relationships
1514 $c->add_relationship( 'w123', 'w125', { 'type' => 'collated' } );
1515 $c->add_relationship( 'w193', 'w196', { 'type' => 'collated' } );
1516 $c->add_relationship( 'w257', 'w262', { 'type' => 'transposition',
1517 'is_significant' => 'yes' } );
1519 # Now write it to GraphML and parse it again.
1521 my $graphml = $c->as_graphml;
1522 my $st = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml );
1523 is( scalar $st->collation->readings, $READINGS, "Reparsed collation has all readings" );
1524 is( scalar $st->collation->paths, $PATHS, "Reparsed collation has all paths" );
1525 is( scalar $st->collation->relationships, 3, "Reparsed collation has new relationships" );
1526 my $sigrel = $st->collation->get_relationship( 'w257', 'w262' );
1527 is( $sigrel->is_significant, 'yes', "Ternary attribute value was restored" );
1529 # Now add a stemma, write to GraphML, and look at the output.
1531 skip "Analysis module not present", 3 unless $tradition->can( 'add_stemma' );
1532 my $stemma = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
1533 is( ref( $stemma ), 'Text::Tradition::Stemma', "Parsed dotfile into stemma" );
1534 is( $tradition->stemmata, 1, "Tradition now has the stemma" );
1535 $graphml = $c->as_graphml;
1536 like( $graphml, qr/digraph/, "Digraph declaration exists in GraphML" );
1543 ## TODO MOVE this to Tradition.pm and modularize it better
1545 my( $self, $options ) = @_;
1546 $self->calculate_ranks unless $self->_graphcalc_done;
1548 my $start = $options->{'from'}
1549 ? $self->reading( $options->{'from'} ) : $self->start;
1550 my $end = $options->{'to'}
1551 ? $self->reading( $options->{'to'} ) : $self->end;
1552 if( $start->has_rank && $end->has_rank && $end->rank < $start->rank ) {
1553 throw( 'Start node must be before end node' );
1555 # The readings need to be ranked for this to work.
1556 $start = $self->start unless $start->has_rank;
1557 $end = $self->end unless $end->has_rank;
1559 unless( $start eq $self->start ) {
1560 $rankoffset = $start->rank - 1;
1565 my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns';
1566 my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance';
1567 my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' .
1568 'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
1570 # Create the document and root node
1571 require XML::LibXML;
1572 my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
1573 my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
1574 $graphml->setDocumentElement( $root );
1575 $root->setNamespace( $xsi_ns, 'xsi', 0 );
1576 $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
1578 # List of attribute types to save on our objects and their corresponding
1583 'Bool' => 'boolean',
1584 'ReadingID' => 'string',
1585 'RelationshipType' => 'string',
1586 'RelationshipScope' => 'string',
1587 'Ternary' => 'string',
1590 # Add the data keys for the graph. Include an extra key 'version' for the
1591 # GraphML output version.
1592 my %graph_data_keys;
1594 my %graph_attributes = ( 'version' => 'string' );
1595 # Graph attributes include those of Tradition and those of Collation.
1597 # TODO Use meta introspection method from duplicate_reading to do this
1598 # instead of naming custom keys.
1599 my $tmeta = $self->tradition->meta;
1600 my $cmeta = $self->meta;
1601 map { $gattr_from{$_->name} = 'Tradition' } $tmeta->get_all_attributes;
1602 map { $gattr_from{$_->name} = 'Collation' } $cmeta->get_all_attributes;
1603 foreach my $attr ( ( $tmeta->get_all_attributes, $cmeta->get_all_attributes ) ) {
1604 next if $attr->name =~ /^_/;
1605 next unless $save_types{$attr->type_constraint->name};
1606 $graph_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1608 # Extra custom keys for complex objects that should be saved in some form.
1609 # The subroutine should return a string, or undef/empty.
1610 if( $tmeta->has_method('stemmata') ) {
1611 $graph_attributes{'stemmata'} = sub {
1613 map { push( @stemstrs, $_->editable( {linesep => ''} ) ) }
1614 $self->tradition->stemmata;
1615 join( "\n", @stemstrs );
1619 if( $tmeta->has_method('user') ) {
1620 $graph_attributes{'user'} = sub {
1621 $self->tradition->user ? $self->tradition->user->id : undef
1625 foreach my $datum ( sort keys %graph_attributes ) {
1626 $graph_data_keys{$datum} = 'dg'.$gdi++;
1627 my $key = $root->addNewChild( $graphml_ns, 'key' );
1628 my $dtype = ref( $graph_attributes{$datum} ) ? 'string'
1629 : $graph_attributes{$datum};
1630 $key->setAttribute( 'attr.name', $datum );
1631 $key->setAttribute( 'attr.type', $dtype );
1632 $key->setAttribute( 'for', 'graph' );
1633 $key->setAttribute( 'id', $graph_data_keys{$datum} );
1636 # Add the data keys for reading nodes
1637 my %reading_attributes;
1638 my $rmeta = Text::Tradition::Collation::Reading->meta;
1639 foreach my $attr( $rmeta->get_all_attributes ) {
1640 next if $attr->name =~ /^_/;
1641 next unless $save_types{$attr->type_constraint->name};
1642 $reading_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1644 if( $self->start->does('Text::Tradition::Morphology' ) ) {
1645 # Extra custom key for the reading morphology
1646 $reading_attributes{'lexemes'} = 'string';
1651 foreach my $datum ( sort keys %reading_attributes ) {
1652 $node_data_keys{$datum} = 'dn'.$ndi++;
1653 my $key = $root->addNewChild( $graphml_ns, 'key' );
1654 $key->setAttribute( 'attr.name', $datum );
1655 $key->setAttribute( 'attr.type', $reading_attributes{$datum} );
1656 $key->setAttribute( 'for', 'node' );
1657 $key->setAttribute( 'id', $node_data_keys{$datum} );
1660 # Add the data keys for edges, that is, paths and relationships. Path
1661 # data does not come from a Moose class so is here manually.
1664 my %edge_attributes = (
1665 witness => 'string', # ID/label for a path
1666 extra => 'boolean', # Path key
1668 my @path_attributes = keys %edge_attributes; # track our manual additions
1669 my $pmeta = Text::Tradition::Collation::Relationship->meta;
1670 foreach my $attr( $pmeta->get_all_attributes ) {
1671 next if $attr->name =~ /^_/;
1672 next unless $save_types{$attr->type_constraint->name};
1673 $edge_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1675 foreach my $datum ( sort keys %edge_attributes ) {
1676 $edge_data_keys{$datum} = 'de'.$edi++;
1677 my $key = $root->addNewChild( $graphml_ns, 'key' );
1678 $key->setAttribute( 'attr.name', $datum );
1679 $key->setAttribute( 'attr.type', $edge_attributes{$datum} );
1680 $key->setAttribute( 'for', 'edge' );
1681 $key->setAttribute( 'id', $edge_data_keys{$datum} );
1684 # Add the collation graph itself. First, sanitize the name to a valid XML ID.
1685 my $xmlidname = $self->tradition->name;
1686 $xmlidname =~ s/(?!$xml10_namechar_rx)./_/g;
1687 if( $xmlidname !~ /^$xml10_namestartchar_rx/ ) {
1688 $xmlidname = '_'.$xmlidname;
1690 my $sgraph = $root->addNewChild( $graphml_ns, 'graph' );
1691 $sgraph->setAttribute( 'edgedefault', 'directed' );
1692 $sgraph->setAttribute( 'id', $xmlidname );
1693 $sgraph->setAttribute( 'parse.edgeids', 'canonical' );
1694 $sgraph->setAttribute( 'parse.edges', 0 ); # fill in later
1695 $sgraph->setAttribute( 'parse.nodeids', 'canonical' );
1696 $sgraph->setAttribute( 'parse.nodes', 0 ); # fill in later
1697 $sgraph->setAttribute( 'parse.order', 'nodesfirst' );
1699 # Tradition/collation attribute data
1700 foreach my $datum ( keys %graph_attributes ) {
1702 if( $datum eq 'version' ) {
1704 } elsif( ref( $graph_attributes{$datum} ) ) {
1705 my $sub = $graph_attributes{$datum};
1707 } elsif( $gattr_from{$datum} eq 'Tradition' ) {
1708 $value = $self->tradition->$datum;
1710 $value = $self->$datum;
1712 _add_graphml_data( $sgraph, $graph_data_keys{$datum}, $value );
1717 # Add our readings to the graph
1718 foreach my $n ( sort { $a->id cmp $b->id } $self->readings ) {
1719 next if $n->has_rank && $n ne $self->start && $n ne $self->end &&
1720 ( $n->rank < $start->rank || $n->rank > $end->rank );
1721 $use_readings{$n->id} = 1;
1722 # Add to the main graph
1723 my $node_el = $sgraph->addNewChild( $graphml_ns, 'node' );
1724 my $node_xmlid = 'n' . $node_ctr++;
1725 $node_hash{ $n->id } = $node_xmlid;
1726 $node_el->setAttribute( 'id', $node_xmlid );
1727 foreach my $d ( keys %reading_attributes ) {
1729 # Custom serialization
1730 if( $d eq 'lexemes' ) {
1731 # If nval is a true value, we have lexemes so we need to
1732 # serialize them. Otherwise set nval to undef so that the
1733 # key is excluded from this reading.
1734 $nval = $nval ? $n->_serialize_lexemes : undef;
1735 } elsif( $d eq 'normal_form' && $n->normal_form eq $n->text ) {
1738 if( $rankoffset && $d eq 'rank' && $n ne $self->start ) {
1739 # Adjust the ranks within the subgraph.
1740 $nval = $n eq $self->end ? $end->rank - $rankoffset + 1
1741 : $nval - $rankoffset;
1743 _add_graphml_data( $node_el, $node_data_keys{$d}, $nval )
1748 # Add the path edges to the sequence graph
1750 foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->sequence->edges() ) {
1751 # We add an edge in the graphml for every witness in $e.
1752 next unless( $use_readings{$e->[0]} || $use_readings{$e->[1]} );
1753 my @edge_wits = sort $self->path_witnesses( $e );
1754 $e->[0] = $self->start->id unless $use_readings{$e->[0]};
1755 $e->[1] = $self->end->id unless $use_readings{$e->[1]};
1756 # Skip any path from start to end; that witness is not in the subgraph.
1757 next if ( $e->[0] eq $self->start->id && $e->[1] eq $self->end->id );
1758 foreach my $wit ( @edge_wits ) {
1759 my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
1760 $node_hash{ $e->[0] },
1761 $node_hash{ $e->[1] } );
1762 my $edge_el = $sgraph->addNewChild( $graphml_ns, 'edge' );
1763 $edge_el->setAttribute( 'source', $from );
1764 $edge_el->setAttribute( 'target', $to );
1765 $edge_el->setAttribute( 'id', $id );
1767 # It's a witness path, so add the witness
1769 my $key = $edge_data_keys{'witness'};
1770 # Is this an ante-corr witness?
1771 my $aclabel = $self->ac_label;
1772 if( $wit =~ /^(.*)\Q$aclabel\E$/ ) {
1773 # Keep the base witness
1775 # ...and record that this is an 'extra' reading path
1776 _add_graphml_data( $edge_el, $edge_data_keys{'extra'}, $aclabel );
1778 _add_graphml_data( $edge_el, $edge_data_keys{'witness'}, $base );
1782 # Report the actual number of nodes and edges that went in
1783 $sgraph->setAttribute( 'parse.edges', $edge_ctr );
1784 $sgraph->setAttribute( 'parse.nodes', $node_ctr );
1786 # Add the relationship graph to the XML
1787 map { delete $edge_data_keys{$_} } @path_attributes;
1788 $self->relations->_as_graphml( $graphml_ns, $root, \%node_hash,
1789 $node_data_keys{'id'}, \%edge_data_keys );
1791 # Save and return the thing
1792 my $result = decode_utf8( $graphml->toString(1) );
1796 sub _add_graphml_data {
1797 my( $el, $key, $value ) = @_;
1798 return unless defined $value;
1799 my $data_el = $el->addNewChild( $el->namespaceURI, 'data' );
1800 $data_el->setAttribute( 'key', $key );
1801 $data_el->appendText( $value );
1806 Returns a CSV alignment table representation of the collation graph, one
1807 row per witness (or witness uncorrected.)
1811 Returns a tab-separated alignment table representation of the collation graph,
1812 one row per witness (or witness uncorrected.)
1816 use Text::Tradition;
1824 my $datafile = 't/data/florilegium_tei_ps.xml';
1825 my $tradition = Text::Tradition->new( 'input' => 'TEI',
1827 'file' => $datafile,
1830 my $c = $tradition->collation;
1831 # Export the thing to CSV
1832 my $csvstr = $c->as_csv();
1834 my $csv = Text::CSV->new({ sep_char => ',', binary => 1 });
1835 my @lines = split(/\n/, $csvstr );
1836 ok( $csv->parse( $lines[0] ), "Successfully parsed first line of CSV" );
1837 is( scalar( $csv->fields ), $WITS + $WITAC, "CSV has correct number of witness columns" );
1838 my @q_ac = grep { $_ eq 'Q'.$c->ac_label } $csv->fields;
1839 ok( @q_ac, "Found a layered witness" );
1841 my $t2 = Text::Tradition->new( input => 'Tabular',
1845 is( scalar $t2->collation->readings, $READINGS, "Reparsed CSV collation has all readings" );
1846 is( scalar $t2->collation->paths, $PATHS, "Reparsed CSV collation has all paths" );
1848 # Now do it with TSV
1849 my $tsvstr = $c->as_tsv();
1850 my $t3 = Text::Tradition->new( input => 'Tabular',
1854 is( scalar $t3->collation->readings, $READINGS, "Reparsed TSV collation has all readings" );
1855 is( scalar $t3->collation->paths, $PATHS, "Reparsed TSV collation has all paths" );
1857 my $table = $c->alignment_table;
1858 my $noaccsv = $c->as_csv({ noac => 1 });
1859 my @noaclines = split(/\n/, $noaccsv );
1860 ok( $csv->parse( $noaclines[0] ), "Successfully parsed first line of no-ac CSV" );
1861 is( scalar( $csv->fields ), $WITS, "CSV has correct number of witness columns" );
1862 is( $c->alignment_table, $table, "Request for CSV did not alter the alignment table" );
1864 my $safecsv = $c->as_csv({ safe_ac => 1});
1865 my @safelines = split(/\n/, $safecsv );
1866 ok( $csv->parse( $safelines[0] ), "Successfully parsed first line of safe CSV" );
1867 is( scalar( $csv->fields ), $WITS + $WITAC, "CSV has correct number of witness columns" );
1868 @q_ac = grep { $_ eq 'Q__L' } $csv->fields;
1869 ok( @q_ac, "Found a sanitized layered witness" );
1870 is( $c->alignment_table, $table, "Request for CSV did not alter the alignment table" );
1872 # Test relationship collapse
1873 $c->add_relationship( $c->readings_at_rank( 37 ), { type => 'spelling' } );
1874 $c->add_relationship( $c->readings_at_rank( 60 ), { type => 'spelling' } );
1876 my $mergedtsv = $c->as_tsv({mergetypes => [ 'spelling', 'orthographic' ] });
1877 my $t4 = Text::Tradition->new( input => 'Tabular',
1879 string => $mergedtsv,
1881 is( scalar $t4->collation->readings, $READINGS - 2, "Reparsed TSV merge collation has fewer readings" );
1882 is( scalar $t4->collation->paths, $PATHS - 4, "Reparsed TSV merge collation has fewer paths" );
1884 # Test non-ASCII sigla
1885 my $t5 = Text::Tradition->new( input => 'Tabular',
1887 file => 't/data/armexample.xlsx',
1889 my $awittsv = $t5->collation->as_tsv({ noac => 1, ascii => 1 });
1890 my @awitlines = split( /\n/, $awittsv );
1891 like( $awitlines[0], qr/_A_5315622/, "Found ASCII sigil variant in TSV" );
1898 my( $self, $opts ) = @_;
1899 my $table = $self->alignment_table( $opts );
1900 my $csv_options = { binary => 1, quote_null => 0 };
1901 $csv_options->{'sep_char'} = $opts->{fieldsep};
1902 if( $opts->{fieldsep} eq "\t" ) {
1903 # If it is really tab separated, nothing is an escape char.
1904 $csv_options->{'quote_char'} = undef;
1905 $csv_options->{'escape_char'} = '';
1907 my $csv = Text::CSV->new( $csv_options );
1910 # Make the header row
1911 my @witnesses = map { $_->{'witness'} } @{$table->{'alignment'}};
1912 if( $opts->{ascii} ) {
1913 # TODO think of a fix for this
1914 throw( "Cannot currently produce ASCII sigla with witness layers" )
1915 unless $opts->{noac};
1916 my @awits = map { $self->tradition->witness( $_ )->ascii_sigil } @witnesses;
1917 @witnesses = @awits;
1919 $csv->combine( @witnesses );
1920 push( @result, $csv->string );
1922 # Make the rest of the rows
1923 foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
1924 my @rowobjs = map { $_->{'tokens'}->[$idx] } @{$table->{'alignment'}};
1925 my @row = map { $_ ? $_->{'t'}->text : $_ } @rowobjs;
1926 # Quick and dirty collapse of requested relationship types
1927 if( ref( $opts->{mergetypes} ) eq 'ARRAY' ) {
1928 # Now substitute the reading in the relevant index of @row
1929 # for its merge-related reading
1932 my $thisr = shift @rowobjs;
1934 next if exists $substitutes{$thisr->{t}->text};
1935 # Make sure we don't have A <-> B substitutions.
1936 $substitutes{$thisr->{t}->text} = $thisr->{t}->text;
1937 foreach my $thatr ( @rowobjs ) {
1939 next if exists $substitutes{$thatr->{t}->text};
1940 my $ttrel = $self->get_relationship( $thisr->{t}, $thatr->{t} );
1942 next unless grep { $ttrel->type eq $_ } @{$opts->{mergetypes}};
1943 # If we have got this far then we need to merge them.
1944 $substitutes{$thatr->{t}->text} = $thisr->{t}->text;
1947 @row = map { $_ && exists $substitutes{$_} ? $substitutes{$_} : $_ } @row;
1949 $csv->combine( @row );
1950 push( @result, $csv->string );
1952 return join( "\n", @result );
1957 my $opts = shift || {};
1958 $opts->{fieldsep} = ',';
1959 return $self->_tabular( $opts );
1964 my $opts = shift || {};
1965 $opts->{fieldsep} = "\t";
1966 return $self->_tabular( $opts );
1969 =head2 alignment_table
1971 Return a reference to an alignment table, in a slightly enhanced CollateX
1972 format which looks like this:
1974 $table = { alignment => [ { witness => "SIGIL",
1975 tokens => [ { t => "TEXT" }, ... ] },
1976 { witness => "SIG2",
1977 tokens => [ { t => "TEXT" }, ... ] },
1979 length => TEXTLEN };
1983 sub alignment_table {
1984 my( $self, $opts ) = @_;
1985 if( $self->has_cached_table ) {
1986 return $self->cached_table
1987 unless $opts->{noac} || $opts->{safe_ac};
1990 # Make sure we can do this
1991 throw( "Need a linear graph in order to make an alignment table" )
1992 unless $self->linear;
1993 $self->calculate_ranks()
1994 unless $self->_graphcalc_done && $self->end->has_rank;
1996 my $table = { 'alignment' => [], 'length' => $self->end->rank - 1 };
1997 my @all_pos = ( 1 .. $self->end->rank - 1 );
1998 foreach my $wit ( sort { $a->sigil cmp $b->sigil } $self->tradition->witnesses ) {
1999 # say STDERR "Making witness row(s) for " . $wit->sigil;
2000 my @wit_path = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
2001 my @row = _make_witness_row( \@wit_path, \@all_pos );
2002 my $witobj = { 'witness' => $wit->sigil, 'tokens' => \@row };
2003 $witobj->{'identifier'} = $wit->identifier if $wit->identifier;
2004 push( @{$table->{'alignment'}}, $witobj );
2005 if( $wit->is_layered && !$opts->{noac} ) {
2006 my @wit_ac_path = $self->reading_sequence( $self->start, $self->end,
2007 $wit->sigil.$self->ac_label );
2008 my @ac_row = _make_witness_row( \@wit_ac_path, \@all_pos );
2009 my $witlabel = $opts->{safe_ac}
2010 ? $wit->sigil . '__L' : $wit->sigil.$self->ac_label;
2011 my $witacobj = { 'witness' => $witlabel,
2012 'tokens' => \@ac_row };
2013 $witacobj->{'identifier'} = $wit->identifier if $wit->identifier;
2014 push( @{$table->{'alignment'}}, $witacobj );
2017 unless( $opts->{noac} || $opts->{safe_ac} ) {
2018 $self->cached_table( $table );
2023 sub _make_witness_row {
2024 my( $path, $positions ) = @_;
2026 map { $char_hash{$_} = undef } @$positions;
2028 foreach my $rdg ( @$path ) {
2029 say STDERR "rank " . $rdg->rank if $debug;
2030 # say STDERR "No rank for " . $rdg->id unless defined $rdg->rank;
2031 $char_hash{$rdg->rank} = { 't' => $rdg };
2033 my @row = map { $char_hash{$_} } @$positions;
2034 # Fill in lacuna markers for undef spots in the row
2035 my $last_el = shift @row;
2036 my @filled_row = ( $last_el );
2037 foreach my $el ( @row ) {
2038 # If we are using node reference, make the lacuna node appear many times
2039 # in the table. If not, use the lacuna tag.
2040 if( $last_el && $last_el->{'t'}->is_lacuna && !defined $el ) {
2043 push( @filled_row, $el );
2050 =head1 NAVIGATION METHODS
2052 =head2 reading_sequence( $first, $last, $sigil, $backup )
2054 Returns the ordered list of readings, starting with $first and ending
2055 with $last, for the witness given in $sigil. If a $backup sigil is
2056 specified (e.g. when walking a layered witness), it will be used wherever
2057 no $sigil path exists. If there is a base text reading, that will be
2058 used wherever no path exists for $sigil or $backup.
2062 # TODO Think about returning some lazy-eval iterator.
2063 # TODO Get rid of backup; we should know from what witness is whether we need it.
2065 sub reading_sequence {
2066 my( $self, $start, $end, $witness ) = @_;
2068 $witness = $self->baselabel unless $witness;
2069 my @readings = ( $start );
2072 while( $n && $n->id ne $end->id ) {
2073 if( exists( $seen{$n->id} ) ) {
2074 throw( "Detected loop for $witness at " . $n->id );
2078 my $next = $self->next_reading( $n, $witness );
2080 throw( "Did not find any path for $witness from reading " . $n->id );
2082 push( @readings, $next );
2085 # Check that the last reading is our end reading.
2086 my $last = $readings[$#readings];
2087 throw( "Last reading found from " . $start->text .
2088 " for witness $witness is not the end!" ) # TODO do we get this far?
2089 unless $last->id eq $end->id;
2094 =head2 readings_at_rank( $rank )
2096 Returns a list of readings at a given rank, taken from the alignment table.
2100 sub readings_at_rank {
2101 my( $self, $rank, $nolacuna ) = @_;
2102 my $table = $self->alignment_table;
2103 # Table rank is real rank - 1.
2104 my @elements = map { $_->{'tokens'}->[$rank-1] } @{$table->{'alignment'}};
2106 foreach my $e ( @elements ) {
2107 next unless ref( $e ) eq 'HASH';
2108 next unless exists $e->{'t'};
2109 my $rdg = $e->{'t'};
2110 next if $nolacuna && $rdg->is_lacuna && $rdg->rank ne $rank;
2111 $readings{$e->{'t'}->id} = $e->{'t'};
2113 return values %readings;
2116 =head2 next_reading( $reading, $sigil );
2118 Returns the reading that follows the given reading along the given witness
2124 # Return the successor via the corresponding path.
2126 my $answer = $self->_find_linked_reading( 'next', @_ );
2127 return undef unless $answer;
2128 return $self->reading( $answer );
2131 =head2 prior_reading( $reading, $sigil )
2133 Returns the reading that precedes the given reading along the given witness
2139 # Return the predecessor via the corresponding path.
2141 my $answer = $self->_find_linked_reading( 'prior', @_ );
2142 return $self->reading( $answer );
2145 sub _find_linked_reading {
2146 my( $self, $direction, $node, $path ) = @_;
2148 # Get a backup if we are dealing with a layered witness
2150 my $aclabel = $self->ac_label;
2151 if( $path && $path =~ /^(.*)\Q$aclabel\E$/ ) {
2155 my @linked_paths = $direction eq 'next'
2156 ? $self->sequence->edges_from( $node )
2157 : $self->sequence->edges_to( $node );
2158 return undef unless scalar( @linked_paths );
2160 # We have to find the linked path that contains all of the
2161 # witnesses supplied in $path.
2162 my( @path_wits, @alt_path_wits );
2163 @path_wits = sort( $self->_witnesses_of_label( $path ) ) if $path;
2164 @alt_path_wits = sort( $self->_witnesses_of_label( $alt_path ) ) if $alt_path;
2167 foreach my $le ( @linked_paths ) {
2168 if( $self->sequence->has_edge_attribute( @$le, $self->baselabel ) ) {
2171 my @le_wits = sort $self->path_witnesses( $le );
2172 if( _is_within( \@path_wits, \@le_wits ) ) {
2173 # This is the right path.
2174 return $direction eq 'next' ? $le->[1] : $le->[0];
2175 } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) {
2179 # Got this far? Return the alternate path if it exists.
2180 return $direction eq 'next' ? $alt_le->[1] : $alt_le->[0]
2183 # Got this far? Return the base path if it exists.
2184 return $direction eq 'next' ? $base_le->[1] : $base_le->[0]
2187 # Got this far? We have no appropriate path.
2188 warn "Could not find $direction node from " . $node->id
2189 . " along path $path";
2195 my( $set1, $set2 ) = @_;
2196 my $ret = @$set1; # will be 0, i.e. false, if set1 is empty
2197 foreach my $el ( @$set1 ) {
2198 $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
2203 # Return the string that joins together a list of witnesses for
2204 # display on a single path.
2205 sub _witnesses_of_label {
2206 my( $self, $label ) = @_;
2207 my $regex = $self->wit_list_separator;
2208 my @answer = split( /\Q$regex\E/, $label );
2212 =head2 common_readings
2214 Returns the list of common readings in the graph (i.e. those readings that are
2215 shared by all non-lacunose witnesses.)
2219 sub common_readings {
2221 my @common = grep { $_->is_common } $self->readings;
2225 =head2 path_text( $sigil, [, $start, $end ] )
2227 Returns the text of a witness (plus its backup, if we are using a layer)
2228 as stored in the collation. The text is returned as a string, where the
2229 individual readings are joined with spaces and the meta-readings (e.g.
2230 lacunae) are omitted. Optional specification of $start and $end allows
2231 the generation of a subset of the witness text.
2236 my( $self, $wit, $start, $end ) = @_;
2237 $start = $self->start unless $start;
2238 $end = $self->end unless $end;
2239 my @path = grep { !$_->is_meta } $self->reading_sequence( $start, $end, $wit );
2242 foreach my $r ( @path ) {
2243 unless ( $r->join_prior || !$last || $last->join_next ) {
2246 $pathtext .= $r->text;
2252 =head1 INITIALIZATION METHODS
2254 These are mostly for use by parsers.
2256 =head2 make_witness_path( $witness )
2258 Link the array of readings contained in $witness->path (and in
2259 $witness->uncorrected_path if it exists) into collation paths.
2260 Clear out the arrays when finished.
2262 =head2 make_witness_paths
2264 Call make_witness_path for all witnesses in the tradition.
2268 # For use when a collation is constructed from a base text and an apparatus.
2269 # We have the sequences of readings and just need to add path edges.
2270 # When we are done, clear out the witness path attributes, as they are no
2272 # TODO Find a way to replace the witness path attributes with encapsulated functions?
2274 sub make_witness_paths {
2276 foreach my $wit ( $self->tradition->witnesses ) {
2277 # say STDERR "Making path for " . $wit->sigil;
2278 $self->make_witness_path( $wit );
2282 sub make_witness_path {
2283 my( $self, $wit ) = @_;
2284 my @chain = @{$wit->path};
2285 my $sig = $wit->sigil;
2286 # Add start and end if necessary
2287 unshift( @chain, $self->start ) unless $chain[0] eq $self->start;
2288 push( @chain, $self->end ) unless $chain[-1] eq $self->end;
2289 foreach my $idx ( 0 .. $#chain-1 ) {
2290 $self->add_path( $chain[$idx], $chain[$idx+1], $sig );
2292 if( $wit->is_layered ) {
2293 @chain = @{$wit->uncorrected_path};
2294 unshift( @chain, $self->start ) unless $chain[0] eq $self->start;
2295 push( @chain, $self->end ) unless $chain[-1] eq $self->end;
2296 foreach my $idx( 0 .. $#chain-1 ) {
2297 my $source = $chain[$idx];
2298 my $target = $chain[$idx+1];
2299 $self->add_path( $source, $target, $sig.$self->ac_label )
2300 unless $self->has_path( $source, $target, $sig );
2304 $wit->clear_uncorrected_path;
2307 =head2 calculate_ranks
2309 Calculate the reading ranks (that is, their aligned positions relative
2310 to each other) for the graph. This can only be called on linear collations.
2314 use Text::Tradition;
2316 my $cxfile = 't/data/Collatex-16.xml';
2317 my $t = Text::Tradition->new(
2319 'input' => 'CollateX',
2322 my $c = $t->collation;
2325 my $table = $c->alignment_table;
2326 ok( $c->has_cached_table, "Alignment table was cached" );
2327 is( $c->alignment_table, $table, "Cached table returned upon second call" );
2328 $c->calculate_ranks;
2329 is( $c->alignment_table, $table, "Cached table retained with no rank change" );
2330 $c->add_relationship( 'n13', 'n23', { type => 'repetition' } );
2331 is( $c->alignment_table, $table, "Alignment table unchanged after non-colo relationship add" );
2332 $c->add_relationship( 'n24', 'n23', { type => 'spelling' } );
2333 isnt( $c->alignment_table, $table, "Alignment table changed after colo relationship add" );
2339 sub calculate_ranks {
2341 # Save the existing ranks, in case we need to invalidate the cached SVG.
2342 throw( "Cannot calculate ranks on a non-linear graph" )
2343 unless $self->linear;
2345 map { $existing_ranks{$_} = $_->rank } $self->readings;
2347 # Do the rankings based on the relationship equivalence graph, starting
2348 # with the start node.
2349 my ( $node_ranks, $rank_nodes ) = $self->relations->equivalence_ranks();
2351 # Transfer our rankings from the topological graph to the real one.
2352 foreach my $r ( $self->readings ) {
2353 if( defined $node_ranks->{$self->equivalence( $r->id )} ) {
2354 $r->rank( $node_ranks->{$self->equivalence( $r->id )} );
2356 # Die. Find the last rank we calculated.
2357 my @all_defined = sort { ( $node_ranks->{$self->equivalence( $a->id )}||-1 )
2358 <=> ( $node_ranks->{$self->equivalence( $b->id )}||-1 ) }
2360 my $last = pop @all_defined;
2361 throw( "Ranks not calculated after $last - do you have a cycle in the graph?" );
2364 # Do we need to invalidate the cached data?
2365 if( $self->has_cached_table ) {
2366 foreach my $r ( $self->readings ) {
2367 next if defined( $existing_ranks{$r} )
2368 && $existing_ranks{$r} == $r->rank;
2369 # Something has changed, so clear the cache
2370 $self->_clear_cache;
2371 # ...and recalculate the common readings.
2372 $self->calculate_common_readings();
2376 # The graph calculation information is now up to date.
2377 $self->_graphcalc_done(1);
2382 $self->wipe_table if $self->has_cached_table;
2386 =head2 flatten_ranks
2388 A convenience method for parsing collation data. Searches the graph for readings
2389 with the same text at the same rank, and merges any that are found.
2394 my ( $self, %args ) = shift;
2395 my %unique_rank_rdg;
2397 foreach my $p ( $self->identical_readings( %args ) ) {
2398 # say STDERR "Combining readings at same rank: @$p";
2400 $self->merge_readings( @$p );
2401 # TODO see if this now makes a common point.
2403 # If we merged readings, the ranks are still fine but the alignment
2404 # table is wrong. Wipe it.
2405 $self->wipe_table() if $changed;
2408 =head2 identical_readings
2409 =head2 identical_readings( start => $startnode, end => $endnode )
2410 =head2 identical_readings( startrank => $startrank, endrank => $endrank )
2412 Goes through the graph identifying all pairs of readings that appear to be
2413 identical, and therefore able to be merged into a single reading. Returns the
2414 relevant identical pairs. Can be restricted to run over only a part of the
2415 graph, specified either by node or by rank.
2419 sub identical_readings {
2420 my ( $self, %args ) = @_;
2421 # Find where we should start and end.
2422 my $startrank = $args{startrank} || 0;
2423 if( $args{start} ) {
2424 throw( "Starting reading has no rank" ) unless $self->reading( $args{start} )
2425 && $self->reading( $args{start} )->has_rank;
2426 $startrank = $self->reading( $args{start} )->rank;
2428 my $endrank = $args{endrank} || $self->end->rank;
2430 throw( "Ending reading has no rank" ) unless $self->reading( $args{end} )
2431 && $self->reading( $args{end} )->has_rank;
2432 $endrank = $self->reading( $args{end} )->rank;
2435 # Make sure the ranks are correct.
2436 unless( $self->_graphcalc_done ) {
2437 $self->calculate_ranks;
2439 # Go through the readings looking for duplicates.
2440 my %unique_rank_rdg;
2442 foreach my $rdg ( $self->readings ) {
2443 next unless $rdg->has_rank;
2444 my $rk = $rdg->rank;
2445 next if $rk > $endrank || $rk < $startrank;
2446 my $key = $rk . "||" . $rdg->text;
2447 if( exists $unique_rank_rdg{$key} ) {
2448 # Make sure they don't have different grammatical forms
2449 my $ur = $unique_rank_rdg{$key};
2450 if( $rdg->is_identical( $ur ) ) {
2451 push( @pairs, [ $ur, $rdg ] );
2454 $unique_rank_rdg{$key} = $rdg;
2462 =head2 calculate_common_readings
2464 Goes through the graph identifying the readings that appear in every witness
2465 (apart from those with lacunae at that spot.) Marks them as common and returns
2470 use Text::Tradition;
2472 my $cxfile = 't/data/Collatex-16.xml';
2473 my $t = Text::Tradition->new(
2475 'input' => 'CollateX',
2478 my $c = $t->collation;
2480 my @common = $c->calculate_common_readings();
2481 is( scalar @common, 8, "Found correct number of common readings" );
2482 my @marked = sort $c->common_readings();
2483 is( scalar @common, 8, "All common readings got marked as such" );
2484 my @expected = qw/ n1 n11 n16 n19 n20 n5 n6 n7 /;
2485 is_deeply( \@marked, \@expected, "Found correct list of common readings" );
2491 sub calculate_common_readings {
2494 map { $_->is_common( 0 ) } $self->readings;
2495 # Implicitly calls calculate_ranks
2496 my $table = $self->alignment_table;
2497 foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
2498 my @row = map { $_->{'tokens'}->[$idx]
2499 ? $_->{'tokens'}->[$idx]->{'t'} : '' }
2500 @{$table->{'alignment'}};
2502 foreach my $r ( @row ) {
2504 $hash{$r->id} = $r unless $r->is_meta;
2506 $hash{'UNDEF'} = $r;
2509 if( keys %hash == 1 && !exists $hash{'UNDEF'} ) {
2510 my( $r ) = values %hash;
2512 push( @common, $r );
2518 =head2 text_from_paths
2520 Calculate the text array for all witnesses from the path, for later consistency
2521 checking. Only to be used if there is no non-graph-based way to know the
2526 sub text_from_paths {
2528 foreach my $wit ( $self->tradition->witnesses ) {
2529 my @readings = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
2531 foreach my $r ( @readings ) {
2532 next if $r->is_meta;
2533 push( @text, $r->text );
2535 $wit->text( \@text );
2536 if( $wit->is_layered ) {
2537 my @ucrdgs = $self->reading_sequence( $self->start, $self->end,
2538 $wit->sigil.$self->ac_label );
2540 foreach my $r ( @ucrdgs ) {
2541 next if $r->is_meta;
2542 push( @uctext, $r->text );
2544 $wit->layertext( \@uctext );
2549 =head1 UTILITY FUNCTIONS
2551 =head2 common_predecessor( $reading_a, $reading_b )
2553 Find the last reading that occurs in sequence before both the given readings.
2554 At the very least this should be $self->start.
2556 =head2 common_successor( $reading_a, $reading_b )
2558 Find the first reading that occurs in sequence after both the given readings.
2559 At the very least this should be $self->end.
2563 use Text::Tradition;
2565 my $cxfile = 't/data/Collatex-16.xml';
2566 my $t = Text::Tradition->new(
2568 'input' => 'CollateX',
2571 my $c = $t->collation;
2573 is( $c->common_predecessor( 'n24', 'n23' )->id,
2574 'n20', "Found correct common predecessor" );
2575 is( $c->common_successor( 'n24', 'n23' )->id,
2576 '__END__', "Found correct common successor" );
2578 is( $c->common_predecessor( 'n19', 'n17' )->id,
2579 'n16', "Found correct common predecessor for readings on same path" );
2580 is( $c->common_successor( 'n21', 'n10' )->id,
2581 '__END__', "Found correct common successor for readings on same path" );
2587 ## Return the closest reading that is a predecessor of both the given readings.
2588 sub common_predecessor {
2590 my( $r1, $r2 ) = $self->_objectify_args( @_ );
2591 return $self->_common_in_path( $r1, $r2, 'predecessors' );
2594 sub common_successor {
2596 my( $r1, $r2 ) = $self->_objectify_args( @_ );
2597 return $self->_common_in_path( $r1, $r2, 'successors' );
2601 # TODO think about how to do this without ranks...
2602 sub _common_in_path {
2603 my( $self, $r1, $r2, $dir ) = @_;
2604 my $iter = $self->end->rank;
2606 my @last_r1 = ( $r1 );
2607 my @last_r2 = ( $r2 );
2608 # my %all_seen = ( $r1 => 'r1', $r2 => 'r2' );
2610 # say STDERR "Finding common $dir for $r1, $r2";
2611 while( !@candidates ) {
2612 last unless $iter--; # Avoid looping infinitely
2613 # Iterate separately down the graph from r1 and r2
2614 my( @new_lc1, @new_lc2 );
2615 foreach my $lc ( @last_r1 ) {
2616 foreach my $p ( $lc->$dir ) {
2617 if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r1' ) {
2618 # say STDERR "Path candidate $p from $lc";
2619 push( @candidates, $p );
2620 } elsif( !$all_seen{$p->id} ) {
2621 $all_seen{$p->id} = 'r1';
2622 push( @new_lc1, $p );
2626 foreach my $lc ( @last_r2 ) {
2627 foreach my $p ( $lc->$dir ) {
2628 if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r2' ) {
2629 # say STDERR "Path candidate $p from $lc";
2630 push( @candidates, $p );
2631 } elsif( !$all_seen{$p->id} ) {
2632 $all_seen{$p->id} = 'r2';
2633 push( @new_lc2, $p );
2637 @last_r1 = @new_lc1;
2638 @last_r2 = @new_lc2;
2640 my @answer = sort { $a->rank <=> $b->rank } @candidates;
2641 return $dir eq 'predecessors' ? pop( @answer ) : shift ( @answer );
2645 Text::Tradition::Error->throw(
2646 'ident' => 'Collation error',
2652 __PACKAGE__->meta->make_immutable;
2658 =item * Rework XML serialization in a more modular way
2664 This package is free software and is provided "as is" without express
2665 or implied warranty. You can redistribute it and/or modify it under
2666 the same terms as Perl itself.
2670 Tara L Andrews E<lt>aurum@cpan.orgE<gt>