1 package Text::Tradition::Collation;
4 use Encode qw( decode_utf8 );
8 use IPC::Run qw( run binary );
10 use Text::Tradition::Collation::Data;
11 use Text::Tradition::Collation::Reading;
12 use Text::Tradition::Collation::RelationshipStore;
13 use Text::Tradition::Error;
14 use XML::Easy::Syntax qw( $xml10_namestartchar_rx $xml10_namechar_rx );
16 use XML::LibXML::XPathContext;
20 isa => 'Text::Tradition::Collation::Data',
58 isa => 'Text::Tradition',
59 writer => '_set_tradition',
65 Text::Tradition::Collation - a software model for a text collation
70 my $t = Text::Tradition->new(
71 'name' => 'this is a text',
73 'file' => '/path/to/tei_parallel_seg_file.xml' );
75 my $c = $t->collation;
76 my @readings = $c->readings;
77 my @paths = $c->paths;
78 my @relationships = $c->relationships;
80 my $svg_variant_graph = $t->collation->as_svg();
84 Text::Tradition is a library for representation and analysis of collated
85 texts, particularly medieval ones. The Collation is the central feature of
86 a Tradition, where the text, its sequence of readings, and its relationships
87 between readings are actually kept.
93 The constructor. Takes a hash or hashref of the following arguments:
97 =item * tradition - The Text::Tradition object to which the collation
100 =item * linear - Whether the collation should be linear; that is, whether
101 transposed readings should be treated as two linked readings rather than one,
102 and therefore whether the collation graph is acyclic. Defaults to true.
104 =item * baselabel - The default label for the path taken by a base text
105 (if any). Defaults to 'base text'.
107 =item * wit_list_separator - The string to join a list of witnesses for
108 purposes of making labels in display graphs. Defaults to ', '.
110 =item * ac_label - The extra label to tack onto a witness sigil when
111 representing another layer of path for the given witness - that is, when
112 a text has more than one possible reading due to scribal corrections or
113 the like. Defaults to ' (a.c.)'.
115 =item * wordsep - The string used to separate words in the original text.
126 =head2 wit_list_separator
134 Simple accessors for collation attributes.
138 The meta-reading at the start of every witness path.
142 The meta-reading at the end of every witness path.
146 Returns all Reading objects in the graph.
148 =head2 reading( $id )
150 Returns the Reading object corresponding to the given ID.
152 =head2 add_reading( $reading_args )
154 Adds a new reading object to the collation.
155 See L<Text::Tradition::Collation::Reading> for the available arguments.
157 =head2 del_reading( $object_or_id )
159 Removes the given reading from the collation, implicitly removing its
160 paths and relationships.
162 =head2 merge_readings( $main, $second, $concatenate, $with_str )
164 Merges the $second reading into the $main one. If $concatenate is true, then
165 the merged node will carry the text of both readings, concatenated with either
166 $with_str (if specified) or a sensible default (the empty string if the
167 appropriate 'join_*' flag is set on either reading, or else $self->wordsep.)
169 The first two arguments may be either readings or reading IDs.
171 =head2 has_reading( $id )
173 Predicate to see whether a given reading ID is in the graph.
175 =head2 reading_witnesses( $object_or_id )
177 Returns a list of sigils whose witnesses contain the reading.
181 Returns all reading paths within the document - that is, all edges in the
182 collation graph. Each path is an arrayref of [ $source, $target ] reading IDs.
184 =head2 add_path( $source, $target, $sigil )
186 Links the given readings in the collation in sequence, under the given witness
187 sigil. The readings may be specified by object or ID.
189 =head2 del_path( $source, $target, $sigil )
191 Links the given readings in the collation in sequence, under the given witness
192 sigil. The readings may be specified by object or ID.
194 =head2 has_path( $source, $target );
196 Returns true if the two readings are linked in sequence in any witness.
197 The readings may be specified by object or ID.
201 Returns all Relationship objects in the collation.
203 =head2 add_relationship( $reading, $other_reading, $options )
205 Adds a new relationship of the type given in $options between the two readings,
206 which may be specified by object or ID. Returns a value of ( $status, @vectors)
207 where $status is true on success, and @vectors is a list of relationship edges
208 that were ultimately added.
209 See L<Text::Tradition::Collation::Relationship> for the available options.
214 my ( $class, @args ) = @_;
215 my %args = @args == 1 ? %{ $args[0] } : @args;
216 # TODO determine these from the Moose::Meta object
217 my @delegate_attrs = qw(sequence relations readings wit_list_separator baselabel
218 linear wordsep start end cached_table _graphcalc_done);
220 for my $attr (@delegate_attrs) {
221 $data_args{$attr} = delete $args{$attr} if exists $args{$attr};
223 $args{_data} = Text::Tradition::Collation::Data->new(%data_args);
229 $self->_set_relations( Text::Tradition::Collation::RelationshipStore->new( 'collation' => $self ) );
230 $self->_set_start( $self->add_reading(
231 { 'collation' => $self, 'is_start' => 1, 'init' => 1 } ) );
232 $self->_set_end( $self->add_reading(
233 { 'collation' => $self, 'is_end' => 1, 'init' => 1 } ) );
236 sub register_relationship_type {
238 my %args = @_ == 1 ? %{$_[0]} : @_;
239 if( $self->relations->has_type( $args{name} ) ) {
240 throw( 'Relationship type ' . $args{name} . ' already registered' );
242 $self->relations->add_type( %args );
245 ### Reading construct/destruct functions
248 my( $self, $reading ) = @_;
249 unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
250 my %args = %$reading;
251 if( $args{'init'} ) {
252 # If we are initializing an empty collation, don't assume that we
253 # have set a tradition.
254 delete $args{'init'};
255 } elsif( $self->tradition->can('language') && $self->tradition->has_language
256 && !exists $args{'language'} ) {
257 $args{'language'} = $self->tradition->language;
259 $reading = Text::Tradition::Collation::Reading->new(
260 'collation' => $self,
263 # First check to see if a reading with this ID exists.
264 if( $self->reading( $reading->id ) ) {
265 throw( "Collation already has a reading with id " . $reading->id );
267 $self->_graphcalc_done(0);
268 $self->_add_reading( $reading->id => $reading );
269 # Once the reading has been added, put it in both graphs.
270 $self->sequence->add_vertex( $reading->id );
271 $self->relations->add_reading( $reading->id );
275 around del_reading => sub {
280 if( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
283 # Remove the reading from the graphs.
284 $self->_graphcalc_done(0);
285 $self->_clear_cache; # Explicitly clear caches to GC the reading
286 $self->sequence->delete_vertex( $arg );
287 $self->relations->delete_reading( $arg );
290 $self->$orig( $arg );
297 my $cxfile = 't/data/Collatex-16.xml';
298 my $t = Text::Tradition->new(
300 'input' => 'CollateX',
303 my $c = $t->collation;
305 my $rno = scalar $c->readings;
306 # Split n21 ('unto') for testing purposes
307 my $new_r = $c->add_reading( { 'id' => 'n21p0', 'text' => 'un', 'join_next' => 1 } );
308 my $old_r = $c->reading( 'n21' );
309 $old_r->alter_text( 'to' );
310 $c->del_path( 'n20', 'n21', 'A' );
311 $c->add_path( 'n20', 'n21p0', 'A' );
312 $c->add_path( 'n21p0', 'n21', 'A' );
313 $c->add_relationship( 'n21', 'n22', { type => 'collated', scope => 'local' } );
315 ok( $c->reading( 'n21p0' ), "New reading exists" );
316 is( scalar $c->readings, $rno, "Reading add offset by flatten_ranks" );
318 # Combine n3 and n4 ( with his )
319 $c->merge_readings( 'n3', 'n4', 1 );
320 ok( !$c->reading('n4'), "Reading n4 is gone" );
321 is( $c->reading('n3')->text, 'with his', "Reading n3 has both words" );
323 # Collapse n9 and n10 ( rood / root )
324 $c->merge_readings( 'n9', 'n10' );
325 ok( !$c->reading('n10'), "Reading n10 is gone" );
326 is( $c->reading('n9')->text, 'rood', "Reading n9 has an unchanged word" );
328 # Combine n21 and n21p0
329 my $remaining = $c->reading('n21');
330 $remaining ||= $c->reading('n22'); # one of these should still exist
331 $c->merge_readings( 'n21p0', $remaining, 1 );
332 ok( !$c->reading('n21'), "Reading $remaining is gone" );
333 is( $c->reading('n21p0')->text, 'unto', "Reading n21p0 merged correctly" );
343 my( $kept_obj, $del_obj, $combine, $combine_char ) = $self->_objectify_args( @_ );
344 my $mergemeta = $kept_obj->is_meta;
345 throw( "Cannot merge meta and non-meta reading" )
346 unless ( $mergemeta && $del_obj->is_meta )
347 || ( !$mergemeta && !$del_obj->is_meta );
349 throw( "Cannot merge with start or end node" )
350 if( $kept_obj eq $self->start || $kept_obj eq $self->end
351 || $del_obj eq $self->start || $del_obj eq $self->end );
352 throw( "Cannot combine text of meta readings" ) if $combine;
354 # We only need the IDs for adding paths to the graph, not the reading
355 # objects themselves.
356 my $kept = $kept_obj->id;
357 my $deleted = $del_obj->id;
358 $self->_graphcalc_done(0);
360 # The kept reading should inherit the paths and the relationships
361 # of the deleted reading.
362 foreach my $path ( $self->sequence->edges_at( $deleted ) ) {
363 my @vector = ( $kept );
364 push( @vector, $path->[1] ) if $path->[0] eq $deleted;
365 unshift( @vector, $path->[0] ) if $path->[1] eq $deleted;
366 next if $vector[0] eq $vector[1]; # Don't add a self loop
367 my %wits = %{$self->sequence->get_edge_attributes( @$path )};
368 $self->sequence->add_edge( @vector );
369 my $fwits = $self->sequence->get_edge_attributes( @vector );
370 @wits{keys %$fwits} = values %$fwits;
371 $self->sequence->set_edge_attributes( @vector, \%wits );
373 $self->relations->merge_readings( $kept, $deleted, $combine );
375 # Do the deletion deed.
377 # Combine the text of the readings
378 my $joinstr = $combine_char;
379 unless( defined $joinstr ) {
380 $joinstr = '' if $kept_obj->join_next || $del_obj->join_prior;
381 $joinstr = $self->wordsep unless defined $joinstr;
383 $kept_obj->_combine( $del_obj, $joinstr );
385 $self->del_reading( $deleted );
388 =head2 compress_readings
390 Where possible in the graph, compresses plain sequences of readings into a
391 single reading. The sequences must consist of readings with no
392 relationships to other readings, with only a single witness path between
393 them and no other witness paths from either that would skip the other. The
394 readings must also not be marked as nonsense or bad grammar.
396 WARNING: This operation cannot be undone.
400 sub compress_readings {
402 # Anywhere in the graph that there is a reading that joins only to a single
403 # successor, and neither of these have any relationships, just join the two
405 foreach my $rdg ( sort { $a->rank <=> $b->rank } $self->readings ) {
406 # Now look for readings that can be joined to their successors.
407 next unless $rdg->is_combinable;
409 while( $self->sequence->successors( $rdg ) == 1 ) {
410 my( $next ) = $self->reading( $self->sequence->successors( $rdg ) );
411 throw( "Infinite loop" ) if $seen{$next->id};
412 $seen{$next->id} = 1;
413 last if $self->sequence->predecessors( $next ) > 1;
414 last unless $next->is_combinable;
415 say "Joining readings $rdg and $next";
416 $self->merge_readings( $rdg, $next, 1 );
419 # Make sure we haven't screwed anything up
420 foreach my $wit ( $self->tradition->witnesses ) {
421 my $pathtext = $self->path_text( $wit->sigil );
422 my $origtext = join( ' ', @{$wit->text} );
423 throw( "Text differs for witness " . $wit->sigil )
424 unless $pathtext eq $origtext;
425 if( $wit->is_layered ) {
426 $pathtext = $self->path_text( $wit->sigil.$self->ac_label );
427 $origtext = join( ' ', @{$wit->layertext} );
428 throw( "Ante-corr text differs for witness " . $wit->sigil )
429 unless $pathtext eq $origtext;
433 $self->relations->rebuild_equivalence();
434 $self->calculate_ranks();
437 # Helper function for manipulating the graph.
438 sub _stringify_args {
439 my( $self, $first, $second, @args ) = @_;
441 if ref( $first ) eq 'Text::Tradition::Collation::Reading';
442 $second = $second->id
443 if ref( $second ) eq 'Text::Tradition::Collation::Reading';
444 return( $first, $second, @args );
447 # Helper function for manipulating the graph.
448 sub _objectify_args {
449 my( $self, $first, $second, $arg ) = @_;
450 $first = $self->reading( $first )
451 unless ref( $first ) eq 'Text::Tradition::Collation::Reading';
452 $second = $self->reading( $second )
453 unless ref( $second ) eq 'Text::Tradition::Collation::Reading';
454 return( $first, $second, $arg );
457 =head2 duplicate_reading( $reading, @witlist )
459 Split the given reading into two, so that the new reading is in the path for
460 the witnesses given in @witlist. If the result is that certain non-colocated
461 relationships (e.g. transpositions) are no longer valid, these will be removed.
462 Returns the newly-created reading.
468 my $st = Text::Tradition->new( 'input' => 'Self', 'file' => 't/data/collatecorr.xml' );
469 is( ref( $st ), 'Text::Tradition', "Got a tradition from test file" );
470 ok( $st->has_witness('Ba96'), "Tradition has the affected witness" );
472 my $sc = $st->collation;
474 ok( $sc->reading('n131'), "Tradition has the affected reading" );
475 is( scalar( $sc->readings ), $numr, "There are $numr readings in the graph" );
476 is( $sc->end->rank, 14, "There are fourteen ranks in the graph" );
478 # Detach the erroneously collated reading
479 my $newr = $sc->duplicate_reading( 'n131', 'Ba96' );
480 ok( $newr, "New reading was created" );
481 ok( $sc->reading('n131_0'), "Detached the bad collation with a new reading" );
482 is( scalar( $sc->readings ), $numr + 1, "A reading was added to the graph" );
483 is( $sc->end->rank, 10, "There are now only ten ranks in the graph" );
485 # Check that the bad transposition is gone
486 is( $sc->get_relationship( 'n130', 'n135' ), undef, "Bad transposition relationship is gone" );
489 ok( $sc->add_relationship( 'n124', 'n131_0', { type => 'collated', scope => 'local' } ),
490 "Collated the readings correctly" );
491 $sc->calculate_ranks();
492 $sc->flatten_ranks();
493 is( $sc->end->rank, 11, "The ranks shifted appropriately" );
494 is( scalar( $sc->readings ), $numr - 3, "Now we are collated correctly" );
500 sub duplicate_reading {
501 my( $self, $r, @wits ) = @_;
502 # Add the new reading, duplicating $r.
503 unless( ref( $r ) eq 'Text::Tradition::Collation::Reading' ) {
504 $r = $self->reading( $r );
506 throw( "Cannot duplicate a meta-reading" )
509 # Get all the reading attributes and duplicate them.
510 my $rmeta = Text::Tradition::Collation::Reading->meta;
512 foreach my $attr( $rmeta->get_all_attributes ) {
513 next if $attr->name =~ /^_/;
514 my $acc = $attr->get_read_method;
515 if( !$acc && $attr->has_applied_traits ) {
516 my $tr = $attr->applied_traits;
517 if( $tr->[0] =~ /::(Array|Hash)$/ ) {
519 my %methods = reverse %{$attr->handles};
520 $acc = $methods{elements};
521 $args{$attr->name} = $which eq 'Array'
522 ? [ $r->$acc ] : { $r->$acc };
525 $args{$attr->name} = $r->$acc if $acc;
528 # By definition the new reading will no longer be common.
529 $args{is_common} = 0;
530 # The new reading also needs its own ID.
531 $args{id} = $self->_generate_dup_id( $r->id );
533 # Try to make the new reading.
534 my $newr = $self->add_reading( \%args );
535 # The old reading is also no longer common.
538 # For each of the witnesses, dissociate from the old reading and
539 # associate with the new.
540 foreach my $wit ( @wits ) {
541 my $prior = $self->prior_reading( $r, $wit );
542 my $next = $self->next_reading( $r, $wit );
543 $self->del_path( $prior, $r, $wit );
544 $self->add_path( $prior, $newr, $wit );
545 $self->del_path( $r, $next, $wit );
546 $self->add_path( $newr, $next, $wit );
549 # Hash the reading ranks and find the closest common successor to our
553 if( $self->end->has_rank ) {
554 $succ = $self->common_successor( $r, $newr );
555 foreach my $rdg ( $self->readings ) {
556 $rrk{$rdg->id} = $rdg->rank;
560 # Rebuild the equivalence graph and calculate the new ranks
561 $self->relations->rebuild_equivalence();
562 $self->calculate_ranks();
564 # Check for invalid non-colocated relationships among changed-rank readings
565 # from where the ranks start changing up to $succ
566 if( $self->end->has_rank ) {
567 my $lastrank = $succ->rank;
568 foreach my $rdg ( $self->readings ) {
569 next if $rdg->rank > $lastrank;
570 next if $rdg->rank == $rrk{$rdg->id};
571 my @noncolo = $rdg->related_readings( sub { !$_[0]->colocated } );
572 next unless @noncolo;
573 foreach my $nc ( @noncolo ) {
574 $self->relations->verify_or_delete( $rdg, $nc );
581 sub _generate_dup_id {
582 my( $self, $rid ) = @_;
587 if( $self->has_reading( $newid ) ) {
600 # We only need the IDs for adding paths to the graph, not the reading
601 # objects themselves.
602 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
604 $self->_graphcalc_done(0);
605 # Connect the readings
606 unless( $self->sequence->has_edge( $source, $target ) ) {
607 $self->sequence->add_edge( $source, $target );
608 $self->relations->add_equivalence_edge( $source, $target );
610 # Note the witness in question
611 $self->sequence->set_edge_attribute( $source, $target, $wit, 1 );
617 if( ref( $_[0] ) eq 'ARRAY' ) {
624 # We only need the IDs for removing paths from the graph, not the reading
625 # objects themselves.
626 my( $source, $target, $wit ) = $self->_stringify_args( @args );
628 $self->_graphcalc_done(0);
629 if( $self->sequence->has_edge_attribute( $source, $target, $wit ) ) {
630 $self->sequence->delete_edge_attribute( $source, $target, $wit );
632 unless( $self->sequence->has_edge_attributes( $source, $target ) ) {
633 $self->sequence->delete_edge( $source, $target );
634 $self->relations->delete_equivalence_edge( $source, $target );
639 # Extra graph-alike utility
642 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
643 return undef unless $self->sequence->has_edge( $source, $target );
644 return $self->sequence->has_edge_attribute( $source, $target, $wit );
647 =head2 clear_witness( @sigil_list )
649 Clear the given witnesses out of the collation entirely, removing references
650 to them in paths, and removing readings that belong only to them. Should only
651 be called via $tradition->del_witness.
656 my( $self, @sigils ) = @_;
658 $self->_graphcalc_done(0);
659 # Clear the witness(es) out of the paths
660 foreach my $e ( $self->paths ) {
661 foreach my $sig ( @sigils ) {
662 $self->del_path( $e, $sig );
666 # Clear out the newly unused readings
667 foreach my $r ( $self->readings ) {
668 unless( $self->reading_witnesses( $r ) ) {
669 $self->del_reading( $r );
674 sub add_relationship {
676 my( $source, $target, $opts ) = $self->_stringify_args( @_ );
677 my( @vectors ) = $self->relations->add_relationship( $source, $target, $opts );
678 foreach my $v ( @vectors ) {
679 next unless $self->get_relationship( $v )->colocated;
680 if( $self->reading( $v->[0] )->has_rank && $self->reading( $v->[1] )->has_rank
681 && $self->reading( $v->[0] )->rank ne $self->reading( $v->[1] )->rank ) {
682 $self->_graphcalc_done(0);
690 around qw/ get_relationship del_relationship / => sub {
694 if( @args == 1 && ref( $args[0] ) eq 'ARRAY' ) {
697 my( $source, $target ) = $self->_stringify_args( @args );
698 $self->$orig( $source, $target );
701 =head2 reading_witnesses( $reading )
703 Return a list of sigils corresponding to the witnesses in which the reading appears.
707 sub reading_witnesses {
708 my( $self, $reading ) = @_;
709 # We need only check either the incoming or the outgoing edges; I have
710 # arbitrarily chosen "incoming". Thus, special-case the start node.
711 if( $reading eq $self->start ) {
712 return map { $_->sigil } grep { $_->is_collated } $self->tradition->witnesses;
715 foreach my $e ( $self->sequence->edges_to( $reading ) ) {
716 my $wits = $self->sequence->get_edge_attributes( @$e );
717 @all_witnesses{ keys %$wits } = 1;
719 my $acstr = $self->ac_label;
720 foreach my $acwit ( grep { $_ =~ s/^(.*)\Q$acstr\E$/$1/ } keys %all_witnesses ) {
721 delete $all_witnesses{$acwit.$acstr} if exists $all_witnesses{$acwit};
723 return keys %all_witnesses;
726 =head1 OUTPUT METHODS
728 =head2 as_svg( \%options )
730 Returns an SVG string that represents the graph, via as_dot and graphviz.
731 See as_dot for a list of options. Must have GraphViz (dot) installed to run.
736 my( $self, $opts ) = @_;
737 throw( "Need GraphViz installed to output SVG" )
738 unless File::Which::which( 'dot' );
739 my $want_subgraph = exists $opts->{'from'} || exists $opts->{'to'};
740 $self->calculate_ranks()
741 unless( $self->_graphcalc_done || $opts->{'nocalc'} || !$self->linear );
742 my @cmd = qw/dot -Tsvg/;
744 my $dotfile = File::Temp->new();
746 # $dotfile->unlink_on_destroy(0);
747 binmode $dotfile, ':utf8';
748 print $dotfile $self->as_dot( $opts );
749 push( @cmd, $dotfile->filename );
750 run( \@cmd, ">", binary(), \$svg );
751 $svg = decode_utf8( $svg );
756 =head2 as_dot( \%options )
758 Returns a string that is the collation graph expressed in dot
759 (i.e. GraphViz) format. Options include:
774 my( $self, $opts ) = @_;
775 my $startrank = $opts->{'from'} if $opts;
776 my $endrank = $opts->{'to'} if $opts;
777 my $color_common = $opts->{'color_common'} if $opts;
778 my $STRAIGHTENHACK = !$startrank && !$endrank && $self->end->rank
779 && $self->end->rank > 100;
780 $STRAIGHTENHACK = 1 if $opts->{'straight'}; # even for subgraphs or small graphs
782 # Check the arguments
784 return if $endrank && $startrank > $endrank;
785 return if $startrank > $self->end->rank;
787 if( defined $endrank ) {
788 return if $endrank < 0;
789 $endrank = undef if $endrank == $self->end->rank;
792 my $graph_name = $self->tradition->name;
793 $graph_name =~ s/[^\w\s]//g;
794 $graph_name = join( '_', split( /\s+/, $graph_name ) );
802 'fillcolor' => 'white',
807 'arrowhead' => 'open',
808 'color' => '#000000',
809 'fontcolor' => '#000000',
812 my $dot = sprintf( "digraph %s {\n", $graph_name );
813 $dot .= "\tgraph " . _dot_attr_string( \%graph_attrs ) . ";\n";
814 $dot .= "\tnode " . _dot_attr_string( \%node_attrs ) . ";\n";
816 # Output substitute start/end readings if necessary
818 $dot .= "\t\"__SUBSTART__\" [ label=\"...\",id=\"__START__\" ];\n";
821 $dot .= "\t\"__SUBEND__\" [ label=\"...\",id=\"__END__\" ];\n";
823 if( $STRAIGHTENHACK ) {
825 my $startlabel = $startrank ? '__SUBSTART__' : '__START__';
826 $dot .= "\tsubgraph { rank=same \"$startlabel\" \"#SILENT#\" }\n";
827 $dot .= "\t\"#SILENT#\" [ shape=diamond,color=white,penwidth=0,label=\"\" ];"
829 my %used; # Keep track of the readings that actually appear in the graph
830 # Sort the readings by rank if we have ranks; this speeds layout.
831 my @all_readings = $self->end->has_rank
832 ? sort { $a->rank <=> $b->rank } $self->readings
834 # TODO Refrain from outputting lacuna nodes - just grey out the edges.
835 foreach my $reading ( @all_readings ) {
836 # Only output readings within our rank range.
837 next if $startrank && $reading->rank < $startrank;
838 next if $endrank && $reading->rank > $endrank;
839 $used{$reading->id} = 1;
840 # Need not output nodes without separate labels
841 next if $reading->id eq $reading->text;
843 my $label = $reading->text;
844 $label .= '-' if $reading->join_next;
845 $label = "-$label" if $reading->join_prior;
846 $label =~ s/\"/\\\"/g;
847 $rattrs->{'label'} = $label;
848 $rattrs->{'id'} = $reading->id;
849 $rattrs->{'fillcolor'} = '#b3f36d' if $reading->is_common && $color_common;
850 $dot .= sprintf( "\t\"%s\" %s;\n", $reading->id, _dot_attr_string( $rattrs ) );
853 # Add the real edges. Need to weight one edge per rank jump, in a
855 # my $weighted = $self->_add_edge_weights;
856 my @edges = $self->paths;
857 my( %substart, %subend );
858 foreach my $edge ( @edges ) {
859 # Do we need to output this edge?
860 if( $used{$edge->[0]} && $used{$edge->[1]} ) {
861 my $label = $self->_path_display_label( $self->path_witnesses( $edge ) );
862 my $variables = { %edge_attrs, 'label' => $label };
864 # Account for the rank gap if necessary
865 my $rank0 = $self->reading( $edge->[0] )->rank
866 if $self->reading( $edge->[0] )->has_rank;
867 my $rank1 = $self->reading( $edge->[1] )->rank
868 if $self->reading( $edge->[1] )->has_rank;
869 if( defined $rank0 && defined $rank1 && $rank1 - $rank0 > 1 ) {
870 $variables->{'minlen'} = $rank1 - $rank0;
873 # Add the calculated edge weights
874 # if( exists $weighted->{$edge->[0]}
875 # && $weighted->{$edge->[0]} eq $edge->[1] ) {
876 # # $variables->{'color'} = 'red';
877 # $variables->{'weight'} = 3.0;
880 # EXPERIMENTAL: make edge width reflect no. of witnesses
881 my $extrawidth = scalar( $self->path_witnesses( $edge ) ) * 0.2;
882 $variables->{'penwidth'} = $extrawidth + 0.8; # gives 1 for a single wit
884 my $varopts = _dot_attr_string( $variables );
885 $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
886 $edge->[0], $edge->[1], $varopts );
887 } elsif( $used{$edge->[0]} ) {
888 $subend{$edge->[0]} = $edge->[1];
889 } elsif( $used{$edge->[1]} ) {
890 $substart{$edge->[1]} = $edge->[0];
894 # If we are asked to, add relationship links
895 if( exists $opts->{show_relations} ) {
896 my $filter = $opts->{show_relations}; # can be 'transposition' or 'all'
897 if( $filter eq 'transposition' ) {
898 $filter =~ qr/^transposition$/;
900 foreach my $redge ( $self->relationships ) {
901 if( $used{$redge->[0]} && $used{$redge->[1]} ) {
902 if( $filter ne 'all' ) {
903 my $rel = $self->get_relationship( $redge );
904 next unless $rel->type =~ /$filter/;
908 constraint => 'false',
909 label => uc( substr( $rel->type, 0, 4 ) ),
912 $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
913 $redge->[0], $redge->[1], _dot_attr_string( $variables ) );
919 # Add substitute start and end edges if necessary
920 foreach my $node ( keys %substart ) {
921 my $witstr = $self->_path_display_label ( $self->path_witnesses( $substart{$node}, $node ) );
922 my $variables = { %edge_attrs, 'label' => $witstr };
923 my $nrdg = $self->reading( $node );
924 if( $nrdg->has_rank && $nrdg->rank > $startrank ) {
925 # Substart is actually one lower than $startrank
926 $variables->{'minlen'} = $nrdg->rank - ( $startrank - 1 );
928 my $varopts = _dot_attr_string( $variables );
929 $dot .= "\t\"__SUBSTART__\" -> \"$node\" $varopts;\n";
931 foreach my $node ( keys %subend ) {
932 my $witstr = $self->_path_display_label ( $self->path_witnesses( $node, $subend{$node} ) );
933 my $variables = { %edge_attrs, 'label' => $witstr };
934 my $varopts = _dot_attr_string( $variables );
935 $dot .= "\t\"$node\" -> \"__SUBEND__\" $varopts;\n";
938 if( $STRAIGHTENHACK ) {
939 my $endlabel = $endrank ? '__SUBEND__' : '__END__';
940 $dot .= "\t\"$endlabel\" -> \"#SILENT#\" [ color=white,penwidth=0 ];\n";
947 sub _dot_attr_string {
950 foreach my $k ( sort keys %$hash ) {
952 push( @attrs, $k.'="'.$v.'"' );
954 return( '[ ' . join( ', ', @attrs ) . ' ]' );
957 sub _add_edge_weights {
959 # Walk the graph from START to END, choosing the successor node with
960 # the largest number of witness paths each time.
962 my $curr = $self->start->id;
963 my $ranked = $self->end->has_rank;
964 while( $curr ne $self->end->id ) {
965 my $rank = $ranked ? $self->reading( $curr )->rank : 0;
966 my @succ = sort { $self->path_witnesses( $curr, $a )
967 <=> $self->path_witnesses( $curr, $b ) }
968 $self->sequence->successors( $curr );
969 my $next = pop @succ;
970 my $nextrank = $ranked ? $self->reading( $next )->rank : 0;
971 # Try to avoid lacunae in the weighted path.
973 ( $self->reading( $next )->is_lacuna ||
974 $nextrank - $rank > 1 ) ){
977 $weighted->{$curr} = $next;
983 =head2 path_witnesses( $edge )
985 Returns the list of sigils whose witnesses are associated with the given edge.
986 The edge can be passed as either an array or an arrayref of ( $source, $target ).
991 my( $self, @edge ) = @_;
992 # If edge is an arrayref, cope.
993 if( @edge == 1 && ref( $edge[0] ) eq 'ARRAY' ) {
997 my @wits = keys %{$self->sequence->get_edge_attributes( @edge )};
1001 # Helper function. Make a display label for the given witnesses, showing a.c.
1002 # witnesses only where the main witness is not also in the list.
1003 sub _path_display_label {
1006 map { $wits{$_} = 1 } @_;
1008 # If an a.c. wit is listed, remove it if the main wit is also listed.
1009 # Otherwise keep it for explicit listing.
1010 my $aclabel = $self->ac_label;
1012 foreach my $w ( sort keys %wits ) {
1013 if( $w =~ /^(.*)\Q$aclabel\E$/ ) {
1014 if( exists $wits{$1} ) {
1017 push( @disp_ac, $w );
1022 # See if we are in a majority situation.
1023 my $maj = scalar( $self->tradition->witnesses ) * 0.6;
1024 $maj = $maj > 5 ? $maj : 5;
1025 if( scalar keys %wits > $maj ) {
1026 unshift( @disp_ac, 'majority' );
1027 return join( ', ', @disp_ac );
1029 return join( ', ', sort keys %wits );
1033 =head2 readings_at_rank( $rank )
1035 Returns a list of readings at a given rank, taken from the alignment table.
1039 sub readings_at_rank {
1040 my( $self, $rank ) = @_;
1041 my $table = $self->alignment_table;
1042 # Table rank is real rank - 1.
1043 my @elements = map { $_->{'tokens'}->[$rank-1] } @{$table->{'alignment'}};
1045 foreach my $e ( @elements ) {
1046 next unless ref( $e ) eq 'HASH';
1047 next unless exists $e->{'t'};
1048 $readings{$e->{'t'}->id} = $e->{'t'};
1050 return values %readings;
1055 Returns a GraphML representation of the collation. The GraphML will contain
1056 two graphs. The first expresses the attributes of the readings and the witness
1057 paths that link them; the second expresses the relationships that link the
1058 readings. This is the native transfer format for a tradition.
1062 use Text::Tradition;
1068 my $datafile = 't/data/florilegium_tei_ps.xml';
1069 my $tradition = Text::Tradition->new( 'input' => 'TEI',
1071 'file' => $datafile,
1074 ok( $tradition, "Got a tradition object" );
1075 is( scalar $tradition->witnesses, 13, "Found all witnesses" );
1076 ok( $tradition->collation, "Tradition has a collation" );
1078 my $c = $tradition->collation;
1079 is( scalar $c->readings, $READINGS, "Collation has all readings" );
1080 is( scalar $c->paths, $PATHS, "Collation has all paths" );
1081 is( scalar $c->relationships, 0, "Collation has all relationships" );
1083 # Add a few relationships
1084 $c->add_relationship( 'w123', 'w125', { 'type' => 'collated' } );
1085 $c->add_relationship( 'w193', 'w196', { 'type' => 'collated' } );
1086 $c->add_relationship( 'w257', 'w262', { 'type' => 'transposition' } );
1088 # Now write it to GraphML and parse it again.
1090 my $graphml = $c->as_graphml;
1091 my $st = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml );
1092 is( scalar $st->collation->readings, $READINGS, "Reparsed collation has all readings" );
1093 is( scalar $st->collation->paths, $PATHS, "Reparsed collation has all paths" );
1094 is( scalar $st->collation->relationships, 3, "Reparsed collation has new relationships" );
1096 # Now add a stemma, write to GraphML, and look at the output.
1098 skip "Analysis module not present", 3 unless $tradition->can( 'add_stemma' );
1099 my $stemma = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
1100 is( ref( $stemma ), 'Text::Tradition::Stemma', "Parsed dotfile into stemma" );
1101 is( $tradition->stemmata, 1, "Tradition now has the stemma" );
1102 $graphml = $c->as_graphml;
1103 like( $graphml, qr/digraph/, "Digraph declaration exists in GraphML" );
1110 ## TODO MOVE this to Tradition.pm and modularize it better
1112 my( $self, $options ) = @_;
1113 $self->calculate_ranks unless $self->_graphcalc_done;
1115 my $start = $options->{'from'}
1116 ? $self->reading( $options->{'from'} ) : $self->start;
1117 my $end = $options->{'to'}
1118 ? $self->reading( $options->{'to'} ) : $self->end;
1119 if( $start->has_rank && $end->has_rank && $end->rank < $start->rank ) {
1120 throw( 'Start node must be before end node' );
1122 # The readings need to be ranked for this to work.
1123 $start = $self->start unless $start->has_rank;
1124 $end = $self->end unless $end->has_rank;
1126 unless( $start eq $self->start ) {
1127 $rankoffset = $start->rank - 1;
1132 my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns';
1133 my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance';
1134 my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' .
1135 'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
1137 # Create the document and root node
1138 require XML::LibXML;
1139 my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
1140 my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
1141 $graphml->setDocumentElement( $root );
1142 $root->setNamespace( $xsi_ns, 'xsi', 0 );
1143 $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
1145 # List of attribute types to save on our objects and their corresponding
1150 'Bool' => 'boolean',
1151 'ReadingID' => 'string',
1152 'RelationshipType' => 'string',
1153 'RelationshipScope' => 'string',
1156 # Add the data keys for the graph. Include an extra key 'version' for the
1157 # GraphML output version.
1158 my %graph_data_keys;
1160 my %graph_attributes = ( 'version' => 'string' );
1161 # Graph attributes include those of Tradition and those of Collation.
1163 # TODO Use meta introspection method from duplicate_reading to do this
1164 # instead of naming custom keys.
1165 my $tmeta = $self->tradition->meta;
1166 my $cmeta = $self->meta;
1167 map { $gattr_from{$_->name} = 'Tradition' } $tmeta->get_all_attributes;
1168 map { $gattr_from{$_->name} = 'Collation' } $cmeta->get_all_attributes;
1169 foreach my $attr ( ( $tmeta->get_all_attributes, $cmeta->get_all_attributes ) ) {
1170 next if $attr->name =~ /^_/;
1171 next unless $save_types{$attr->type_constraint->name};
1172 $graph_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1174 # Extra custom keys for complex objects that should be saved in some form.
1175 # The subroutine should return a string, or undef/empty.
1176 if( $tmeta->has_method('stemmata') ) {
1177 $graph_attributes{'stemmata'} = sub {
1179 map { push( @stemstrs, $_->editable( {linesep => ''} ) ) }
1180 $self->tradition->stemmata;
1181 join( "\n", @stemstrs );
1185 if( $tmeta->has_method('user') ) {
1186 $graph_attributes{'user'} = sub {
1187 $self->tradition->user ? $self->tradition->user->id : undef
1191 foreach my $datum ( sort keys %graph_attributes ) {
1192 $graph_data_keys{$datum} = 'dg'.$gdi++;
1193 my $key = $root->addNewChild( $graphml_ns, 'key' );
1194 my $dtype = ref( $graph_attributes{$datum} ) ? 'string'
1195 : $graph_attributes{$datum};
1196 $key->setAttribute( 'attr.name', $datum );
1197 $key->setAttribute( 'attr.type', $dtype );
1198 $key->setAttribute( 'for', 'graph' );
1199 $key->setAttribute( 'id', $graph_data_keys{$datum} );
1202 # Add the data keys for reading nodes
1203 my %reading_attributes;
1204 my $rmeta = Text::Tradition::Collation::Reading->meta;
1205 foreach my $attr( $rmeta->get_all_attributes ) {
1206 next if $attr->name =~ /^_/;
1207 next unless $save_types{$attr->type_constraint->name};
1208 $reading_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1210 if( $self->start->does('Text::Tradition::Morphology' ) ) {
1211 # Extra custom key for the reading morphology
1212 $reading_attributes{'lexemes'} = 'string';
1217 foreach my $datum ( sort keys %reading_attributes ) {
1218 $node_data_keys{$datum} = 'dn'.$ndi++;
1219 my $key = $root->addNewChild( $graphml_ns, 'key' );
1220 $key->setAttribute( 'attr.name', $datum );
1221 $key->setAttribute( 'attr.type', $reading_attributes{$datum} );
1222 $key->setAttribute( 'for', 'node' );
1223 $key->setAttribute( 'id', $node_data_keys{$datum} );
1226 # Add the data keys for edges, that is, paths and relationships. Path
1227 # data does not come from a Moose class so is here manually.
1230 my %edge_attributes = (
1231 witness => 'string', # ID/label for a path
1232 extra => 'boolean', # Path key
1234 my @path_attributes = keys %edge_attributes; # track our manual additions
1235 my $pmeta = Text::Tradition::Collation::Relationship->meta;
1236 foreach my $attr( $pmeta->get_all_attributes ) {
1237 next if $attr->name =~ /^_/;
1238 next unless $save_types{$attr->type_constraint->name};
1239 $edge_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1241 foreach my $datum ( sort keys %edge_attributes ) {
1242 $edge_data_keys{$datum} = 'de'.$edi++;
1243 my $key = $root->addNewChild( $graphml_ns, 'key' );
1244 $key->setAttribute( 'attr.name', $datum );
1245 $key->setAttribute( 'attr.type', $edge_attributes{$datum} );
1246 $key->setAttribute( 'for', 'edge' );
1247 $key->setAttribute( 'id', $edge_data_keys{$datum} );
1250 # Add the collation graph itself. First, sanitize the name to a valid XML ID.
1251 my $xmlidname = $self->tradition->name;
1252 $xmlidname =~ s/(?!$xml10_namechar_rx)./_/g;
1253 if( $xmlidname !~ /^$xml10_namestartchar_rx/ ) {
1254 $xmlidname = '_'.$xmlidname;
1256 my $sgraph = $root->addNewChild( $graphml_ns, 'graph' );
1257 $sgraph->setAttribute( 'edgedefault', 'directed' );
1258 $sgraph->setAttribute( 'id', $xmlidname );
1259 $sgraph->setAttribute( 'parse.edgeids', 'canonical' );
1260 $sgraph->setAttribute( 'parse.edges', 0 ); # fill in later
1261 $sgraph->setAttribute( 'parse.nodeids', 'canonical' );
1262 $sgraph->setAttribute( 'parse.nodes', 0 ); # fill in later
1263 $sgraph->setAttribute( 'parse.order', 'nodesfirst' );
1265 # Tradition/collation attribute data
1266 foreach my $datum ( keys %graph_attributes ) {
1268 if( $datum eq 'version' ) {
1270 } elsif( ref( $graph_attributes{$datum} ) ) {
1271 my $sub = $graph_attributes{$datum};
1273 } elsif( $gattr_from{$datum} eq 'Tradition' ) {
1274 $value = $self->tradition->$datum;
1276 $value = $self->$datum;
1278 _add_graphml_data( $sgraph, $graph_data_keys{$datum}, $value );
1283 # Add our readings to the graph
1284 foreach my $n ( sort { $a->id cmp $b->id } $self->readings ) {
1285 next if $n->has_rank && $n ne $self->start && $n ne $self->end &&
1286 ( $n->rank < $start->rank || $n->rank > $end->rank );
1287 $use_readings{$n->id} = 1;
1288 # Add to the main graph
1289 my $node_el = $sgraph->addNewChild( $graphml_ns, 'node' );
1290 my $node_xmlid = 'n' . $node_ctr++;
1291 $node_hash{ $n->id } = $node_xmlid;
1292 $node_el->setAttribute( 'id', $node_xmlid );
1293 foreach my $d ( keys %reading_attributes ) {
1295 # Custom serialization
1296 if( $d eq 'lexemes' ) {
1297 # If nval is a true value, we have lexemes so we need to
1298 # serialize them. Otherwise set nval to undef so that the
1299 # key is excluded from this reading.
1300 $nval = $nval ? $n->_serialize_lexemes : undef;
1301 } elsif( $d eq 'normal_form' && $n->normal_form eq $n->text ) {
1304 if( $rankoffset && $d eq 'rank' && $n ne $self->start ) {
1305 # Adjust the ranks within the subgraph.
1306 $nval = $n eq $self->end ? $end->rank - $rankoffset + 1
1307 : $nval - $rankoffset;
1309 _add_graphml_data( $node_el, $node_data_keys{$d}, $nval )
1314 # Add the path edges to the sequence graph
1316 foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->sequence->edges() ) {
1317 # We add an edge in the graphml for every witness in $e.
1318 next unless( $use_readings{$e->[0]} || $use_readings{$e->[1]} );
1319 my @edge_wits = sort $self->path_witnesses( $e );
1320 $e->[0] = $self->start->id unless $use_readings{$e->[0]};
1321 $e->[1] = $self->end->id unless $use_readings{$e->[1]};
1322 # Skip any path from start to end; that witness is not in the subgraph.
1323 next if ( $e->[0] eq $self->start->id && $e->[1] eq $self->end->id );
1324 foreach my $wit ( @edge_wits ) {
1325 my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
1326 $node_hash{ $e->[0] },
1327 $node_hash{ $e->[1] } );
1328 my $edge_el = $sgraph->addNewChild( $graphml_ns, 'edge' );
1329 $edge_el->setAttribute( 'source', $from );
1330 $edge_el->setAttribute( 'target', $to );
1331 $edge_el->setAttribute( 'id', $id );
1333 # It's a witness path, so add the witness
1335 my $key = $edge_data_keys{'witness'};
1336 # Is this an ante-corr witness?
1337 my $aclabel = $self->ac_label;
1338 if( $wit =~ /^(.*)\Q$aclabel\E$/ ) {
1339 # Keep the base witness
1341 # ...and record that this is an 'extra' reading path
1342 _add_graphml_data( $edge_el, $edge_data_keys{'extra'}, $aclabel );
1344 _add_graphml_data( $edge_el, $edge_data_keys{'witness'}, $base );
1348 # Report the actual number of nodes and edges that went in
1349 $sgraph->setAttribute( 'parse.edges', $edge_ctr );
1350 $sgraph->setAttribute( 'parse.nodes', $node_ctr );
1352 # Add the relationship graph to the XML
1353 map { delete $edge_data_keys{$_} } @path_attributes;
1354 $self->relations->_as_graphml( $graphml_ns, $root, \%node_hash,
1355 $node_data_keys{'id'}, \%edge_data_keys );
1357 # Save and return the thing
1358 my $result = decode_utf8( $graphml->toString(1) );
1362 sub _add_graphml_data {
1363 my( $el, $key, $value ) = @_;
1364 return unless defined $value;
1365 my $data_el = $el->addNewChild( $el->namespaceURI, 'data' );
1366 $data_el->setAttribute( 'key', $key );
1367 $data_el->appendText( $value );
1372 Returns a CSV alignment table representation of the collation graph, one
1373 row per witness (or witness uncorrected.)
1379 my $table = $self->alignment_table;
1380 my $csv = Text::CSV->new( { binary => 1, quote_null => 0 } );
1382 # Make the header row
1383 $csv->combine( map { $_->{'witness'} } @{$table->{'alignment'}} );
1384 push( @result, decode_utf8( $csv->string ) );
1385 # Make the rest of the rows
1386 foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
1387 my @rowobjs = map { $_->{'tokens'}->[$idx] } @{$table->{'alignment'}};
1388 my @row = map { $_ ? $_->{'t'}->text : $_ } @rowobjs;
1389 $csv->combine( @row );
1390 push( @result, decode_utf8( $csv->string ) );
1392 return join( "\n", @result );
1395 =head2 alignment_table
1397 Return a reference to an alignment table, in a slightly enhanced CollateX
1398 format which looks like this:
1400 $table = { alignment => [ { witness => "SIGIL",
1401 tokens => [ { t => "TEXT" }, ... ] },
1402 { witness => "SIG2",
1403 tokens => [ { t => "TEXT" }, ... ] },
1405 length => TEXTLEN };
1409 sub alignment_table {
1411 return $self->cached_table if $self->has_cached_table;
1413 # Make sure we can do this
1414 throw( "Need a linear graph in order to make an alignment table" )
1415 unless $self->linear;
1416 $self->calculate_ranks()
1417 unless $self->_graphcalc_done && $self->end->has_rank;
1419 my $table = { 'alignment' => [], 'length' => $self->end->rank - 1 };
1420 my @all_pos = ( 1 .. $self->end->rank - 1 );
1421 foreach my $wit ( sort { $a->sigil cmp $b->sigil } $self->tradition->witnesses ) {
1422 # say STDERR "Making witness row(s) for " . $wit->sigil;
1423 my @wit_path = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
1424 my @row = _make_witness_row( \@wit_path, \@all_pos );
1425 my $witobj = { 'witness' => $wit->sigil, 'tokens' => \@row };
1426 $witobj->{'identifier'} = $wit->identifier if $wit->identifier;
1427 push( @{$table->{'alignment'}}, $witobj );
1428 if( $wit->is_layered ) {
1429 my @wit_ac_path = $self->reading_sequence( $self->start, $self->end,
1430 $wit->sigil.$self->ac_label );
1431 my @ac_row = _make_witness_row( \@wit_ac_path, \@all_pos );
1432 my $witacobj = { 'witness' => $wit->sigil.$self->ac_label,
1433 'tokens' => \@ac_row };
1434 $witacobj->{'identifier'} = $wit->identifier if $wit->identifier;
1435 push( @{$table->{'alignment'}}, $witacobj );
1438 $self->cached_table( $table );
1442 sub _make_witness_row {
1443 my( $path, $positions ) = @_;
1445 map { $char_hash{$_} = undef } @$positions;
1447 foreach my $rdg ( @$path ) {
1448 say STDERR "rank " . $rdg->rank if $debug;
1449 # say STDERR "No rank for " . $rdg->id unless defined $rdg->rank;
1450 $char_hash{$rdg->rank} = { 't' => $rdg };
1452 my @row = map { $char_hash{$_} } @$positions;
1453 # Fill in lacuna markers for undef spots in the row
1454 my $last_el = shift @row;
1455 my @filled_row = ( $last_el );
1456 foreach my $el ( @row ) {
1457 # If we are using node reference, make the lacuna node appear many times
1458 # in the table. If not, use the lacuna tag.
1459 if( $last_el && $last_el->{'t'}->is_lacuna && !defined $el ) {
1462 push( @filled_row, $el );
1469 =head1 NAVIGATION METHODS
1471 =head2 reading_sequence( $first, $last, $sigil, $backup )
1473 Returns the ordered list of readings, starting with $first and ending
1474 with $last, for the witness given in $sigil. If a $backup sigil is
1475 specified (e.g. when walking a layered witness), it will be used wherever
1476 no $sigil path exists. If there is a base text reading, that will be
1477 used wherever no path exists for $sigil or $backup.
1481 # TODO Think about returning some lazy-eval iterator.
1482 # TODO Get rid of backup; we should know from what witness is whether we need it.
1484 sub reading_sequence {
1485 my( $self, $start, $end, $witness ) = @_;
1487 $witness = $self->baselabel unless $witness;
1488 my @readings = ( $start );
1491 while( $n && $n->id ne $end->id ) {
1492 if( exists( $seen{$n->id} ) ) {
1493 throw( "Detected loop for $witness at " . $n->id );
1497 my $next = $self->next_reading( $n, $witness );
1499 throw( "Did not find any path for $witness from reading " . $n->id );
1501 push( @readings, $next );
1504 # Check that the last reading is our end reading.
1505 my $last = $readings[$#readings];
1506 throw( "Last reading found from " . $start->text .
1507 " for witness $witness is not the end!" ) # TODO do we get this far?
1508 unless $last->id eq $end->id;
1513 =head2 next_reading( $reading, $sigil );
1515 Returns the reading that follows the given reading along the given witness
1521 # Return the successor via the corresponding path.
1523 my $answer = $self->_find_linked_reading( 'next', @_ );
1524 return undef unless $answer;
1525 return $self->reading( $answer );
1528 =head2 prior_reading( $reading, $sigil )
1530 Returns the reading that precedes the given reading along the given witness
1536 # Return the predecessor via the corresponding path.
1538 my $answer = $self->_find_linked_reading( 'prior', @_ );
1539 return $self->reading( $answer );
1542 sub _find_linked_reading {
1543 my( $self, $direction, $node, $path ) = @_;
1545 # Get a backup if we are dealing with a layered witness
1547 my $aclabel = $self->ac_label;
1548 if( $path && $path =~ /^(.*)\Q$aclabel\E$/ ) {
1552 my @linked_paths = $direction eq 'next'
1553 ? $self->sequence->edges_from( $node )
1554 : $self->sequence->edges_to( $node );
1555 return undef unless scalar( @linked_paths );
1557 # We have to find the linked path that contains all of the
1558 # witnesses supplied in $path.
1559 my( @path_wits, @alt_path_wits );
1560 @path_wits = sort( $self->_witnesses_of_label( $path ) ) if $path;
1561 @alt_path_wits = sort( $self->_witnesses_of_label( $alt_path ) ) if $alt_path;
1564 foreach my $le ( @linked_paths ) {
1565 if( $self->sequence->has_edge_attribute( @$le, $self->baselabel ) ) {
1568 my @le_wits = sort $self->path_witnesses( $le );
1569 if( _is_within( \@path_wits, \@le_wits ) ) {
1570 # This is the right path.
1571 return $direction eq 'next' ? $le->[1] : $le->[0];
1572 } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) {
1576 # Got this far? Return the alternate path if it exists.
1577 return $direction eq 'next' ? $alt_le->[1] : $alt_le->[0]
1580 # Got this far? Return the base path if it exists.
1581 return $direction eq 'next' ? $base_le->[1] : $base_le->[0]
1584 # Got this far? We have no appropriate path.
1585 warn "Could not find $direction node from " . $node->id
1586 . " along path $path";
1592 my( $set1, $set2 ) = @_;
1593 my $ret = @$set1; # will be 0, i.e. false, if set1 is empty
1594 foreach my $el ( @$set1 ) {
1595 $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
1600 # Return the string that joins together a list of witnesses for
1601 # display on a single path.
1602 sub _witnesses_of_label {
1603 my( $self, $label ) = @_;
1604 my $regex = $self->wit_list_separator;
1605 my @answer = split( /\Q$regex\E/, $label );
1609 =head2 common_readings
1611 Returns the list of common readings in the graph (i.e. those readings that are
1612 shared by all non-lacunose witnesses.)
1616 sub common_readings {
1618 my @common = grep { $_->is_common } $self->readings;
1622 =head2 path_text( $sigil, [, $start, $end ] )
1624 Returns the text of a witness (plus its backup, if we are using a layer)
1625 as stored in the collation. The text is returned as a string, where the
1626 individual readings are joined with spaces and the meta-readings (e.g.
1627 lacunae) are omitted. Optional specification of $start and $end allows
1628 the generation of a subset of the witness text.
1633 my( $self, $wit, $start, $end ) = @_;
1634 $start = $self->start unless $start;
1635 $end = $self->end unless $end;
1636 my @path = grep { !$_->is_meta } $self->reading_sequence( $start, $end, $wit );
1639 foreach my $r ( @path ) {
1640 unless ( $r->join_prior || !$last || $last->join_next ) {
1643 $pathtext .= $r->text;
1649 =head1 INITIALIZATION METHODS
1651 These are mostly for use by parsers.
1653 =head2 make_witness_path( $witness )
1655 Link the array of readings contained in $witness->path (and in
1656 $witness->uncorrected_path if it exists) into collation paths.
1657 Clear out the arrays when finished.
1659 =head2 make_witness_paths
1661 Call make_witness_path for all witnesses in the tradition.
1665 # For use when a collation is constructed from a base text and an apparatus.
1666 # We have the sequences of readings and just need to add path edges.
1667 # When we are done, clear out the witness path attributes, as they are no
1669 # TODO Find a way to replace the witness path attributes with encapsulated functions?
1671 sub make_witness_paths {
1673 foreach my $wit ( $self->tradition->witnesses ) {
1674 # say STDERR "Making path for " . $wit->sigil;
1675 $self->make_witness_path( $wit );
1679 sub make_witness_path {
1680 my( $self, $wit ) = @_;
1681 my @chain = @{$wit->path};
1682 my $sig = $wit->sigil;
1683 # Add start and end if necessary
1684 unshift( @chain, $self->start ) unless $chain[0] eq $self->start;
1685 push( @chain, $self->end ) unless $chain[-1] eq $self->end;
1686 foreach my $idx ( 0 .. $#chain-1 ) {
1687 $self->add_path( $chain[$idx], $chain[$idx+1], $sig );
1689 if( $wit->is_layered ) {
1690 @chain = @{$wit->uncorrected_path};
1691 unshift( @chain, $self->start ) unless $chain[0] eq $self->start;
1692 push( @chain, $self->end ) unless $chain[-1] eq $self->end;
1693 foreach my $idx( 0 .. $#chain-1 ) {
1694 my $source = $chain[$idx];
1695 my $target = $chain[$idx+1];
1696 $self->add_path( $source, $target, $sig.$self->ac_label )
1697 unless $self->has_path( $source, $target, $sig );
1701 $wit->clear_uncorrected_path;
1704 =head2 calculate_ranks
1706 Calculate the reading ranks (that is, their aligned positions relative
1707 to each other) for the graph. This can only be called on linear collations.
1711 use Text::Tradition;
1713 my $cxfile = 't/data/Collatex-16.xml';
1714 my $t = Text::Tradition->new(
1716 'input' => 'CollateX',
1719 my $c = $t->collation;
1722 my $table = $c->alignment_table;
1723 ok( $c->has_cached_table, "Alignment table was cached" );
1724 is( $c->alignment_table, $table, "Cached table returned upon second call" );
1725 $c->calculate_ranks;
1726 is( $c->alignment_table, $table, "Cached table retained with no rank change" );
1727 $c->add_relationship( 'n13', 'n23', { type => 'repetition' } );
1728 is( $c->alignment_table, $table, "Alignment table unchanged after non-colo relationship add" );
1729 $c->add_relationship( 'n24', 'n23', { type => 'spelling' } );
1730 isnt( $c->alignment_table, $table, "Alignment table changed after colo relationship add" );
1736 sub calculate_ranks {
1738 # Save the existing ranks, in case we need to invalidate the cached SVG.
1740 map { $existing_ranks{$_} = $_->rank } $self->readings;
1742 # Do the rankings based on the relationship equivalence graph, starting
1743 # with the start node.
1744 my ( $node_ranks, $rank_nodes ) = $self->relations->equivalence_ranks();
1746 # Transfer our rankings from the topological graph to the real one.
1747 foreach my $r ( $self->readings ) {
1748 if( defined $node_ranks->{$self->equivalence( $r->id )} ) {
1749 $r->rank( $node_ranks->{$self->equivalence( $r->id )} );
1751 # Die. Find the last rank we calculated.
1752 my @all_defined = sort { ( $node_ranks->{$self->equivalence( $a->id )}||-1 )
1753 <=> ( $node_ranks->{$self->equivalence( $b->id )}||-1 ) }
1755 my $last = pop @all_defined;
1756 throw( "Ranks not calculated after $last - do you have a cycle in the graph?" );
1759 # Do we need to invalidate the cached data?
1760 if( $self->has_cached_table ) {
1761 foreach my $r ( $self->readings ) {
1762 next if defined( $existing_ranks{$r} )
1763 && $existing_ranks{$r} == $r->rank;
1764 # Something has changed, so clear the cache
1765 $self->_clear_cache;
1766 # ...and recalculate the common readings.
1767 $self->calculate_common_readings();
1771 # The graph calculation information is now up to date.
1772 $self->_graphcalc_done(1);
1777 $self->wipe_table if $self->has_cached_table;
1781 =head2 flatten_ranks
1783 A convenience method for parsing collation data. Searches the graph for readings
1784 with the same text at the same rank, and merges any that are found.
1789 my ( $self, %args ) = shift;
1790 my %unique_rank_rdg;
1792 foreach my $p ( $self->identical_readings( %args ) ) {
1793 # say STDERR "Combining readings at same rank: @$p";
1795 $self->merge_readings( @$p );
1796 # TODO see if this now makes a common point.
1798 # If we merged readings, the ranks are still fine but the alignment
1799 # table is wrong. Wipe it.
1800 $self->wipe_table() if $changed;
1803 =head2 identical_readings
1804 =head2 identical_readings( start => $startnode, end => $endnode )
1805 =head2 identical_readings( startrank => $startrank, endrank => $endrank )
1807 Goes through the graph identifying all pairs of readings that appear to be
1808 identical, and therefore able to be merged into a single reading. Returns the
1809 relevant identical pairs. Can be restricted to run over only a part of the
1810 graph, specified either by node or by rank.
1814 sub identical_readings {
1815 my ( $self, %args ) = @_;
1816 # Find where we should start and end.
1817 my $startrank = $args{startrank} || 0;
1818 if( $args{start} ) {
1819 throw( "Starting reading has no rank" ) unless $self->reading( $args{start} )
1820 && $self->reading( $args{start} )->has_rank;
1821 $startrank = $self->reading( $args{start} )->rank;
1823 my $endrank = $args{endrank} || $self->end->rank;
1825 throw( "Ending reading has no rank" ) unless $self->reading( $args{end} )
1826 && $self->reading( $args{end} )->has_rank;
1827 $endrank = $self->reading( $args{end} )->rank;
1830 # Make sure the ranks are correct.
1831 unless( $self->_graphcalc_done ) {
1832 $self->calculate_ranks;
1834 # Go through the readings looking for duplicates.
1835 my %unique_rank_rdg;
1837 foreach my $rdg ( $self->readings ) {
1838 next unless $rdg->has_rank;
1839 my $rk = $rdg->rank;
1840 next if $rk > $endrank || $rk < $startrank;
1841 my $key = $rk . "||" . $rdg->text;
1842 if( exists $unique_rank_rdg{$key} ) {
1843 # Make sure they don't have different grammatical forms
1844 my $ur = $unique_rank_rdg{$key};
1845 if( $rdg->is_identical( $ur ) ) {
1846 push( @pairs, [ $ur, $rdg ] );
1849 $unique_rank_rdg{$key} = $rdg;
1857 =head2 calculate_common_readings
1859 Goes through the graph identifying the readings that appear in every witness
1860 (apart from those with lacunae at that spot.) Marks them as common and returns
1865 use Text::Tradition;
1867 my $cxfile = 't/data/Collatex-16.xml';
1868 my $t = Text::Tradition->new(
1870 'input' => 'CollateX',
1873 my $c = $t->collation;
1875 my @common = $c->calculate_common_readings();
1876 is( scalar @common, 8, "Found correct number of common readings" );
1877 my @marked = sort $c->common_readings();
1878 is( scalar @common, 8, "All common readings got marked as such" );
1879 my @expected = qw/ n1 n11 n16 n19 n20 n5 n6 n7 /;
1880 is_deeply( \@marked, \@expected, "Found correct list of common readings" );
1886 sub calculate_common_readings {
1889 map { $_->is_common( 0 ) } $self->readings;
1890 # Implicitly calls calculate_ranks
1891 my $table = $self->alignment_table;
1892 foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
1893 my @row = map { $_->{'tokens'}->[$idx]
1894 ? $_->{'tokens'}->[$idx]->{'t'} : '' }
1895 @{$table->{'alignment'}};
1897 foreach my $r ( @row ) {
1899 $hash{$r->id} = $r unless $r->is_meta;
1901 $hash{'UNDEF'} = $r;
1904 if( keys %hash == 1 && !exists $hash{'UNDEF'} ) {
1905 my( $r ) = values %hash;
1907 push( @common, $r );
1913 =head2 text_from_paths
1915 Calculate the text array for all witnesses from the path, for later consistency
1916 checking. Only to be used if there is no non-graph-based way to know the
1921 sub text_from_paths {
1923 foreach my $wit ( $self->tradition->witnesses ) {
1924 my @readings = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
1926 foreach my $r ( @readings ) {
1927 next if $r->is_meta;
1928 push( @text, $r->text );
1930 $wit->text( \@text );
1931 if( $wit->is_layered ) {
1932 my @ucrdgs = $self->reading_sequence( $self->start, $self->end,
1933 $wit->sigil.$self->ac_label );
1935 foreach my $r ( @ucrdgs ) {
1936 next if $r->is_meta;
1937 push( @uctext, $r->text );
1939 $wit->layertext( \@uctext );
1944 =head1 UTILITY FUNCTIONS
1946 =head2 common_predecessor( $reading_a, $reading_b )
1948 Find the last reading that occurs in sequence before both the given readings.
1949 At the very least this should be $self->start.
1951 =head2 common_successor( $reading_a, $reading_b )
1953 Find the first reading that occurs in sequence after both the given readings.
1954 At the very least this should be $self->end.
1958 use Text::Tradition;
1960 my $cxfile = 't/data/Collatex-16.xml';
1961 my $t = Text::Tradition->new(
1963 'input' => 'CollateX',
1966 my $c = $t->collation;
1968 is( $c->common_predecessor( 'n24', 'n23' )->id,
1969 'n20', "Found correct common predecessor" );
1970 is( $c->common_successor( 'n24', 'n23' )->id,
1971 '__END__', "Found correct common successor" );
1973 is( $c->common_predecessor( 'n19', 'n17' )->id,
1974 'n16', "Found correct common predecessor for readings on same path" );
1975 is( $c->common_successor( 'n21', 'n10' )->id,
1976 '__END__', "Found correct common successor for readings on same path" );
1982 ## Return the closest reading that is a predecessor of both the given readings.
1983 sub common_predecessor {
1985 my( $r1, $r2 ) = $self->_objectify_args( @_ );
1986 return $self->_common_in_path( $r1, $r2, 'predecessors' );
1989 sub common_successor {
1991 my( $r1, $r2 ) = $self->_objectify_args( @_ );
1992 return $self->_common_in_path( $r1, $r2, 'successors' );
1996 # TODO think about how to do this without ranks...
1997 sub _common_in_path {
1998 my( $self, $r1, $r2, $dir ) = @_;
1999 my $iter = $self->end->rank;
2001 my @last_r1 = ( $r1 );
2002 my @last_r2 = ( $r2 );
2003 # my %all_seen = ( $r1 => 'r1', $r2 => 'r2' );
2005 # say STDERR "Finding common $dir for $r1, $r2";
2006 while( !@candidates ) {
2007 last unless $iter--; # Avoid looping infinitely
2008 # Iterate separately down the graph from r1 and r2
2009 my( @new_lc1, @new_lc2 );
2010 foreach my $lc ( @last_r1 ) {
2011 foreach my $p ( $lc->$dir ) {
2012 if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r1' ) {
2013 # say STDERR "Path candidate $p from $lc";
2014 push( @candidates, $p );
2015 } elsif( !$all_seen{$p->id} ) {
2016 $all_seen{$p->id} = 'r1';
2017 push( @new_lc1, $p );
2021 foreach my $lc ( @last_r2 ) {
2022 foreach my $p ( $lc->$dir ) {
2023 if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r2' ) {
2024 # say STDERR "Path candidate $p from $lc";
2025 push( @candidates, $p );
2026 } elsif( !$all_seen{$p->id} ) {
2027 $all_seen{$p->id} = 'r2';
2028 push( @new_lc2, $p );
2032 @last_r1 = @new_lc1;
2033 @last_r2 = @new_lc2;
2035 my @answer = sort { $a->rank <=> $b->rank } @candidates;
2036 return $dir eq 'predecessors' ? pop( @answer ) : shift ( @answer );
2040 Text::Tradition::Error->throw(
2041 'ident' => 'Collation error',
2047 __PACKAGE__->meta->make_immutable;
2053 =item * Rework XML serialization in a more modular way
2059 This package is free software and is provided "as is" without express
2060 or implied warranty. You can redistribute it and/or modify it under
2061 the same terms as Perl itself.
2065 Tara L Andrews E<lt>aurum@cpan.orgE<gt>