1 package Text::Tradition::Collation;
3 use Encode qw( decode_utf8 );
6 use IPC::Run qw( run binary );
8 use Text::Tradition::Collation::Reading;
9 use Text::Tradition::Collation::RelationshipStore;
10 use Text::Tradition::Error;
12 use XML::LibXML::XPathContext;
18 default => sub { Graph->new() },
26 isa => 'Text::Tradition::Collation::RelationshipStore',
28 relationships => 'relationships',
29 related_readings => 'related_readings',
31 writer => '_set_relations',
36 isa => 'Text::Tradition',
41 isa => 'HashRef[Text::Tradition::Collation::Reading]',
45 _add_reading => 'set',
46 del_reading => 'delete',
47 has_reading => 'exists',
50 default => sub { {} },
53 has 'wit_list_separator' => (
62 default => 'base text',
79 isa => 'Text::Tradition::Collation::Reading',
80 writer => '_set_start',
86 isa => 'Text::Tradition::Collation::Reading',
93 Text::Tradition::Collation - a software model for a text collation
98 my $t = Text::Tradition->new(
99 'name' => 'this is a text',
101 'file' => '/path/to/tei_parallel_seg_file.xml' );
103 my $c = $t->collation;
104 my @readings = $c->readings;
105 my @paths = $c->paths;
106 my @relationships = $c->relationships;
108 my $svg_variant_graph = $t->collation->as_svg();
112 Text::Tradition is a library for representation and analysis of collated
113 texts, particularly medieval ones. The Collation is the central feature of
114 a Tradition, where the text, its sequence of readings, and its relationships
115 between readings are actually kept.
121 The constructor. Takes a hash or hashref of the following arguments:
125 =item * tradition - The Text::Tradition object to which the collation
128 =item * linear - Whether the collation should be linear; that is, whether
129 transposed readings should be treated as two linked readings rather than one,
130 and therefore whether the collation graph is acyclic. Defaults to true.
132 =item * baselabel - The default label for the path taken by a base text
133 (if any). Defaults to 'base text'.
135 =item * wit_list_separator - The string to join a list of witnesses for
136 purposes of making labels in display graphs. Defaults to ', '.
138 =item * ac_label - The extra label to tack onto a witness sigil when
139 representing another layer of path for the given witness - that is, when
140 a text has more than one possible reading due to scribal corrections or
141 the like. Defaults to ' (a.c.)'.
151 =head2 wit_list_separator
157 Simple accessors for collation attributes.
161 The meta-reading at the start of every witness path.
165 The meta-reading at the end of every witness path.
169 Returns all Reading objects in the graph.
171 =head2 reading( $id )
173 Returns the Reading object corresponding to the given ID.
175 =head2 add_reading( $reading_args )
177 Adds a new reading object to the collation.
178 See L<Text::Tradition::Collation::Reading> for the available arguments.
180 =head2 del_reading( $object_or_id )
182 Removes the given reading from the collation, implicitly removing its
183 paths and relationships.
185 =head2 merge_readings( $main, $second )
187 Merges the $second reading into the $main one.
188 The arguments may be either readings or reading IDs.
190 =head2 has_reading( $id )
192 Predicate to see whether a given reading ID is in the graph.
194 =head2 reading_witnesses( $object_or_id )
196 Returns a list of sigils whose witnesses contain the reading.
200 Returns all reading paths within the document - that is, all edges in the
201 collation graph. Each path is an arrayref of [ $source, $target ] reading IDs.
203 =head2 add_path( $source, $target, $sigil )
205 Links the given readings in the collation in sequence, under the given witness
206 sigil. The readings may be specified by object or ID.
208 =head2 del_path( $source, $target, $sigil )
210 Links the given readings in the collation in sequence, under the given witness
211 sigil. The readings may be specified by object or ID.
213 =head2 has_path( $source, $target );
215 Returns true if the two readings are linked in sequence in any witness.
216 The readings may be specified by object or ID.
220 Returns all Relationship objects in the collation.
222 =head2 add_relationship( $reading, $other_reading, $options )
224 Adds a new relationship of the type given in $options between the two readings,
225 which may be specified by object or ID. Returns a value of ( $status, @vectors)
226 where $status is true on success, and @vectors is a list of relationship edges
227 that were ultimately added.
228 See L<Text::Tradition::Collation::Relationship> for the available options.
234 $self->_set_relations( Text::Tradition::Collation::RelationshipStore->new( 'collation' => $self ) );
235 $self->_set_start( $self->add_reading( { 'collation' => $self, 'is_start' => 1 } ) );
236 $self->_set_end( $self->add_reading( { 'collation' => $self, 'is_end' => 1 } ) );
239 ### Reading construct/destruct functions
242 my( $self, $reading ) = @_;
243 unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
244 my %args = %$reading;
245 $reading = Text::Tradition::Collation::Reading->new(
246 'collation' => $self,
249 # First check to see if a reading with this ID exists.
250 if( $self->reading( $reading->id ) ) {
251 throw( "Collation already has a reading with id " . $reading->id );
253 $self->_add_reading( $reading->id => $reading );
254 # Once the reading has been added, put it in both graphs.
255 $self->sequence->add_vertex( $reading->id );
256 $self->relations->add_reading( $reading->id );
260 around del_reading => sub {
265 if( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
268 # Remove the reading from the graphs.
269 $self->sequence->delete_vertex( $arg );
270 $self->relations->delete_reading( $arg );
273 $self->$orig( $arg );
276 # merge_readings( $main, $to_be_deleted );
281 # We only need the IDs for adding paths to the graph, not the reading
282 # objects themselves.
283 my( $kept, $deleted, $combine_char ) = $self->_stringify_args( @_ );
285 # The kept reading should inherit the paths and the relationships
286 # of the deleted reading.
287 foreach my $path ( $self->sequence->edges_at( $deleted ) ) {
288 my @vector = ( $kept );
289 push( @vector, $path->[1] ) if $path->[0] eq $deleted;
290 unshift( @vector, $path->[0] ) if $path->[1] eq $deleted;
291 next if $vector[0] eq $vector[1]; # Don't add a self loop
292 my %wits = %{$self->sequence->get_edge_attributes( @$path )};
293 $self->sequence->add_edge( @vector );
294 my $fwits = $self->sequence->get_edge_attributes( @vector );
295 @wits{keys %$fwits} = values %$fwits;
296 $self->sequence->set_edge_attributes( @vector, \%wits );
298 $self->relations->merge_readings( $kept, $deleted, $combine_char );
300 # Do the deletion deed.
301 if( $combine_char ) {
302 my $kept_obj = $self->reading( $kept );
303 my $new_text = join( $combine_char, $kept_obj->text,
304 $self->reading( $deleted )->text );
305 $kept_obj->alter_text( $new_text );
307 $self->del_reading( $deleted );
311 # Helper function for manipulating the graph.
312 sub _stringify_args {
313 my( $self, $first, $second, $arg ) = @_;
315 if ref( $first ) eq 'Text::Tradition::Collation::Reading';
316 $second = $second->id
317 if ref( $second ) eq 'Text::Tradition::Collation::Reading';
318 return( $first, $second, $arg );
321 # Helper function for manipulating the graph.
322 sub _objectify_args {
323 my( $self, $first, $second, $arg ) = @_;
324 $first = $self->reading( $first )
325 unless ref( $first ) eq 'Text::Tradition::Collation::Reading';
326 $second = $self->reading( $second )
327 unless ref( $second ) eq 'Text::Tradition::Collation::Reading';
328 return( $first, $second, $arg );
335 # We only need the IDs for adding paths to the graph, not the reading
336 # objects themselves.
337 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
339 # Connect the readings
340 $self->sequence->add_edge( $source, $target );
341 # Note the witness in question
342 $self->sequence->set_edge_attribute( $source, $target, $wit, 1 );
348 if( ref( $_[0] ) eq 'ARRAY' ) {
355 # We only need the IDs for adding paths to the graph, not the reading
356 # objects themselves.
357 my( $source, $target, $wit ) = $self->_stringify_args( @args );
359 if( $self->sequence->has_edge_attribute( $source, $target, $wit ) ) {
360 $self->sequence->delete_edge_attribute( $source, $target, $wit );
362 unless( keys %{$self->sequence->get_edge_attributes( $source, $target )} ) {
363 $self->sequence->delete_edge( $source, $target );
368 # Extra graph-alike utility
371 my( $source, $target, $wit ) = $self->_stringify_args( @_ );
372 return undef unless $self->sequence->has_edge( $source, $target );
373 return $self->sequence->has_edge_attribute( $source, $target, $wit );
376 =head2 clear_witness( @sigil_list )
378 Clear the given witnesses out of the collation entirely, removing references
379 to them in paths, and removing readings that belong only to them. Should only
380 be called via $tradition->del_witness.
385 my( $self, @sigils ) = @_;
387 # Clear the witness(es) out of the paths
388 foreach my $e ( $self->paths ) {
389 foreach my $sig ( @sigils ) {
390 $self->del_path( $e, $sig );
394 # Clear out the newly unused readings
395 foreach my $r ( $self->readings ) {
396 unless( $self->reading_witnesses( $r ) ) {
397 $self->del_reading( $r );
402 sub add_relationship {
404 my( $source, $target, $opts ) = $self->_stringify_args( @_ );
405 my( @vectors ) = $self->relations->add_relationship( $source,
406 $self->reading( $source ), $target, $self->reading( $target ), $opts );
407 # Force a full rank recalculation every time. Yuck.
408 $self->calculate_ranks() if $self->end->has_rank;
412 =head2 reading_witnesses( $reading )
414 Return a list of sigils corresponding to the witnesses in which the reading appears.
418 sub reading_witnesses {
419 my( $self, $reading ) = @_;
420 # We need only check either the incoming or the outgoing edges; I have
421 # arbitrarily chosen "incoming". Thus, special-case the start node.
422 if( $reading eq $self->start ) {
423 return map { $_->sigil } $self->tradition->witnesses;
426 foreach my $e ( $self->sequence->edges_to( $reading ) ) {
427 my $wits = $self->sequence->get_edge_attributes( @$e );
428 @all_witnesses{ keys %$wits } = 1;
430 return keys %all_witnesses;
433 =head1 OUTPUT METHODS
435 =head2 as_svg( \%options )
437 Returns an SVG string that represents the graph, via as_dot and graphviz.
438 See as_dot for a list of options.
443 my( $self, $opts ) = @_;
445 my @cmd = qw/dot -Tsvg/;
447 my $dotfile = File::Temp->new();
449 # $dotfile->unlink_on_destroy(0);
450 binmode $dotfile, ':utf8';
451 print $dotfile $self->as_dot( $opts );
452 push( @cmd, $dotfile->filename );
453 run( \@cmd, ">", binary(), \$svg );
454 return decode_utf8( $svg );
458 =head2 as_dot( \%options )
460 Returns a string that is the collation graph expressed in dot
461 (i.e. GraphViz) format. Options include:
476 my( $self, $opts ) = @_;
477 my $startrank = $opts->{'from'} if $opts;
478 my $endrank = $opts->{'to'} if $opts;
479 my $color_common = $opts->{'color_common'} if $opts;
481 # Check the arguments
483 return if $endrank && $startrank > $endrank;
484 return if $startrank > $self->end->rank;
486 if( defined $endrank ) {
487 return if $endrank < 0;
488 $endrank = undef if $endrank == $self->end->rank;
491 my $graph_name = $self->tradition->name;
492 $graph_name =~ s/[^\w\s]//g;
493 $graph_name = join( '_', split( /\s+/, $graph_name ) );
501 'fillcolor' => 'white',
506 'arrowhead' => 'open',
507 'color' => '#000000',
508 'fontcolor' => '#000000',
511 my $dot = sprintf( "digraph %s {\n", $graph_name );
512 $dot .= "\tgraph " . _dot_attr_string( \%graph_attrs ) . ";\n";
513 $dot .= "\tnode " . _dot_attr_string( \%node_attrs ) . ";\n";
515 # Output substitute start/end readings if necessary
517 $dot .= "\t\"#SUBSTART#\" [ label=\"...\" ];\n";
520 $dot .= "\t\"#SUBEND#\" [ label=\"...\" ];\n";
523 my %used; # Keep track of the readings that actually appear in the graph
524 # Sort the readings by rank if we have ranks; this speeds layout.
525 my @all_readings = $self->end->has_rank
526 ? sort { $a->rank <=> $b->rank } $self->readings
528 # TODO Refrain from outputting lacuna nodes - just grey out the edges.
529 foreach my $reading ( @all_readings ) {
530 # Only output readings within our rank range.
531 next if $startrank && $reading->rank < $startrank;
532 next if $endrank && $reading->rank > $endrank;
533 $used{$reading->id} = 1;
534 # Need not output nodes without separate labels
535 next if $reading->id eq $reading->text;
537 my $label = $reading->text;
538 $label =~ s/\"/\\\"/g;
539 $rattrs->{'label'} = $label;
540 $rattrs->{'fillcolor'} = '#b3f36d' if $reading->is_common && $color_common;
541 $dot .= sprintf( "\t\"%s\" %s;\n", $reading->id, _dot_attr_string( $rattrs ) );
544 # Add the real edges. Need to weight one edge per rank jump, in a
546 my $weighted = $self->_add_edge_weights;
547 my @edges = $self->paths;
548 my( %substart, %subend );
549 foreach my $edge ( @edges ) {
550 # Do we need to output this edge?
551 if( $used{$edge->[0]} && $used{$edge->[1]} ) {
552 my $label = $self->_path_display_label( $self->path_witnesses( $edge ) );
553 my $variables = { %edge_attrs, 'label' => $label };
555 # Account for the rank gap if necessary
556 my $rank0 = $self->reading( $edge->[0] )->rank
557 if $self->reading( $edge->[0] )->has_rank;
558 my $rank1 = $self->reading( $edge->[1] )->rank
559 if $self->reading( $edge->[1] )->has_rank;
560 if( defined $rank0 && defined $rank1 && $rank1 - $rank0 > 1 ) {
561 $variables->{'minlen'} = $rank1 - $rank0;
564 # Add the calculated edge weights
565 if( exists $weighted->{$edge->[0]}
566 && $weighted->{$edge->[0]} eq $edge->[1] ) {
567 # $variables->{'color'} = 'red';
568 $variables->{'weight'} = 3.0;
571 # EXPERIMENTAL: make edge width reflect no. of witnesses
572 my $extrawidth = scalar( $self->path_witnesses( $edge ) ) * 0.2;
573 $variables->{'penwidth'} = $extrawidth + 0.8; # gives 1 for a single wit
575 my $varopts = _dot_attr_string( $variables );
576 $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
577 $edge->[0], $edge->[1], $varopts );
578 } elsif( $used{$edge->[0]} ) {
579 $subend{$edge->[0]} = 1;
580 } elsif( $used{$edge->[1]} ) {
581 $substart{$edge->[1]} = 1;
584 # Add substitute start and end edges if necessary
585 foreach my $node ( keys %substart ) {
586 my $witstr = $self->_path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) );
587 my $variables = { %edge_attrs, 'label' => $witstr };
588 my $varopts = _dot_attr_string( $variables );
589 $dot .= "\t\"#SUBSTART#\" -> \"$node\" $varopts;";
591 foreach my $node ( keys %subend ) {
592 my $witstr = $self->_path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) );
593 my $variables = { %edge_attrs, 'label' => $witstr };
594 my $varopts = _dot_attr_string( $variables );
595 $dot .= "\t\"$node\" -> \"#SUBEND#\" $varopts;";
602 sub _dot_attr_string {
605 foreach my $k ( sort keys %$hash ) {
607 push( @attrs, $k.'="'.$v.'"' );
609 return( '[ ' . join( ', ', @attrs ) . ' ]' );
612 sub _add_edge_weights {
614 # Walk the graph from START to END, choosing the successor node with
615 # the largest number of witness paths each time.
617 my $curr = $self->start->id;
618 my $ranked = $self->end->has_rank;
619 while( $curr ne $self->end->id ) {
620 my $rank = $ranked ? $self->reading( $curr )->rank : 0;
621 my @succ = sort { $self->path_witnesses( $curr, $a )
622 <=> $self->path_witnesses( $curr, $b ) }
623 $self->sequence->successors( $curr );
624 my $next = pop @succ;
625 my $nextrank = $ranked ? $self->reading( $next )->rank : 0;
626 # Try to avoid lacunae in the weighted path.
628 ( $self->reading( $next )->is_lacuna ||
629 $nextrank - $rank > 1 ) ){
632 $weighted->{$curr} = $next;
638 =head2 path_witnesses( $edge )
640 Returns the list of sigils whose witnesses are associated with the given edge.
641 The edge can be passed as either an array or an arrayref of ( $source, $target ).
646 my( $self, @edge ) = @_;
647 # If edge is an arrayref, cope.
648 if( @edge == 1 && ref( $edge[0] ) eq 'ARRAY' ) {
652 my @wits = keys %{$self->sequence->get_edge_attributes( @edge )};
656 sub _path_display_label {
659 my $maj = scalar( $self->tradition->witnesses ) * 0.6;
660 if( scalar @wits > $maj ) {
661 # TODO break out a.c. wits
664 return join( ', ', @wits );
671 Returns a GraphML representation of the collation. The GraphML will contain
672 two graphs. The first expresses the attributes of the readings and the witness
673 paths that link them; the second expresses the relationships that link the
674 readings. This is the native transfer format for a tradition.
683 my $datafile = 't/data/florilegium_tei_ps.xml';
684 my $tradition = Text::Tradition->new( 'input' => 'TEI',
689 ok( $tradition, "Got a tradition object" );
690 is( scalar $tradition->witnesses, 13, "Found all witnesses" );
691 ok( $tradition->collation, "Tradition has a collation" );
693 my $c = $tradition->collation;
694 is( scalar $c->readings, $READINGS, "Collation has all readings" );
695 is( scalar $c->paths, $PATHS, "Collation has all paths" );
696 is( scalar $c->relationships, 0, "Collation has all relationships" );
698 # Add a few relationships
699 $c->add_relationship( 'w123', 'w125', { 'type' => 'collated' } );
700 $c->add_relationship( 'w193', 'w196', { 'type' => 'collated' } );
701 $c->add_relationship( 'w257', 'w262', { 'type' => 'transposition' } );
703 # Now write it to GraphML and parse it again.
705 my $graphml = $c->as_graphml;
706 my $st = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml );
707 is( scalar $st->collation->readings, $READINGS, "Reparsed collation has all readings" );
708 is( scalar $st->collation->paths, $PATHS, "Reparsed collation has all paths" );
709 is( scalar $st->collation->relationships, 3, "Reparsed collation has new relationships" );
719 my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns';
720 my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance';
721 my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' .
722 'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
724 # Create the document and root node
725 my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
726 my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
727 $graphml->setDocumentElement( $root );
728 $root->setNamespace( $xsi_ns, 'xsi', 0 );
729 $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
731 # Add the data keys for the graph
734 my @graph_attributes = qw/ version wit_list_separator baselabel linear ac_label /;
735 foreach my $datum ( @graph_attributes ) {
736 $graph_data_keys{$datum} = 'dg'.$gdi++;
737 my $key = $root->addNewChild( $graphml_ns, 'key' );
738 $key->setAttribute( 'attr.name', $datum );
739 $key->setAttribute( 'attr.type', $key eq 'linear' ? 'boolean' : 'string' );
740 $key->setAttribute( 'for', 'graph' );
741 $key->setAttribute( 'id', $graph_data_keys{$datum} );
744 # Add the data keys for nodes
751 is_start => 'boolean',
753 is_lacuna => 'boolean',
755 foreach my $datum ( keys %node_data ) {
756 $node_data_keys{$datum} = 'dn'.$ndi++;
757 my $key = $root->addNewChild( $graphml_ns, 'key' );
758 $key->setAttribute( 'attr.name', $datum );
759 $key->setAttribute( 'attr.type', $node_data{$datum} );
760 $key->setAttribute( 'for', 'node' );
761 $key->setAttribute( 'id', $node_data_keys{$datum} );
764 # Add the data keys for edges, i.e. witnesses
768 class => 'string', # Class, deprecated soon
769 witness => 'string', # ID/label for a path
770 relationship => 'string', # ID/label for a relationship
771 extra => 'boolean', # Path key
772 scope => 'string', # Relationship key
773 non_correctable => 'boolean', # Relationship key
774 non_independent => 'boolean', # Relationship key
776 foreach my $datum ( keys %edge_data ) {
777 $edge_data_keys{$datum} = 'de'.$edi++;
778 my $key = $root->addNewChild( $graphml_ns, 'key' );
779 $key->setAttribute( 'attr.name', $datum );
780 $key->setAttribute( 'attr.type', $edge_data{$datum} );
781 $key->setAttribute( 'for', 'edge' );
782 $key->setAttribute( 'id', $edge_data_keys{$datum} );
785 # Add the collation graph itself
786 my $sgraph = $root->addNewChild( $graphml_ns, 'graph' );
787 $sgraph->setAttribute( 'edgedefault', 'directed' );
788 $sgraph->setAttribute( 'id', $self->tradition->name );
789 $sgraph->setAttribute( 'parse.edgeids', 'canonical' );
790 $sgraph->setAttribute( 'parse.edges', scalar($self->paths) );
791 $sgraph->setAttribute( 'parse.nodeids', 'canonical' );
792 $sgraph->setAttribute( 'parse.nodes', scalar($self->readings) );
793 $sgraph->setAttribute( 'parse.order', 'nodesfirst' );
795 # Collation attribute data
796 foreach my $datum ( @graph_attributes ) {
797 my $value = $datum eq 'version' ? '3.0' : $self->$datum;
798 _add_graphml_data( $sgraph, $graph_data_keys{$datum}, $value );
803 # Add our readings to the graph
804 foreach my $n ( sort { $a->id cmp $b->id } $self->readings ) {
805 # Add to the main graph
806 my $node_el = $sgraph->addNewChild( $graphml_ns, 'node' );
807 my $node_xmlid = 'n' . $node_ctr++;
808 $node_hash{ $n->id } = $node_xmlid;
809 $node_el->setAttribute( 'id', $node_xmlid );
810 foreach my $d ( keys %node_data ) {
812 _add_graphml_data( $node_el, $node_data_keys{$d}, $nval )
817 # Add the path edges to the sequence graph
819 foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->sequence->edges() ) {
820 # We add an edge in the graphml for every witness in $e.
821 foreach my $wit ( sort $self->path_witnesses( $e ) ) {
822 my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
823 $node_hash{ $e->[0] },
824 $node_hash{ $e->[1] } );
825 my $edge_el = $sgraph->addNewChild( $graphml_ns, 'edge' );
826 $edge_el->setAttribute( 'source', $from );
827 $edge_el->setAttribute( 'target', $to );
828 $edge_el->setAttribute( 'id', $id );
830 # It's a witness path, so add the witness
832 my $key = $edge_data_keys{'witness'};
833 # Is this an ante-corr witness?
834 my $aclabel = $self->ac_label;
835 if( $wit =~ /^(.*)\Q$aclabel\E$/ ) {
836 # Keep the base witness
838 # ...and record that this is an 'extra' reading path
839 _add_graphml_data( $edge_el, $edge_data_keys{'extra'}, $aclabel );
841 _add_graphml_data( $edge_el, $edge_data_keys{'witness'}, $base );
842 _add_graphml_data( $edge_el, $edge_data_keys{'class'}, 'path' );
846 # Add the relationship graph to the XML
847 $self->relations->_as_graphml( $graphml_ns, $root, \%node_hash,
848 $node_data_keys{'id'}, \%edge_data_keys );
850 # Save and return the thing
851 my $result = decode_utf8( $graphml->toString(1) );
855 sub _add_graphml_data {
856 my( $el, $key, $value ) = @_;
857 return unless defined $value;
858 my $data_el = $el->addNewChild( $el->namespaceURI, 'data' );
859 $data_el->setAttribute( 'key', $key );
860 $data_el->appendText( $value );
865 Returns a CSV alignment table representation of the collation graph, one
866 row per witness (or witness uncorrected.)
872 my $table = $self->make_alignment_table;
873 my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } );
875 # Make the header row
876 $csv->combine( map { $_->{'witness'} } @{$table->{'alignment'}} );
877 push( @result, decode_utf8( $csv->string ) );
878 # Make the rest of the rows
879 foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
880 my @rowobjs = map { $_->{'tokens'}->[$idx] } @{$table->{'alignment'}};
881 my @row = map { $_ ? $_->{'t'} : $_ } @rowobjs;
882 $csv->combine( @row );
883 push( @result, decode_utf8( $csv->string ) );
885 return join( "\n", @result );
888 =head2 make_alignment_table( $use_refs, $include_witnesses )
890 Return a reference to an alignment table, in a slightly enhanced CollateX
891 format which looks like this:
893 $table = { alignment => [ { witness => "SIGIL",
894 tokens => [ { t => "TEXT" }, ... ] },
896 tokens => [ { t => "TEXT" }, ... ] },
900 If $use_refs is set to 1, the reading object is returned in the table
901 instead of READINGTEXT; if not, the text of the reading is returned.
903 If $include_witnesses is set to a hashref, only the witnesses whose sigil
904 keys have a true hash value will be included.
908 sub make_alignment_table {
909 my( $self, $noderefs, $include ) = @_;
910 # Make sure we can do this
911 throw( "Need a linear graph in order to make an alignment table" )
912 unless $self->linear;
913 $self->calculate_ranks unless $self->end->has_rank;
915 my $table = { 'alignment' => [], 'length' => $self->end->rank - 1 };
916 my @all_pos = ( 1 .. $self->end->rank - 1 );
917 foreach my $wit ( sort { $a->sigil cmp $b->sigil } $self->tradition->witnesses ) {
919 next unless $include->{$wit->sigil};
921 # print STDERR "Making witness row(s) for " . $wit->sigil . "\n";
922 my @wit_path = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
923 my @row = _make_witness_row( \@wit_path, \@all_pos, $noderefs );
924 push( @{$table->{'alignment'}},
925 { 'witness' => $wit->sigil, 'tokens' => \@row } );
926 if( $wit->is_layered ) {
927 my @wit_ac_path = $self->reading_sequence( $self->start, $self->end,
928 $wit->sigil.$self->ac_label );
929 my @ac_row = _make_witness_row( \@wit_ac_path, \@all_pos, $noderefs );
930 push( @{$table->{'alignment'}},
931 { 'witness' => $wit->sigil.$self->ac_label, 'tokens' => \@ac_row } );
937 sub _make_witness_row {
938 my( $path, $positions, $noderefs ) = @_;
940 map { $char_hash{$_} = undef } @$positions;
942 foreach my $rdg ( @$path ) {
943 my $rtext = $rdg->text;
944 $rtext = '#LACUNA#' if $rdg->is_lacuna;
945 print STDERR "rank " . $rdg->rank . "\n" if $debug;
946 # print STDERR "No rank for " . $rdg->id . "\n" unless defined $rdg->rank;
947 $char_hash{$rdg->rank} = $noderefs ? { 't' => $rdg }
950 my @row = map { $char_hash{$_} } @$positions;
951 # Fill in lacuna markers for undef spots in the row
952 my $last_el = shift @row;
953 my @filled_row = ( $last_el );
954 foreach my $el ( @row ) {
955 # If we are using node reference, make the lacuna node appear many times
956 # in the table. If not, use the lacuna tag.
957 if( $last_el && _el_is_lacuna( $last_el ) && !defined $el ) {
958 $el = $noderefs ? $last_el : { 't' => '#LACUNA#' };
960 push( @filled_row, $el );
966 # Tiny utility function to say if a table element is a lacuna
969 return 1 if $el->{'t'} eq '#LACUNA#';
970 return 1 if ref( $el->{'t'} ) eq 'Text::Tradition::Collation::Reading'
971 && $el->{'t'}->is_lacuna;
975 # Helper to turn the witnesses along columns rather than rows. Assumes
980 return $result unless scalar @$table;
981 my $nrows = scalar @{$table->[0]};
982 foreach my $idx ( 0 .. $nrows - 1 ) {
983 foreach my $wit ( 0 .. $#{$table} ) {
984 $result->[$idx]->[$wit] = $table->[$wit]->[$idx];
990 =head1 NAVIGATION METHODS
992 =head2 reading_sequence( $first, $last, $sigil, $backup )
994 Returns the ordered list of readings, starting with $first and ending
995 with $last, for the witness given in $sigil. If a $backup sigil is
996 specified (e.g. when walking a layered witness), it will be used wherever
997 no $sigil path exists. If there is a base text reading, that will be
998 used wherever no path exists for $sigil or $backup.
1002 # TODO Think about returning some lazy-eval iterator.
1003 # TODO Get rid of backup; we should know from what witness is whether we need it.
1005 sub reading_sequence {
1006 my( $self, $start, $end, $witness ) = @_;
1008 $witness = $self->baselabel unless $witness;
1009 my @readings = ( $start );
1012 while( $n && $n->id ne $end->id ) {
1013 if( exists( $seen{$n->id} ) ) {
1014 throw( "Detected loop for $witness at " . $n->id );
1018 my $next = $self->next_reading( $n, $witness );
1020 throw( "Did not find any path for $witness from reading " . $n->id );
1022 push( @readings, $next );
1025 # Check that the last reading is our end reading.
1026 my $last = $readings[$#readings];
1027 throw( "Last reading found from " . $start->text .
1028 " for witness $witness is not the end!" ) # TODO do we get this far?
1029 unless $last->id eq $end->id;
1034 =head2 next_reading( $reading, $sigil );
1036 Returns the reading that follows the given reading along the given witness
1042 # Return the successor via the corresponding path.
1044 my $answer = $self->_find_linked_reading( 'next', @_ );
1045 return undef unless $answer;
1046 return $self->reading( $answer );
1049 =head2 prior_reading( $reading, $sigil )
1051 Returns the reading that precedes the given reading along the given witness
1057 # Return the predecessor via the corresponding path.
1059 my $answer = $self->_find_linked_reading( 'prior', @_ );
1060 return $self->reading( $answer );
1063 sub _find_linked_reading {
1064 my( $self, $direction, $node, $path ) = @_;
1066 # Get a backup if we are dealing with a layered witness
1068 my $aclabel = $self->ac_label;
1069 if( $path && $path =~ /^(.*)\Q$aclabel\E$/ ) {
1073 my @linked_paths = $direction eq 'next'
1074 ? $self->sequence->edges_from( $node )
1075 : $self->sequence->edges_to( $node );
1076 return undef unless scalar( @linked_paths );
1078 # We have to find the linked path that contains all of the
1079 # witnesses supplied in $path.
1080 my( @path_wits, @alt_path_wits );
1081 @path_wits = sort( $self->_witnesses_of_label( $path ) ) if $path;
1082 @alt_path_wits = sort( $self->_witnesses_of_label( $alt_path ) ) if $alt_path;
1085 foreach my $le ( @linked_paths ) {
1086 if( $self->sequence->has_edge_attribute( @$le, $self->baselabel ) ) {
1089 my @le_wits = sort $self->path_witnesses( $le );
1090 if( _is_within( \@path_wits, \@le_wits ) ) {
1091 # This is the right path.
1092 return $direction eq 'next' ? $le->[1] : $le->[0];
1093 } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) {
1097 # Got this far? Return the alternate path if it exists.
1098 return $direction eq 'next' ? $alt_le->[1] : $alt_le->[0]
1101 # Got this far? Return the base path if it exists.
1102 return $direction eq 'next' ? $base_le->[1] : $base_le->[0]
1105 # Got this far? We have no appropriate path.
1106 warn "Could not find $direction node from " . $node->id
1107 . " along path $path";
1113 my( $set1, $set2 ) = @_;
1114 my $ret = @$set1; # will be 0, i.e. false, if set1 is empty
1115 foreach my $el ( @$set1 ) {
1116 $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
1121 # Return the string that joins together a list of witnesses for
1122 # display on a single path.
1123 sub _witnesses_of_label {
1124 my( $self, $label ) = @_;
1125 my $regex = $self->wit_list_separator;
1126 my @answer = split( /\Q$regex\E/, $label );
1130 =head2 common_readings
1132 Returns the list of common readings in the graph (i.e. those readings that are
1133 shared by all non-lacunose witnesses.)
1137 sub common_readings {
1139 my @common = grep { $_->is_common } $self->readings;
1143 =head2 path_text( $sigil, $mainsigil [, $start, $end ] )
1145 Returns the text of a witness (plus its backup, if we are using a layer)
1146 as stored in the collation. The text is returned as a string, where the
1147 individual readings are joined with spaces and the meta-readings (e.g.
1148 lacunae) are omitted. Optional specification of $start and $end allows
1149 the generation of a subset of the witness text.
1154 my( $self, $wit, $start, $end ) = @_;
1155 $start = $self->start unless $start;
1156 $end = $self->end unless $end;
1157 my @path = grep { !$_->is_meta } $self->reading_sequence( $start, $end, $wit );
1158 return join( ' ', map { $_->text } @path );
1161 =head1 INITIALIZATION METHODS
1163 These are mostly for use by parsers.
1165 =head2 make_witness_path( $witness )
1167 Link the array of readings contained in $witness->path (and in
1168 $witness->uncorrected_path if it exists) into collation paths.
1169 Clear out the arrays when finished.
1171 =head2 make_witness_paths
1173 Call make_witness_path for all witnesses in the tradition.
1177 # For use when a collation is constructed from a base text and an apparatus.
1178 # We have the sequences of readings and just need to add path edges.
1179 # When we are done, clear out the witness path attributes, as they are no
1181 # TODO Find a way to replace the witness path attributes with encapsulated functions?
1183 sub make_witness_paths {
1185 foreach my $wit ( $self->tradition->witnesses ) {
1186 # print STDERR "Making path for " . $wit->sigil . "\n";
1187 $self->make_witness_path( $wit );
1191 sub make_witness_path {
1192 my( $self, $wit ) = @_;
1193 my @chain = @{$wit->path};
1194 my $sig = $wit->sigil;
1195 foreach my $idx ( 0 .. $#chain-1 ) {
1196 $self->add_path( $chain[$idx], $chain[$idx+1], $sig );
1198 if( $wit->is_layered ) {
1199 @chain = @{$wit->uncorrected_path};
1200 foreach my $idx( 0 .. $#chain-1 ) {
1201 my $source = $chain[$idx];
1202 my $target = $chain[$idx+1];
1203 $self->add_path( $source, $target, $sig.$self->ac_label )
1204 unless $self->has_path( $source, $target, $sig );
1208 $wit->clear_uncorrected_path;
1211 =head2 calculate_ranks
1213 Calculate the reading ranks (that is, their aligned positions relative
1214 to each other) for the graph. This can only be called on linear collations.
1218 sub calculate_ranks {
1220 # Walk a version of the graph where every node linked by a relationship
1221 # edge is fundamentally the same node, and do a topological ranking on
1222 # the nodes in this graph.
1223 my $topo_graph = Graph->new();
1227 foreach my $r ( $self->readings ) {
1228 next if exists $rel_containers{$r->id};
1229 my @rels = $r->related_readings( 'colocated' );
1231 # Make a relationship container.
1233 my $rn = 'rel_container_' . $rel_ctr++;
1234 $topo_graph->add_vertex( $rn );
1236 $rel_containers{$_->id} = $rn;
1239 # Add a new node to mirror the old node.
1240 $rel_containers{$r->id} = $r->id;
1241 $topo_graph->add_vertex( $r->id );
1246 foreach my $r ( $self->readings ) {
1247 foreach my $n ( $self->sequence->successors( $r->id ) ) {
1248 my( $tfrom, $tto ) = ( $rel_containers{$r->id},
1249 $rel_containers{$n} );
1250 # $DB::single = 1 unless $tfrom && $tto;
1251 $topo_graph->add_edge( $tfrom, $tto );
1255 # Now do the rankings, starting with the start node.
1256 my $topo_start = $rel_containers{$self->start->id};
1257 my $node_ranks = { $topo_start => 0 };
1258 my @curr_origin = ( $topo_start );
1259 # A little iterative function.
1260 while( @curr_origin ) {
1261 @curr_origin = _assign_rank( $topo_graph, $node_ranks, @curr_origin );
1263 # Transfer our rankings from the topological graph to the real one.
1264 foreach my $r ( $self->readings ) {
1265 if( defined $node_ranks->{$rel_containers{$r->id}} ) {
1266 $r->rank( $node_ranks->{$rel_containers{$r->id}} );
1268 # Die. Find the last rank we calculated.
1269 my @all_defined = sort { $node_ranks->{$rel_containers{$a->id}}
1270 <=> $node_ranks->{$rel_containers{$b->id}} }
1272 my $last = pop @all_defined;
1273 throw( "Ranks not calculated after $last - do you have a cycle in the graph?" );
1279 my( $graph, $node_ranks, @current_nodes ) = @_;
1280 # Look at each of the children of @current_nodes. If all the child's
1281 # parents have a rank, assign it the highest rank + 1 and add it to
1282 # @next_nodes. Otherwise skip it; we will return when the highest-ranked
1283 # parent gets a rank.
1285 foreach my $c ( @current_nodes ) {
1286 warn "Current reading $c has no rank!"
1287 unless exists $node_ranks->{$c};
1288 # print STDERR "Looking at child of node $c, rank "
1289 # . $node_ranks->{$c} . "\n";
1290 foreach my $child ( $graph->successors( $c ) ) {
1291 next if exists $node_ranks->{$child};
1292 my $highest_rank = -1;
1294 foreach my $parent ( $graph->predecessors( $child ) ) {
1295 if( exists $node_ranks->{$parent} ) {
1296 $highest_rank = $node_ranks->{$parent}
1297 if $highest_rank <= $node_ranks->{$parent};
1304 my $c_rank = $highest_rank + 1;
1305 # print STDERR "Assigning rank $c_rank to node $child \n";
1306 $node_ranks->{$child} = $c_rank;
1307 push( @next_nodes, $child );
1313 =head2 flatten_ranks
1315 A convenience method for parsing collation data. Searches the graph for readings
1316 with the same text at the same rank, and merges any that are found.
1322 my %unique_rank_rdg;
1323 foreach my $rdg ( $self->readings ) {
1324 next unless $rdg->has_rank;
1325 my $key = $rdg->rank . "||" . $rdg->text;
1326 if( exists $unique_rank_rdg{$key} ) {
1328 # print STDERR "Combining readings at same rank: $key\n";
1329 $self->merge_readings( $unique_rank_rdg{$key}, $rdg );
1330 # TODO see if this now makes a common point.
1332 $unique_rank_rdg{$key} = $rdg;
1337 =head2 remove_collations
1339 Another convenience method for parsing. Removes all 'collation' relationships
1340 that were defined in order to get the reading ranks to be correct.
1344 use Text::Tradition;
1346 my $cxfile = 't/data/Collatex-16.xml';
1347 my $t = Text::Tradition->new(
1349 'input' => 'CollateX',
1352 my $c = $t->collation;
1354 isnt( $c->reading('n23')->rank, $c->reading('n9')->rank, "Rank skew exists" );
1355 $c->add_relationship( 'n23', 'n9', { 'type' => 'collated', 'scope' => 'local' } );
1356 is( scalar $c->relationships, 4, "Found all expected relationships" );
1357 $c->remove_collations;
1358 is( scalar $c->relationships, 3, "Collated relationships now gone" );
1359 is( $c->reading('n23')->rank, $c->reading('n9')->rank, "Aligned ranks were preserved" );
1365 sub remove_collations {
1367 foreach my $reledge ( $self->relationships ) {
1368 my $relobj = $self->relations->get_relationship( $reledge );
1369 if( $relobj && $relobj->type eq 'collated' ) {
1370 $self->relations->delete_relationship( $reledge );
1376 =head2 calculate_common_readings
1378 Goes through the graph identifying the readings that appear in every witness
1379 (apart from those with lacunae at that spot.) Marks them as common and returns
1384 use Text::Tradition;
1386 my $cxfile = 't/data/Collatex-16.xml';
1387 my $t = Text::Tradition->new(
1389 'input' => 'CollateX',
1392 my $c = $t->collation;
1394 my @common = $c->calculate_common_readings();
1395 is( scalar @common, 8, "Found correct number of common readings" );
1396 my @marked = sort $c->common_readings();
1397 is( scalar @common, 8, "All common readings got marked as such" );
1398 my @expected = qw/ n1 n12 n16 n19 n20 n5 n6 n7 /;
1399 is_deeply( \@marked, \@expected, "Found correct list of common readings" );
1405 sub calculate_common_readings {
1408 my $table = $self->make_alignment_table( 1 );
1409 foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
1410 my @row = map { $_->{'tokens'}->[$idx]->{'t'} } @{$table->{'alignment'}};
1412 foreach my $r ( @row ) {
1414 $hash{$r->id} = $r unless $r->is_meta;
1416 $hash{'UNDEF'} = $r;
1419 if( keys %hash == 1 && !exists $hash{'UNDEF'} ) {
1420 my( $r ) = values %hash;
1422 push( @common, $r );
1428 =head2 text_from_paths
1430 Calculate the text array for all witnesses from the path, for later consistency
1431 checking. Only to be used if there is no non-graph-based way to know the
1436 sub text_from_paths {
1438 foreach my $wit ( $self->tradition->witnesses ) {
1439 my @text = split( /\s+/,
1440 $self->reading_sequence( $self->start, $self->end, $wit->sigil ) );
1441 $wit->text( \@text );
1442 if( $wit->is_layered ) {
1443 my @uctext = split( /\s+/,
1444 $self->reading_sequence( $self->start, $self->end,
1445 $wit->sigil.$self->ac_label ) );
1446 $wit->text( \@uctext );
1451 =head1 UTILITY FUNCTIONS
1453 =head2 common_predecessor( $reading_a, $reading_b )
1455 Find the last reading that occurs in sequence before both the given readings.
1457 =head2 common_successor( $reading_a, $reading_b )
1459 Find the first reading that occurs in sequence after both the given readings.
1463 use Text::Tradition;
1465 my $cxfile = 't/data/Collatex-16.xml';
1466 my $t = Text::Tradition->new(
1468 'input' => 'CollateX',
1471 my $c = $t->collation;
1473 is( $c->common_predecessor( 'n9', 'n23' )->id,
1474 'n20', "Found correct common predecessor" );
1475 is( $c->common_successor( 'n9', 'n23' )->id,
1476 '#END#', "Found correct common successor" );
1478 is( $c->common_predecessor( 'n19', 'n17' )->id,
1479 'n16', "Found correct common predecessor for readings on same path" );
1480 is( $c->common_successor( 'n21', 'n26' )->id,
1481 '#END#', "Found correct common successor for readings on same path" );
1487 ## Return the closest reading that is a predecessor of both the given readings.
1488 sub common_predecessor {
1490 my( $r1, $r2 ) = $self->_objectify_args( @_ );
1491 return $self->_common_in_path( $r1, $r2, 'predecessors' );
1494 sub common_successor {
1496 my( $r1, $r2 ) = $self->_objectify_args( @_ );
1497 return $self->_common_in_path( $r1, $r2, 'successors' );
1500 sub _common_in_path {
1501 my( $self, $r1, $r2, $dir ) = @_;
1502 my $iter = $r1->rank > $r2->rank ? $r1->rank : $r2->rank;
1503 $iter = $self->end->rank - $iter if $dir eq 'successors';
1505 my @last_checked = ( $r1, $r2 );
1507 while( !@candidates ) {
1509 foreach my $lc ( @last_checked ) {
1510 foreach my $p ( $lc->$dir ) {
1511 if( $all_seen{$p->id} ) {
1512 push( @candidates, $p );
1514 $all_seen{$p->id} = 1;
1515 push( @new_lc, $p );
1519 @last_checked = @new_lc;
1521 my @answer = sort { $a->rank <=> $b->rank } @candidates;
1522 return $dir eq 'predecessors' ? pop( @answer ) : shift ( @answer );
1526 Text::Tradition::Error->throw(
1527 'ident' => 'Collation error',
1533 __PACKAGE__->meta->make_immutable;
1537 This package is free software and is provided "as is" without express
1538 or implied warranty. You can redistribute it and/or modify it under
1539 the same terms as Perl itself.
1543 Tara L Andrews E<lt>aurum@cpan.orgE<gt>