1 package Text::Tradition::Collation;
4 use IPC::Run qw( run binary );
5 use Text::Tradition::Collation::Reading;
12 add_reading => 'add_node',
13 del_reading => 'del_node',
14 add_path => 'add_edge',
15 del_path => 'del_edge',
21 default => sub { Graph::Easy->new( undirected => 0 ) },
27 isa => 'Text::Tradition',
33 writer => '_save_svg',
34 predicate => 'has_svg',
40 writer => '_save_graphviz',
41 predicate => 'has_graphviz',
46 isa => 'XML::LibXML::Document',
47 writer => '_save_graphml',
48 predicate => 'has_graphml',
51 # Keeps track of the lemmas within the collation. At most one lemma
52 # per position in the graph.
55 isa => 'HashRef[Maybe[Str]]',
56 default => sub { {} },
59 has 'wit_list_separator' => (
68 default => 'base text',
83 # The collation can be created two ways:
84 # 1. Collate a set of witnesses (with CollateX I guess) and process
85 # the results as in 2.
86 # 2. Read a pre-prepared collation in one of a variety of formats,
87 # and make the graph from that.
89 # The graph itself will (for now) be immutable, and the positions
90 # within the graph will also be immutable. We need to calculate those
91 # positions upon graph construction. The equivalences between graph
92 # nodes will be mutable, entirely determined by the user (or possibly
93 # by some semantic pre-processing provided by the user.) So the
94 # constructor should just make an empty equivalences object. The
95 # constructor will also need to make the witness objects, if we didn't
96 # come through option 1.
99 my( $self, $args ) = @_;
100 $self->graph->use_class('node', 'Text::Tradition::Collation::Reading');
102 # Pass through any graph-specific options.
103 my $shape = exists( $args->{'shape'} ) ? $args->{'shape'} : 'ellipse';
104 $self->graph->set_attribute( 'node', 'shape', $shape );
107 # Wrapper around add_path
109 around add_path => sub {
113 # Make sure there are three arguments
115 warn "Call add_path with args source, target, witness";
118 # Make sure the proposed path does not yet exist
119 my( $source, $target, $wit ) = @_;
120 $source = $self->reading( $source )
121 unless ref( $source ) eq 'Text::Tradition::Collation::Reading';
122 $target = $self->reading( $target )
123 unless ref( $target ) eq 'Text::Tradition::Collation::Reading';
124 foreach my $path ( $source->edges_to( $target ) ) {
125 if( $path->label eq $wit ) {
133 # Wrapper around merge_nodes
137 my $first_node = shift;
138 my $second_node = shift;
139 $first_node->merge_from( $second_node );
140 unshift( @_, $first_node, $second_node );
141 return $self->graph->merge_nodes( @_ );
144 # Extra graph-alike utility
146 my( $self, $source, $target, $label ) = @_;
147 my @paths = $source->edges_to( $target );
148 my @relevant = grep { $_->label eq $label } @paths;
149 return scalar @paths;
152 =head2 Output method(s)
158 print $graph->as_svg( $recalculate );
160 Returns an SVG string that represents the graph. Uses GraphViz to do
161 this, because Graph::Easy doesn\'t cope well with long graphs. Unless
162 $recalculate is passed (and is a true value), the method will return a
163 cached copy of the SVG after the first call to the method.
168 my( $self, $recalc ) = @_;
169 return $self->svg if $self->has_svg;
171 $self->collapse_graph_edges();
172 $self->_save_graphviz( $self->graph->as_graphviz() )
173 unless( $self->has_graphviz && !$recalc );
175 my @cmd = qw/dot -Tsvg/;
177 my $in = $self->graphviz;
178 run( \@cmd, \$in, ">", binary(), \$svg );
179 $self->{'svg'} = $svg;
180 $self->expand_graph_edges();
186 print $graph->as_graphml( $recalculate )
188 Returns a GraphML representation of the collation graph, with
189 transposition information and position information. Unless
190 $recalculate is passed (and is a true value), the method will return a
191 cached copy of the SVG after the first call to the method.
196 my( $self, $recalc ) = @_;
197 return $self->graphml if $self->has_graphml;
200 my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns';
201 my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance';
202 my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' .
203 'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
205 # Create the document and root node
206 my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
207 my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
208 $graphml->setDocumentElement( $root );
209 $root->setNamespace( $xsi_ns, 'xsi', 0 );
210 $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
212 # Add the data keys for nodes
213 my @node_data = ( 'name', 'token', 'identical', 'position' );
214 foreach my $ndi ( 0 .. $#node_data ) {
215 my $key = $root->addNewChild( $graphml_ns, 'key' );
216 $key->setAttribute( 'attr.name', $node_data[$ndi] );
217 $key->setAttribute( 'attr.type', 'string' );
218 $key->setAttribute( 'for', 'node' );
219 $key->setAttribute( 'id', 'd'.$ndi );
222 # Add the data keys for edges
225 foreach my $wit ( @{$self->tradition->witnesses} ) {
226 my $wit_key = 'w' . $wit_ctr++;
227 $wit_hash{$wit} = $wit_key;
228 my $key = $root->addNewChild( $graphml_ns, 'key' );
229 $key->setAttribute( 'attr.name', $wit );
230 $key->setAttribute( 'attr.type', 'string' );
231 $key->setAttribute( 'for', 'edge' );
232 $key->setAttribute( 'id', $wit_key );
235 # Add the graph, its nodes, and its edges
236 $self->collapse_graph_edges();
237 my $graph = $root->addNewChild( $graphml_ns, 'graph' );
238 $graph->setAttribute( 'edgedefault', 'directed' );
239 $graph->setAttribute( 'id', 'g0' ); # TODO make this meaningful
240 $graph->setAttribute( 'parse.edgeids', 'canonical' );
241 $graph->setAttribute( 'parse.edges', $self->edges() );
242 $graph->setAttribute( 'parse.nodeids', 'canonical' );
243 $graph->setAttribute( 'parse.nodes', $self->nodes() );
244 $graph->setAttribute( 'parse.order', 'nodesfirst' );
248 foreach my $n ( $self->readings ) {
249 my %this_node_data = ();
250 foreach my $ndi ( 0 .. $#node_data ) {
252 $this_node_data{'d'.$ndi} = $n->name if $node_data[$ndi] eq 'name';
253 $this_node_data{'d'.$ndi} = $n->label
254 if $node_data[$ndi] eq 'token';
255 $this_node_data{'d'.$ndi} = $n->primary->name if $n->has_primary;
256 $this_node_data{'d'.$ndi} =
257 $self->{'positions'}->node_position( $n )
258 if $node_data[$ndi] eq 'position';
260 my $node_el = $graph->addNewChild( $graphml_ns, 'node' );
261 my $node_xmlid = 'n' . $node_ctr++;
262 $node_hash{ $n->name } = $node_xmlid;
263 $node_el->setAttribute( 'id', $node_xmlid );
265 foreach my $dk ( keys %this_node_data ) {
266 my $d_el = $node_el->addNewChild( $graphml_ns, 'data' );
267 $d_el->setAttribute( 'key', $dk );
268 $d_el->appendTextChild( $this_node_data{$dk} );
272 foreach my $e ( $self->paths() ) {
273 my( $name, $from, $to ) = ( $e->name,
274 $node_hash{ $e->from()->name() },
275 $node_hash{ $e->to()->name() } );
276 my $edge_el = $graph->addNewChild( $graphml_ns, 'edge' );
277 $edge_el->setAttribute( 'source', $from );
278 $edge_el->setAttribute( 'target', $to );
279 $edge_el->setAttribute( 'id', $name );
280 # TODO Got to add the witnesses
284 $self->_save_graphml( $graphml );
285 $self->expand_graph_edges();
289 sub collapse_graph_edges {
291 # Our collation graph has an edge per witness. This is great for
292 # calculation purposes, but terrible for display. Thus we want to
293 # display only one edge between any two nodes.
295 return if $self->collapsed;
297 print STDERR "Collapsing path edges in graph...\n";
299 # Don't list out every witness if we have more than half to list.
300 my $majority = int( scalar( @{$self->tradition->witnesses} ) / 2 ) + 1;
301 foreach my $node( $self->readings ) {
303 # We will visit each node, so we only look ahead.
304 foreach my $edge ( $node->outgoing() ) {
305 add_hash_entry( $newlabels, $edge->to->name, $edge->name );
306 $self->del_path( $edge );
309 foreach my $newdest ( keys %$newlabels ) {
311 my @compressed_wits = ();
312 if( @{$newlabels->{$newdest}} < $majority ) {
313 $label = join( ', ', @{$newlabels->{$newdest}} );
315 ## TODO FIX THIS HACK
317 foreach my $wit ( @{$newlabels->{$newdest}} ) {
318 if( $wit =~ /^(.*?)(\s*\(?a\.\s*c\.\)?)$/ ) {
319 push( @aclabels, $wit );
321 push( @compressed_wits, $wit );
324 $label = join( ', ', 'majority', @aclabels );
328 $self->add_path( $node, $self->reading( $newdest ), $label );
329 if( @compressed_wits ) {
330 ## TODO fix this hack too.
331 $newedge->set_attribute( 'class',
332 join( '|', @compressed_wits ) );
337 $self->collapsed( 1 );
340 sub expand_graph_edges {
342 # Our collation graph has only one edge between any two nodes.
343 # This is great for display, but not so great for analysis.
344 # Expand this so that each witness has its own edge between any
346 return unless $self->collapsed;
348 print STDERR "Expanding path edges in graph...\n";
350 foreach my $edge( $self->paths ) {
351 my $from = $edge->from;
353 my @wits = split( /, /, $edge->label );
354 if( grep { $_ eq 'majority' } @wits ) {
355 push( @wits, split( /\|/, $edge->get_attribute( 'class' ) ) );
357 $self->del_path( $edge );
359 $self->add_path( $from, $to, $_ );
362 $self->collapsed( 0 );
367 =head2 Navigation methods
373 my $beginning = $collation->start();
375 Returns the beginning of the collation, a meta-reading with label '#START#'.
380 # Return the beginning reading of the graph.
382 my( $new_start ) = @_;
384 $self->del_reading( '#START#' );
385 $self->graph->rename_node( $new_start, '#START#' );
387 return $self->reading('#START#');
390 =item B<reading_sequence>
392 my @readings = $graph->reading_sequence( $first, $last, $path[, $alt_path] );
394 Returns the ordered list of readings, starting with $first and ending
395 with $last, along the given witness path. If no path is specified,
396 assume that the path is that of the base text (if any.)
400 sub reading_sequence {
401 my( $self, $start, $end, $witness, $backup ) = @_;
403 $witness = $self->baselabel unless $witness;
404 my @readings = ( $start );
407 while( $n && $n ne $end ) {
408 if( exists( $seen{$n->name()} ) ) {
409 warn "Detected loop at " . $n->name();
412 $seen{$n->name()} = 1;
414 my $next = $self->next_reading( $n, $witness, $backup );
415 warn "Did not find any path for $witness from reading " . $n->name
417 push( @readings, $next );
420 # Check that the last reading is our end reading.
421 my $last = $readings[$#readings];
422 warn "Last reading found from " . $start->label() .
423 " for witness $witness is not the end!"
424 unless $last eq $end;
429 =item B<next_reading>
431 my $next_reading = $graph->next_reading( $reading, $witpath );
433 Returns the reading that follows the given reading along the given witness
439 # Return the successor via the corresponding path.
441 return $self->_find_linked_reading( 'next', @_ );
444 =item B<prior_reading>
446 my $prior_reading = $graph->prior_reading( $reading, $witpath );
448 Returns the reading that precedes the given reading along the given witness
454 # Return the predecessor via the corresponding path.
456 return $self->_find_linked_reading( 'prior', @_ );
459 sub _find_linked_reading {
460 my( $self, $direction, $node, $path, $alt_path ) = @_;
461 my @linked_paths = $direction eq 'next'
462 ? $node->outgoing() : $node->incoming();
463 return undef unless scalar( @linked_paths );
465 # We have to find the linked path that contains all of the
466 # witnesses supplied in $path.
467 my( @path_wits, @alt_path_wits );
468 @path_wits = $self->witnesses_of_label( $path ) if $path;
469 @alt_path_wits = $self->witnesses_of_label( $alt_path ) if $alt_path;
472 foreach my $le ( @linked_paths ) {
473 if( $le->name eq $self->baselabel ) {
476 my @le_wits = $self->witnesses_of_label( $le->name );
477 if( _is_within( \@path_wits, \@le_wits ) ) {
478 # This is the right path.
479 return $direction eq 'next' ? $le->to() : $le->from();
480 } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) {
485 # Got this far? Return the alternate path if it exists.
486 return $direction eq 'next' ? $alt_le->to() : $alt_le->from()
489 # Got this far? Return the base path if it exists.
490 return $direction eq 'next' ? $base_le->to() : $base_le->from()
493 # Got this far? We have no appropriate path.
494 warn "Could not find $direction node from " . $node->label
495 . " along path $path";
501 my( $set1, $set2 ) = @_;
502 my $ret = @$set1; # will be 0, i.e. false, if set1 is empty
503 foreach my $el ( @$set1 ) {
504 $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
510 ## INITIALIZATION METHODS - for use by parsers
511 # Walk the paths for each witness in the graph, and return the nodes
512 # that the graph has in common. If $using_base is true, some
513 # different logic is needed.
515 sub walk_witness_paths {
516 my( $self, $end ) = @_;
517 # For each witness, walk the path through the graph.
518 # Then we need to find the common nodes.
519 # TODO This method is going to fall down if we have a very gappy
520 # text in the collation.
523 foreach my $wit ( @{$self->tradition->witnesses} ) {
524 my $curr_reading = $self->start;
525 my @wit_path = $self->reading_sequence( $self->start, $end,
527 $wit->path( \@wit_path );
529 # Detect the common readings.
530 @common_readings = _find_common( \@common_readings, \@wit_path );
533 # Mark all the nodes as either common or not.
534 foreach my $cn ( @common_readings ) {
535 print STDERR "Setting " . $cn->name . " / " . $cn->label
536 . " as common node\n";
539 foreach my $n ( $self->readings() ) {
540 $n->make_variant unless $n->is_common;
542 # Return an array of the common nodes in order.
543 return @common_readings;
547 my( $common_readings, $new_path ) = @_;
549 if( @$common_readings ) {
550 foreach my $n ( @$new_path ) {
551 push( @cr, $n ) if grep { $_ eq $n } @$common_readings;
554 push( @cr, @$new_path );
560 my( $common_readings, $divergence ) = @_;
563 map { $diverged{$_->name} = 1 } @$divergence;
564 foreach( @$common_readings ) {
565 push( @cr, $_ ) unless $diverged{$_->name};
571 # An alternative to walk_witness_paths, for use when a collation is
572 # constructed from a base text and an apparatus. We have the
573 # sequences of readings and just need to add path edges.
575 sub make_witness_paths {
579 foreach my $wit ( @{$self->tradition->witnesses} ) {
580 print STDERR "Making path for " . $wit->sigil . "\n";
581 $self->make_witness_path( $wit );
582 @common_readings = _find_common( \@common_readings, $wit->path );
583 @common_readings = _find_common( \@common_readings, $wit->uncorrected_path );
585 return @common_readings;
588 sub make_witness_path {
589 my( $self, $wit ) = @_;
590 my @chain = @{$wit->path};
591 my $sig = $wit->sigil;
592 foreach my $idx ( 0 .. $#chain-1 ) {
593 $self->add_path( $chain[$idx], $chain[$idx+1], $sig );
595 @chain = @{$wit->uncorrected_path};
596 foreach my $idx( 0 .. $#chain-1 ) {
597 my $source = $chain[$idx];
598 my $target = $chain[$idx+1];
599 $self->add_path( $source, $target, "$sig (a.c.)" )
600 unless $self->has_path( $source, $target, $sig );
604 sub common_readings {
606 my @common = grep { $_->is_common } $self->readings();
607 return sort { _cmp_position( $a->position, $b->position ) } @common;
610 # Calculate the relative positions of nodes in the graph, if they
611 # were not given to us.
612 sub calculate_positions {
613 my( $self, @ordered_common ) = @_;
615 # We have to calculate the position identifiers for each word,
616 # keyed on the common nodes. This will be 'fun'. The end result
617 # is a hash per witness, whose key is the word node and whose
618 # value is its position in the text. Common nodes are always N,1
619 # so have identical positions in each text.
622 foreach my $wit ( @{$self->tradition->witnesses} ) {
623 print STDERR "Calculating positions in " . $wit->sigil . "\n";
624 _update_positions_from_path( $wit->path, @ordered_common );
625 _update_positions_from_path( $wit->uncorrected_path, @ordered_common )
626 if $wit->has_ante_corr;
630 foreach my $r ( $self->readings() ) {
631 print STDERR "Reading " . $r->name . "/" . $r->label . " has no position\n"
632 unless( $r->has_position );
635 $self->init_lemmata();
638 sub _update_positions_from_path {
639 my( $path, @ordered_common ) = @_;
641 # First we walk the given path, making a matrix for the witness
642 # that corresponds to its eventual position identifier. Common
643 # nodes always start a new row, and are thus always in the first
646 my $cn = 0; # We should hit the common readings in order.
648 foreach my $wn ( @{$path} ) {
649 if( $wn eq $ordered_common[$cn] ) {
650 # Set up to look for the next common node, and
651 # start a new row of words.
653 push( @$wit_matrix, $row ) if scalar( @$row );
658 push( @$wit_matrix, $row ); # Push the last row onto the matrix
660 # Now we have a matrix per witness, so that each row in the
661 # matrix begins with a common node, and continues with all the
662 # variant words that appear in the witness. We turn this into
663 # real positions in row,cell format. But we need some
664 # trickery in order to make sure that each node gets assigned
665 # to only one position.
667 foreach my $li ( 1..scalar(@$wit_matrix) ) {
668 foreach my $di ( 1..scalar(@{$wit_matrix->[$li-1]}) ) {
669 my $reading = $wit_matrix->[$li-1]->[$di-1];
670 my $position = "$li,$di";
672 # If we have seen this node before, we need to compare
673 # its position with what went before.
674 unless( $reading->has_position &&
675 _cmp_position( $position, $reading->position ) < 1 ) {
676 # The new position ID replaces the old one.
677 $reading->position( $position );
678 } # otherwise, the old position needs to stay.
686 my @pos_a = split(/,/, $a );
687 my @pos_b = split(/,/, $b );
689 my $big_cmp = $pos_a[0] <=> $pos_b[0];
690 return $big_cmp if $big_cmp;
692 return $pos_a[1] <=> $pos_b[1];
693 } elsif ( $b ) { # a is undefined
695 } elsif ( $a ) { # b is undefined
698 return 0; # they are both undefined
704 map { $positions{$_->position} = 1 } $self->readings;
705 my @answer = sort { _cmp_position( $a, $b ) } keys( %positions );
709 sub readings_at_position {
710 my( $self, $pos ) = @_;
711 my @answer = grep { $_->position eq $pos } $self->readings;
715 ## Lemmatizer functions
720 foreach my $position ( $self->all_positions ) {
721 $self->lemmata->{$position} = undef;
724 foreach my $cr ( $self->common_readings ) {
725 $self->lemmata->{$cr->position} = $cr->name;
729 =item B<lemma_readings>
731 my @state = $graph->lemma_readings( @readings_delemmatized );
733 Takes a list of readings that have just been delemmatized, and returns
734 a set of tuples of the form ['reading', 'state'] that indicates what
735 changes need to be made to the graph.
741 A state of 1 means 'lemmatize this reading'
745 A state of 0 means 'delemmatize this reading'
749 A state of undef means 'an ellipsis belongs in the text here because
750 no decision has been made / an earlier decision was backed out'
757 my( $self, @toggled_off_nodes ) = @_;
759 # First get the positions of those nodes which have been
761 my $positions_off = {};
762 map { $positions_off->{ $_->position } = $_->name } @toggled_off_nodes;
764 # Now for each position, we have to see if a node is on, and we
765 # have to see if a node has been turned off.
767 foreach my $pos ( $self->all_positions() ) {
768 # Find the state of this position. If there is an active node,
769 # its name will be the state; otherwise the state will be 0
770 # (nothing at this position) or undef (ellipsis at this position)
771 my $active = $self->lemmata->{$pos};
773 # Is there a formerly active node that was toggled off?
774 if( exists( $positions_off->{$pos} ) ) {
775 my $off_node = $positions_off->{$pos};
776 if( $active && $active ne $off_node) {
777 push( @answer, [ $off_node, 0 ], [ $active, 1 ] );
779 push( @answer, [ $off_node, $active ] );
782 # No formerly active node, so we just see if there is a currently
785 # Push the active node, whatever it is.
786 push( @answer, [ $active, 1 ] );
788 # Push the state that is there. Arbitrarily use the first node
790 my @pos_nodes = $self->readings_at_position( $pos );
791 push( @answer, [ $pos_nodes[0]->name, $self->lemmata->{$pos} ] );
798 =item B<toggle_reading>
800 my @readings_delemmatized = $graph->toggle_reading( $reading_name );
802 Takes a reading node name, and either lemmatizes or de-lemmatizes
803 it. Returns a list of all readings that are de-lemmatized as a result
809 my( $self, $rname ) = @_;
811 return unless $rname;
812 my $reading = $self->reading( $rname );
813 if( !$reading || $reading->is_common() ) {
814 # Do nothing, it's a common node.
818 my $pos = $reading->position;
819 my $old_state = $self->lemmata->{$pos};
821 if( $old_state && $old_state eq $rname ) {
822 # Turn off the node. We turn on no others by default.
823 push( @readings_off, $reading );
826 $self->lemmata->{$pos} = $rname;
827 # Any other 'on' readings in the same position should be off.
828 push( @readings_off, $self->same_position_as( $reading ) );
829 # Any node that is an identical transposed one should be off.
830 push( @readings_off, $reading->identical_readings );
832 @readings_off = unique_list( @readings_off );
834 # Turn off the readings that need to be turned off.
835 my @readings_delemmatized;
836 foreach my $n ( @readings_off ) {
837 my $state = $self->lemmata->{$n->position};
838 if( $state && $state eq $n->name ) {
839 # this reading is still on, so turn it off
840 push( @readings_delemmatized, $n );
841 my $new_state = undef;
842 if( $n eq $reading ) {
843 # This is the reading that was clicked, so if there are no
844 # other readings there, turn off the position. In all other
845 # cases, restore the ellipsis.
846 my @other_n = $self->same_position_as( $n );
847 $new_state = 0 unless @other_n;
849 $self->lemmata->{$n->position} = $new_state;
850 } elsif( $old_state && $old_state eq $n->name ) {
851 # another reading has already been turned on here
852 push( @readings_delemmatized, $n );
853 } # else some other reading was on anyway, so pass.
855 return @readings_delemmatized;
858 sub same_position_as {
859 my( $self, $reading ) = @_;
860 my $pos = $reading->position;
861 my @same = grep { $_ ne $reading } $self->readings_at_position( $reading->position );
865 # Return the string that joins together a list of witnesses for
866 # display on a single path.
869 return join( $self->wit_list_separator, @_ );
872 sub witnesses_of_label {
873 my( $self, $label ) = @_;
874 my $regex = $self->wit_list_separator;
875 my @answer = split( /\Q$regex\E/, $label );
882 map { $h{$_->name} = $_ } @list;
887 my( $hash, $key, $entry ) = @_;
888 if( exists $hash->{$key} ) {
889 push( @{$hash->{$key}}, $entry );
891 $hash->{$key} = [ $entry ];
896 __PACKAGE__->meta->make_immutable;