1 package Text::Tradition::Graph;
6 use IPC::Run qw( run binary );
8 use Text::Tradition::Graph::Position;
12 Text::Tradition::Graph
16 use Text::Tradition::Graph;
18 my $text = Text::Tradition::Graph->new( 'GraphML' => '/my/graphml/file.xml' );
19 my $text = Text::Tradition::Graph->new( 'TEI' => '/my/tei/file.xml' );
20 my $text = Text::Tradition::Graph->new( 'CSV' => '/my/csv/file.csv',
21 'base' => '/my/basefile.txt' );
22 my $text = Text::Tradition::Graph->new( 'CTE' => '/my/cte/file.txt',
23 'base' => '/my/basefile.txt' );
25 my $svg_string = $text->as_svg();
27 my $lemma_nodes = $text->active_nodes();
28 $text->toggle_node( 'some_word' );
32 A text tradition is the representation of our knowledge of a text that
33 has been passed down via manuscript copies from a time before printing
34 presses. Each text has a number of witnesses, that is, manuscripts
35 that bear a version of the text. The tradition is the aggregation of
36 these witnesses, which is to say, the collation of the text.
38 This module takes a text collation and represents it as a horizontal
39 directed graph, suitable for SVG rendering and for analysis of various
40 forms. Since this module was written by a medievalist, it also
41 provides a facility for making a critical text reconstruction by
42 choosing certain variants to be 'lemma' text - that is, text which
43 should be considered the 'standard' reading.
45 Although the graph is a very good way to render text collation, and is
46 visually very easy for a human to interpret, it doesn't have any
47 inherent information about which nodes 'go together' - that is, which
48 text readings appear in the same place as other readings. This module
49 therefore calculates 'positions' on the graph, thus holding some
50 information about which readings can and can't be substituted for
59 Constructor. Takes a source collation file from which to construct
60 the initial graph. This file can be TEI (parallel segmentation) XML,
61 CSV in a format yet to be documented, GraphML as documented by the
62 CollateX tool (L<http://gregor.middell.net/collatex/>), or a Classical
63 Text Editor apparatus. For CSV and Classical Text Editor files, the
64 user must also supply a base text to which the line numbering in the
65 collation file refers.
67 20/04/2011 Currently only CSV and GraphML are really supported.
73 my $class = ref( $proto ) || $proto;
74 my %opts = ( 'on_color' => 'yellow',
75 'off_color' => 'white',
79 # opts can be: GraphML, base+CSV, base+CTE, TEI. We need
81 my @formats = grep { /^(GraphML|CSV|CTE|TEI)$/ } keys( %opts );
82 my $format = shift( @formats );
84 warn "No data given to create a graph; will initialize an empty one";
86 if( $format =~ /^(CSV|CTE)$/ && !exists $opts{'base'} ) {
87 warn "Cannot make a graph from $format without a base text";
91 # Make a graph object.
92 my $collation_graph = Graph::Easy->new();
93 $collation_graph->set_attribute( 'node', 'shape', 'ellipse' );
94 # Starting point for all texts
95 my $last_node = $collation_graph->add_node( '#START#' );
97 $self->{'graph'} = $collation_graph;
98 bless( $self, $class );
100 # Now do the parsing.
103 if( $format =~ /^(CSV|CTE)$/ ) {
104 @args = ( 'base' => $opts{'base'},
105 'data' => $opts{$format},
106 'format' => $format );
107 $format = 'BaseText';
109 @args = ( $opts{ $format } );
111 my $mod = "Text::Tradition::Parser::$format";
113 $mod->can('parse')->( $self, @args );
118 =item B<make_positions>
120 $graph->make_positions( $common_nodes, $paths )
122 Create an associated Graph::Positions object that records the position
123 of each node in the graph. This method call is probably in the wrong
129 my( $self, $common_nodes, $paths ) = @_;
130 my $positions = Text::Tradition::Graph::Position->new( $common_nodes, $paths );
131 $self->{'positions'} = $positions;
136 =head2 Graph::Easy object accessor methods
138 See the Graph::Easy documentation for descriptions of these functions.
148 return $self->{'graph'}->node( @_ );
157 return $self->{'graph'}->edge( @_ );
164 # Not only adds the node, but also initializes internal data
169 my $node = $self->{'graph'}->add_node( @_ );
170 $self->{'identical_nodes'}->{ $node->name() } = [ $node->name() ];
180 return $self->{'graph'}->add_edge( @_ );
191 # Delete this node out of any relevant transposition pool.
192 if( ref $node eq 'Graph::Easy::Node' ) {
193 $node = $node->name();
195 my @ident = $self->identical_nodes( $node );
198 my $pool = $self->{'identical_nodes'}->{ $ident[0] };
199 foreach my $i ( 0 .. scalar(@$pool)-1 ) {
200 if( $pool->[$i] eq $node ) {
201 splice( @$pool, $i, 1 );
206 delete $self->{'identical_nodes'}->{ $node };
208 # Now delete the node.
209 return $self->{'graph'}->del_node( @_ );
218 return $self->{'graph'}->del_edge( @_ );
227 return $self->{'graph'}->nodes( @_ );
236 return $self->{'graph'}->edges( @_ );
245 return $self->{'graph'}->merge_nodes( @_ );
248 ### Helper methods for navigating the tree
252 =head2 Graph navigation methods
258 my $node = $graph->start();
260 Returns the beginning node of the graph.
265 # Return the beginning node of the graph.
267 my( $new_start ) = @_;
269 # Fix the node transposition data
270 delete $self->{'identical_nodes'}->{ $new_start->name() };
271 $self->{'identical_nodes'}->{'#START#'} = [ '#START#' ];
272 $self->{'graph'}->rename_node( $new_start, '#START#' );
274 return $self->{'graph'}->node('#START#');
279 my $next_node = $graph->next_word( $node, $path );
281 Returns the node that follows the given node along the given witness
282 path. TODO These are badly named.
287 # Return the successor via the corresponding edge.
289 return $self->_find_linked_word( 'next', @_ );
294 my $prior_node = $graph->prior_word( $node, $path );
296 Returns the node that precedes the given node along the given witness
297 path. TODO These are badly named.
302 # Return the predecessor via the corresponding edge.
304 return $self->_find_linked_word( 'prior', @_ );
307 sub _find_linked_word {
308 my( $self, $direction, $node, $edge ) = @_;
309 $edge = 'base text' unless $edge;
310 my @linked_edges = $direction eq 'next'
311 ? $node->outgoing() : $node->incoming();
312 return undef unless scalar( @linked_edges );
314 # We have to find the linked edge that contains all of the
315 # witnesses supplied in $edge.
316 my @edge_wits = split( /, /, $edge );
317 foreach my $le ( @linked_edges ) {
318 my @le_wits = split( /, /, $le->name() );
319 if( _is_within( \@edge_wits, \@le_wits ) ) {
320 # This is the right edge.
321 return $direction eq 'next' ? $le->to() : $le->from();
324 warn "Could not find $direction node from " . $node->label
325 . " along edge $edge";
331 my( $set1, $set2 ) = @_;
333 foreach my $el ( @$set1 ) {
334 $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
339 =item B<node_sequence>
341 my @nodes = $graph->node_sequence( $first, $last, $path );
343 Returns the ordered list of nodes, starting with $first and ending
344 with $last, along the given witness path.
349 my( $self, $start, $end, $witness, $backup ) = @_;
350 unless( ref( $start ) eq 'Graph::Easy::Node'
351 && ref( $end ) eq 'Graph::Easy::Node' ) {
352 warn "Called node_sequence without two nodes!";
355 $witness = 'base text' unless $witness;
356 my @nodes = ( $start );
359 while( $n ne $end ) {
360 if( exists( $seen{$n->name()} ) ) {
361 warn "Detected loop at " . $n->name();
364 $seen{$n->name()} = 1;
366 my @edges = $n->outgoing();
367 my @relevant_edges = grep { my @w = split( /, /, $_->label );
368 grep { /^\Q$witness\E$/ } @w } @edges;
369 unless( @relevant_edges ) {
370 @relevant_edges = grep { my @w = split( /, /, $_->label );
371 grep { /^\Q$backup\E$/ } @w } @edges
374 unless( @relevant_edges ) {
375 @relevant_edges = grep { $_->label() eq 'base text' } @edges;
378 warn "Did not find an edge for $witness from node " . $n->label
379 unless scalar @relevant_edges;
380 my $next = $relevant_edges[0]->to();
381 push( @nodes, $next );
384 # Check that the last node is our end node.
385 my $last = $nodes[$#nodes];
386 warn "Last node found from " . $start->label() .
387 " for witness $witness is not the end!"
388 unless $last eq $end;
393 =item B<string_lemma>
395 my $text = $graph->string_lemma( $first, $last, $path );
397 Returns the whitespace-separated text, starting with $first and ending
398 with $last, represented in the graph along the given path.
403 my( $self, $start, $end, $label ) = @_;
405 my @nodes = $self->node_sequence( $start, $end, $label );
406 my @words = map { $_->label() } @nodes;
407 return join( ' ', @words );
412 =head2 Transposition handling methods
414 These should really move to their own module. For use when the graph
415 has split transposed nodes in order to avoid edges that travel
420 =item B<set_identical_node>
422 $graph->set_identical_node( $node, $other_node )
424 Tell the graph that these two nodes contain the same (transposed) reading.
428 sub set_identical_node {
429 my( $self, $node, $main_node ) = @_;
431 # The identical_nodes hash contains a key per node, and a value
432 # that is an arrayref to a list of nodes. Those nodes that are
433 # the same (transposed) node should be keys that point to the same
434 # arrayref. Each arrayref should contain the name of each node
435 # that points to it. So basically here we want to merge the
436 # arrays for the two nodes that are now identical. The 'main'
437 # node should always be first in the array.
439 my $pool = $self->{'identical_nodes'}->{ $node };
440 my $main_pool = $self->{'identical_nodes'}->{ $main_node };
443 foreach ( @$main_pool ) {
444 # Note which nodes are already in the main pool so that we
450 # Add the remaining nodes to the main pool...
451 push( @$main_pool, $_ ) unless $poolhash{$_};
453 # ...and set this node to point to the enlarged pool.
454 $self->{'identical_nodes'}->{ $node } = $main_pool;
457 =item B<identical_nodes>
459 my @nodes = $graph->identical_nodes( $node )
461 Get a list of nodes that contain the same (transposed) reading as the
466 sub identical_nodes {
467 my( $self, $node ) = @_;
468 my @others = grep { $_ !~ /^$node$/ }
469 @{$self->{'identical_nodes'}->{ $node }};
475 =head2 Output method(s)
481 print $graph->as_svg( $recalculate );
483 Returns an SVG string that represents the graph. Uses GraphViz to do
484 this, because Graph::Easy doesn't cope well with long graphs. Unless
485 $recalculate is passed (and is a true value), the method will return a
486 cached copy of the SVG after the first call to the method.
491 my( $self, $recalc ) = @_;
492 return $self->{'svg'} if( exists $self->{'svg'} && !$recalc );
494 $self->{'graphviz'} = $self->{'graph'}->as_graphviz()
495 unless( exists $self->{'graphviz'} && !$recalc );
497 my @cmd = qw/dot -Tsvg/;
499 my $in = $self->{'graphviz'};
500 run( \@cmd, \$in, ">", binary(), \$svg );
501 $self->{'svg'} = $svg;
507 print $graph->as_graphml( $recalculate )
509 Returns a GraphML representation of the collation graph, with
510 transposition information and position information. Unless
511 $recalculate is passed (and is a true value), the method will return a
512 cached copy of the SVG after the first call to the method.
517 my( $self, $recalc ) = @_;
518 return $self->{'graphml'} if( exists $self->{'graphml'} && !$recalc );
521 my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns';
522 my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance';
523 my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' .
524 'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
526 # Create the document and root node
527 my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
528 my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
529 $graphml->setDocumentElement( $root );
530 $root->setNamespace( $xsi_ns, 'xsi', 0 );
531 $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
533 # Add the data keys for nodes
534 my @node_data = ( 'name', 'token', 'identical', 'position' );
535 foreach my $ndi ( 0 .. $#node_data ) {
536 my $key = $root->addNewChild( $graphml_ns, 'key' );
537 $key->setAttribute( 'attr.name', $node_data[$ndi] );
538 $key->setAttribute( 'attr.type', 'string' );
539 $key->setAttribute( 'for', 'node' );
540 $key->setAttribute( 'id', 'd'.$ndi );
543 # Add the data keys for edges
546 foreach my $wit ( $self->getWitnessList ) {
547 my $wit_key = 'w' . $wit_ctr++;
548 $wit_hash{$wit} = $wit_key;
549 my $key = $root->addNewChild( $graphml_ns, 'key' );
550 $key->setAttribute( 'attr.name', $wit );
551 $key->setAttribute( 'attr.type', 'string' );
552 $key->setAttribute( 'for', 'edge' );
553 $key->setAttribute( 'id', $wit_key );
556 # Add the graph, its nodes, and its edges
557 my $graph = $root->addNewChild( $graphml_ns, 'graph' );
558 $graph->setAttribute( 'edgedefault', 'directed' );
559 $graph->setAttribute( 'id', 'g0' ); # TODO make this meaningful
560 $graph->setAttribute( 'parse.edgeids', 'canonical' );
561 $graph->setAttribute( 'parse.edges', $self->edges() );
562 $graph->setAttribute( 'parse.nodeids', 'canonical' );
563 $graph->setAttribute( 'parse.nodes', $self->nodes() );
564 $graph->setAttribute( 'parse.order', 'nodesfirst' );
568 foreach my $n ( $self->nodes() ) {
569 my %this_node_data = ();
570 foreach my $ndi ( 0 .. $#node_data ) {
572 $this_node_data{'d'.$ndi} = $n->name if $node_data[$ndi] eq 'name';
573 $this_node_data{'d'.$ndi} = $n->label
574 if $node_data[$ndi] eq 'token';
575 $this_node_data{'d'.$ndi} = $self->primary_node( $n )
576 if $node_data[$ndi] eq 'name';
577 $this_node_data{'d'.$ndi} =
578 $self->{'positions'}->node_position( $n )
579 if $node_data[$ndi] eq 'position';
581 my $node_el = $graph->addNewChild( $graphml_ns, 'node' );
582 my $node_xmlid = 'n' . $node_ctr++;
583 $node_hash{ $n->name } = $node_xmlid;
584 $node_el->setAttribute( 'id', $node_xmlid );
586 foreach my $dk ( keys %this_node_data ) {
587 my $d_el = $node_el->addNewChild( $graphml_ns, 'data' );
588 $d_el->setAttribute( 'key', $dk );
589 $d_el->appendTextChild( $this_node_data{$dk} );
593 foreach my $e ( $self->edges() ) {
594 my( $name, $from, $to ) = ( $e->name,
595 $node_hash{ $e->from()->name() },
596 $node_hash{ $e->to()->name() } );
597 my $edge_el = $graph->addNewChild( $graphml_ns, 'edge' );
598 $edge_el->setAttribute( 'source', $from );
599 $edge_el->setAttribute( 'target', $to );
600 $edge_el->setAttribute( 'id', $name );
601 # TODO Got to add the witnesses
605 $self->{'graphml'} = $graphml;
611 =head2 Lemmatization methods
615 =item B<init_lemmatizer>
619 sub init_lemmatizer {
621 # Initialize the 'lemma' hash, going through all the nodes and seeing
622 # which ones are common nodes. This should only be run once.
624 return if( $self->{'lemmatizer_initialized'} );
625 my @active_names = map { $_->name } grep { $self->is_common( $_ ) }
627 $self->{'positions'}->init_lemmatizer( @active_names );
628 $self->{'lemmatizer_initialized'} = 1;
634 my @nodes_turned_off = $graph->toggle_node( $node );
636 Takes a node name, and either lemmatizes or de-lemmatizes it. Returns
637 a list of all nodes that are de-lemmatized as a result of the toggle.
642 my( $self, $node ) = @_;
644 # In case this is being called for the first time.
645 $self->init_lemmatizer();
647 if( !$node || $self->is_common( $node ) ) {
648 # Do nothing, it's a common node.
652 my $pos = $self->{'positions'}->node_position( $node );
653 my $old_state = $self->{'positions'}->state( $pos );
655 if( $old_state && $old_state eq $node ) {
656 # Turn off the node. We turn on no others by default.
657 push( @nodes_off, $node );
660 $self->{'positions'}->set_state( $pos, $node );
661 # Any other 'on' nodes in the same position should be off.
662 push( @nodes_off, $self->colocated_nodes( $node ) );
663 # Any node that is an identical transposed one should be off.
664 push( @nodes_off, $self->identical_nodes( $node ) )
665 if $self->identical_nodes( $node );
667 @nodes_off = unique_list( @nodes_off );
669 # Turn off the nodes that need to be turned off.
670 my @nodes_turned_off;
671 foreach my $n ( @nodes_off ) {
672 my $npos = $self->{'positions'}->node_position( $n );
673 my $state = $self->{'positions'}->state( $npos );
674 if( $state && $state eq $n ) {
675 # this node is still on
676 push( @nodes_turned_off, $n );
677 my $new_state = undef;
679 # This is the node that was clicked, so if there are no
680 # other nodes there, turn off the position. In all other
681 # cases, restore the ellipsis.
682 my @all_n = $self->{'positions'}->nodes_at_position( $pos );
683 $new_state = 0 if scalar( @all_n ) == 1;
685 $self->{'positions'}->set_state( $npos, $new_state );
686 } elsif( $old_state && $old_state eq $n ) {
687 # another node has already been turned on here
688 push( @nodes_turned_off, $n );
689 } # else some other node was on anyway, so pass.
691 return @nodes_turned_off;
694 =item B<active_nodes>
696 my @state = $graph->active_nodes( @nodes_turned_off );
698 Takes a list of nodes that have just been turned off, and returns a
699 set of tuples of the form ['node', 'state'] that indicates what
700 changes need to be made to the graph.
706 A state of 1 means 'turn on this node'
710 A state of 0 means 'turn off this node'
714 A state of undef means 'an ellipsis belongs in the text here because
715 no decision has been made'
722 my( $self, @toggled_off_nodes ) = @_;
724 # In case this is the first run
725 $self->init_lemmatizer();
726 # First get the positions of those nodes which have been
728 my $positions_off = {};
729 map { $positions_off->{ $self->{'positions'}->node_position( $_ ) } = $_ }
733 # Now for each position, we have to see if a node is on, and we
734 # have to see if a node has been turned off.
736 foreach my $pos ( $self->{'positions'}->all() ) {
737 # Find the state of this position. If there is an active node,
738 # its name will be the state; otherwise the state will be 0
739 # (nothing at this position) or undef (ellipsis at this position)
740 my $active = $self->{'positions'}->state( $pos );
742 # Is there a formerly active node that was toggled off?
743 if( exists( $positions_off->{$pos} ) ) {
744 my $off_node = $positions_off->{$pos};
745 if( $active && $active ne $off_node) {
746 push( @answer, [ $off_node, 0 ], [ $active, 1 ] );
748 push( @answer, [ $off_node, $active ] );
751 # No formerly active node, so we just see if there is a currently
754 # Push the active node, whatever it is.
755 push( @answer, [ $active, 1 ] );
757 # Push the state that is there. Arbitrarily use the first node
759 my @pos_nodes = $self->{'positions'}->nodes_at_position( $pos );
761 [ $pos_nodes[0], $self->{'positions'}->state( $pos ) ] );
768 # A couple of helpers.
771 my( $self, $node ) = @_;
772 $node = $self->_nodeobj( $node );
773 return $node->get_attribute('class') eq 'common';
777 my( $self, $node ) = @_;
778 unless( ref $node eq 'Graph::Easy::Node' ) {
779 $node = $self->node( $node );
784 sub colocated_nodes {
786 return $self->{'positions'}->colocated_nodes( @_ );
790 my( $self, $node_id ) = @_;
791 # This is the label of the given node.
792 return $self->node( $node_id )->label();
795 sub text_for_witness {
796 my( $self, $wit ) = @_;
798 my @nodes = $self->{'positions'}->witness_path( $wit );
799 my @words = map { $self->node( $_ )->label() } @nodes;
800 return join( ' ', @words );
806 map { $h{$_} = 1 } @list;
814 This package is free software and is provided "as is" without express
815 or implied warranty. You can redistribute it and/or modify it under
816 the same terms as Perl itself.
820 Tara L Andrews, aurum@cpan.org