1 package Text::Tradition::Graph;
6 use IPC::Run qw( run binary );
8 use Text::Tradition::Graph::Position;
12 Text::Tradition::Graph
16 use Text::Tradition::Graph;
18 my $text = Text::Tradition::Graph->new( 'GraphML' => '/my/graphml/file.xml' );
19 my $text = Text::Tradition::Graph->new( 'TEI' => '/my/tei/file.xml' );
20 my $text = Text::Tradition::Graph->new( 'CSV' => '/my/csv/file.csv',
21 'base' => '/my/basefile.txt' );
22 my $text = Text::Tradition::Graph->new( 'CTE' => '/my/cte/file.txt',
23 'base' => '/my/basefile.txt' );
25 my $svg_string = $text->as_svg();
27 my $lemma_nodes = $text->active_nodes();
28 $text->toggle_node( 'some_word' );
32 A text tradition is the representation of our knowledge of a text that
33 has been passed down via manuscript copies from a time before printing
34 presses. Each text has a number of witnesses, that is, manuscripts
35 that bear a version of the text. The tradition is the aggregation of
36 these witnesses, which is to say, the collation of the text.
38 This module takes a text collation and represents it as a horizontal
39 directed graph, suitable for SVG rendering and for analysis of various
40 forms. Since this module was written by a medievalist, it also
41 provides a facility for making a critical text reconstruction by
42 choosing certain variants to be 'lemma' text - that is, text which
43 should be considered the 'standard' reading.
45 Although the graph is a very good way to render text collation, and is
46 visually very easy for a human to interpret, it doesn't have any
47 inherent information about which nodes 'go together' - that is, which
48 text readings appear in the same place as other readings. This module
49 therefore calculates 'positions' on the graph, thus holding some
50 information about which readings can and can't be substituted for
59 Constructor. Takes a source collation file from which to construct
60 the initial graph. This file can be TEI (parallel segmentation) XML,
61 CSV in a format yet to be documented, GraphML as documented by the
62 CollateX tool (L<http://gregor.middell.net/collatex/>), or a Classical
63 Text Editor apparatus. For CSV and Classical Text Editor files, the
64 user must also supply a base text to which the line numbering in the
65 collation file refers.
67 20/04/2011 Currently only CSV and GraphML are really supported.
73 my $class = ref( $proto ) || $proto;
74 my %opts = ( 'on_color' => 'yellow',
75 'off_color' => 'white',
79 # opts can be: GraphML, base+CSV, base+CTE, TEI. We need
81 my @formats = grep { /^(GraphML|CSV|CTE|TEI)$/ } keys( %opts );
82 my $format = shift( @formats );
84 warn "No data given to create a graph; will initialize an empty one";
86 if( $format =~ /^(CSV|CTE)$/ && !exists $opts{'base'} ) {
87 warn "Cannot make a graph from $format without a base text";
91 # Make a graph object.
92 my $collation_graph = Graph::Easy->new();
93 $collation_graph->set_attribute( 'node', 'shape', 'ellipse' );
94 # Starting point for all texts
95 my $last_node = $collation_graph->add_node( '#START#' );
97 $self->{'graph'} = $collation_graph;
98 bless( $self, $class );
100 # Now do the parsing.
103 if( $format =~ /^(CSV|CTE)$/ ) {
104 @args = ( 'base' => $opts{'base'},
105 'data' => $opts{$format},
106 'format' => $format );
107 $format = 'BaseText';
109 @args = ( $opts{ $format } );
111 my $mod = "Text::Tradition::Parser::$format";
113 $mod->can('parse')->( $self, @args );
118 =item B<make_positions>
120 $graph->make_positions( $common_nodes, $paths )
122 Create an associated Graph::Positions object that records the position
123 of each node in the graph. This method call is probably in the wrong
129 my( $self, $common_nodes, $paths ) = @_;
130 my $positions = Text::Tradition::Graph::Position->new( $common_nodes, $paths );
131 $self->{'positions'} = $positions;
136 =head2 Graph::Easy object accessor methods
138 See the Graph::Easy documentation for descriptions of these functions.
148 return $self->{'graph'}->node( @_ );
157 return $self->{'graph'}->edge( @_ );
164 # Not only adds the node, but also initializes internal data
169 my $node = $self->{'graph'}->add_node( @_ );
170 $self->{'identical_nodes'}->{ $node->name() } = [ $node->name() ];
180 return $self->{'graph'}->add_edge( @_ );
191 # Delete this node out of any relevant transposition pool.
192 if( ref $node eq 'Graph::Easy::Node' ) {
193 $node = $node->name();
195 my @ident = $self->identical_nodes( $node );
198 my $pool = $self->{'identical_nodes'}->{ $ident[0] };
199 foreach my $i ( 0 .. scalar(@$pool)-1 ) {
200 if( $pool->[$i] eq $node ) {
201 splice( @$pool, $i, 1 );
206 delete $self->{'identical_nodes'}->{ $node };
208 # Now delete the node.
209 return $self->{'graph'}->del_node( @_ );
218 return $self->{'graph'}->del_edge( @_ );
227 return $self->{'graph'}->nodes( @_ );
236 return $self->{'graph'}->edges( @_ );
245 return $self->{'graph'}->merge_nodes( @_ );
248 ### Helper methods for navigating the tree
252 =head2 Graph navigation methods
258 my $node = $graph->start();
260 Returns the beginning node of the graph.
265 # Return the beginning node of the graph.
267 my( $new_start ) = @_;
269 # Fix the node transposition data
270 delete $self->{'identical_nodes'}->{ $new_start->name() };
271 $self->{'identical_nodes'}->{'#START#'} = [ '#START#' ];
272 $self->{'graph'}->rename_node( $new_start, '#START#' );
274 return $self->{'graph'}->node('#START#');
279 my $next_node = $graph->next_word( $node, $path );
281 Returns the node that follows the given node along the given witness
282 path. TODO These are badly named.
287 # Return the successor via the corresponding edge.
289 return $self->_find_linked_word( 'next', @_ );
294 my $prior_node = $graph->prior_word( $node, $path );
296 Returns the node that precedes the given node along the given witness
297 path. TODO These are badly named.
302 # Return the predecessor via the corresponding edge.
304 return $self->_find_linked_word( 'prior', @_ );
307 sub _find_linked_word {
308 my( $self, $direction, $node, $edge ) = @_;
309 $edge = 'base text' unless $edge;
310 my @linked_edges = $direction eq 'next'
311 ? $node->outgoing() : $node->incoming();
312 return undef unless scalar( @linked_edges );
314 # We have to find the linked edge that contains all of the
315 # witnesses supplied in $edge.
316 my @edge_wits = split( /, /, $edge );
317 foreach my $le ( @linked_edges ) {
318 my @le_wits = split( /, /, $le->name() );
319 if( _is_within( \@edge_wits, \@le_wits ) ) {
320 # This is the right edge.
321 return $direction eq 'next' ? $le->to() : $le->from();
324 warn "Could not find $direction node from " . $node->label
325 . " along edge $edge";
331 my( $set1, $set2 ) = @_;
333 foreach my $el ( @$set1 ) {
334 $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
339 =item B<node_sequence>
341 my @nodes = $graph->node_sequence( $first, $last, $path );
343 Returns the ordered list of nodes, starting with $first and ending
344 with $last, along the given witness path.
349 my( $self, $start, $end, $witness, $backup ) = @_;
350 unless( ref( $start ) eq 'Graph::Easy::Node'
351 && ref( $end ) eq 'Graph::Easy::Node' ) {
352 warn "Called node_sequence without two nodes!";
355 $witness = 'base text' unless $witness;
356 my @nodes = ( $start );
359 while( $n ne $end ) {
360 if( exists( $seen{$n->name()} ) ) {
361 warn "Detected loop at " . $n->name();
364 $seen{$n->name()} = 1;
366 my @edges = $n->outgoing();
367 my @relevant_edges = grep { my @w = split( /, /, $_->label );
368 grep { /^\Q$witness\E$/ } @w } @edges;
369 unless( @relevant_edges ) {
370 @relevant_edges = grep { my @w = split( /, /, $_->label );
371 grep { /^\Q$backup\E$/ } @w } @edges
374 unless( @relevant_edges ) {
375 @relevant_edges = grep { $_->label() eq 'base text' } @edges;
378 warn "Did not find an edge for $witness from node " . $n->label
379 unless scalar @relevant_edges;
380 my $next = $relevant_edges[0]->to();
381 push( @nodes, $next );
384 # Check that the last node is our end node.
385 my $last = $nodes[$#nodes];
386 warn "Last node found from " . $start->label() .
387 " for witness $witness is not the end!"
388 unless $last eq $end;
393 =item B<string_lemma>
395 my $text = $graph->string_lemma( $first, $last, $path );
397 Returns the whitespace-separated text, starting with $first and ending
398 with $last, represented in the graph along the given path.
403 my( $self, $start, $end, $label ) = @_;
405 my @nodes = $self->node_sequence( $start, $end, $label );
406 my @words = map { $_->label() } @nodes;
407 return join( ' ', @words );
412 =head2 Transposition handling methods
414 These should really move to their own module. For use when the graph
415 has split transposed nodes in order to avoid edges that travel
420 =item B<set_identical_node>
422 $graph->set_identical_node( $node, $other_node )
424 Tell the graph that these two nodes contain the same (transposed) reading.
428 sub set_identical_node {
429 my( $self, $node, $same_node ) = @_;
430 my $pool = $self->{'identical_nodes'}->{ $node };
431 my $same_pool = $self->{'identical_nodes'}->{ $same_node };
436 foreach( @$same_pool ) {
437 push( @$pool, $_ ) unless $poolhash{$_};
440 $self->{'identical_nodes'}->{ $same_node } = $pool;
443 =item B<set_identical_node>
445 my @nodes = $graph->identical_nodes( $node )
447 Get a list of nodes that contain the same (transposed) reading as the
452 sub identical_nodes {
453 my( $self, $node ) = @_;
454 my @others = grep { $_ !~ /^$node$/ }
455 @{$self->{'identical_nodes'}->{ $node }};
461 =head2 Output method(s)
467 print $graph->as_svg( $recalculate );
469 Returns an SVG string that represents the graph. Uses GraphViz to do
470 this, because Graph::Easy doesn't cope well with long graphs. Unless
471 $recalculate is passed (and is a true value), the method will return a
472 cached copy of the SVG after the first call to the method.
477 my( $self, $recalc ) = @_;
478 return $self->{'svg'} if( exists $self->{'svg'} && !$recalc );
480 $self->{'graphviz'} = $self->{'graph'}->as_graphviz()
481 unless( exists $self->{'graphviz'} && !$recalc );
483 my @cmd = qw/dot -Tsvg/;
485 my $in = $self->{'graphviz'};
486 run( \@cmd, \$in, ">", binary(), \$svg );
487 $self->{'svg'} = $svg;
493 =head2 Lemmatization methods
497 =item B<init_lemmatizer>
501 sub init_lemmatizer {
503 # Initialize the 'lemma' hash, going through all the nodes and seeing
504 # which ones are common nodes. This should only be run once.
506 return if( $self->{'lemmatizer_initialized'} );
507 my @active_names = map { $_->name } grep { $self->is_common( $_ ) }
509 $self->{'positions'}->init_lemmatizer( @active_names );
510 $self->{'lemmatizer_initialized'} = 1;
516 my @nodes_turned_off = $graph->toggle_node( $node );
518 Takes a node name, and either lemmatizes or de-lemmatizes it. Returns
519 a list of all nodes that are de-lemmatized as a result of the toggle.
524 my( $self, $node ) = @_;
526 # In case this is being called for the first time.
527 $self->init_lemmatizer();
529 if( !$node || $self->is_common( $node ) ) {
530 # Do nothing, it's a common node.
534 my $pos = $self->{'positions'}->node_position( $node );
535 my $old_state = $self->{'positions'}->state( $pos );
537 if( $old_state && $old_state eq $node ) {
538 # Turn off the node. We turn on no others by default.
539 push( @nodes_off, $node );
542 $self->{'positions'}->set_state( $pos, $node );
543 # Any other 'on' nodes in the same position should be off.
544 push( @nodes_off, $self->colocated_nodes( $node ) );
545 # Any node that is an identical transposed one should be off.
546 push( @nodes_off, $self->identical_nodes( $node ) )
547 if $self->identical_nodes( $node );
549 @nodes_off = unique_list( @nodes_off );
551 # Turn off the nodes that need to be turned off.
552 my @nodes_turned_off;
553 foreach my $n ( @nodes_off ) {
554 my $npos = $self->{'positions'}->node_position( $n );
555 my $state = $self->{'positions'}->state( $npos );
556 if( $state && $state eq $n ) {
557 # this node is still on
558 push( @nodes_turned_off, $n );
559 my $new_state = undef;
561 # This is the node that was clicked, so if there are no
562 # other nodes there, turn off the position. In all other
563 # cases, restore the ellipsis.
564 my @all_n = $self->{'positions'}->nodes_at_position( $pos );
565 $new_state = 0 if scalar( @all_n ) == 1;
567 $self->{'positions'}->set_state( $npos, $new_state );
568 } elsif( $old_state && $old_state eq $n ) {
569 # another node has already been turned on here
570 push( @nodes_turned_off, $n );
571 } # else some other node was on anyway, so pass.
573 return @nodes_turned_off;
576 =item B<active_nodes>
578 my @state = $graph->active_nodes( @nodes_turned_off );
580 Takes a list of nodes that have just been turned off, and returns a
581 set of tuples of the form ['node', 'state'] that indicates what
582 changes need to be made to the graph.
588 A state of 1 means 'turn on this node'
592 A state of 0 means 'turn off this node'
596 A state of undef means 'an ellipsis belongs in the text here because
597 no decision has been made'
604 my( $self, @toggled_off_nodes ) = @_;
606 # In case this is the first run
607 $self->init_lemmatizer();
608 # First get the positions of those nodes which have been
610 my $positions_off = {};
611 map { $positions_off->{ $self->{'positions'}->node_position( $_ ) } = $_ }
615 # Now for each position, we have to see if a node is on, and we
616 # have to see if a node has been turned off.
618 foreach my $pos ( $self->{'positions'}->all() ) {
619 # Find the state of this position. If there is an active node,
620 # its name will be the state; otherwise the state will be 0
621 # (nothing at this position) or undef (ellipsis at this position)
622 my $active = $self->{'positions'}->state( $pos );
624 # Is there a formerly active node that was toggled off?
625 if( exists( $positions_off->{$pos} ) ) {
626 my $off_node = $positions_off->{$pos};
627 if( $active && $active ne $off_node) {
628 push( @answer, [ $off_node, 0 ], [ $active, 1 ] );
630 push( @answer, [ $off_node, $active ] );
633 # No formerly active node, so we just see if there is a currently
636 # Push the active node, whatever it is.
637 push( @answer, [ $active, 1 ] );
639 # Push the state that is there. Arbitrarily use the first node
641 my @pos_nodes = $self->{'positions'}->nodes_at_position( $pos );
643 [ $pos_nodes[0], $self->{'positions'}->state( $pos ) ] );
650 # A couple of helpers.
653 my( $self, $node ) = @_;
654 $node = $self->_nodeobj( $node );
655 return $node->get_attribute('class') eq 'common';
659 my( $self, $node ) = @_;
660 unless( ref $node eq 'Graph::Easy::Node' ) {
661 $node = $self->node( $node );
666 sub colocated_nodes {
668 return $self->{'positions'}->colocated_nodes( @_ );
672 my( $self, $node_id ) = @_;
673 # This is the label of the given node.
674 return $self->node( $node_id )->label();
677 sub text_for_witness {
678 my( $self, $wit ) = @_;
680 my @nodes = $self->{'positions'}->witness_path( $wit );
681 my @words = map { $self->node( $_ )->label() } @nodes;
682 return join( ' ', @words );
688 map { $h{$_} = 1 } @list;
696 This package is free software and is provided "as is" without express
697 or implied warranty. You can redistribute it and/or modify it under
698 the same terms as Perl itself.
702 Tara L Andrews, aurum@cpan.org