1 package Traditions::Graph;
6 use IPC::Run qw( run binary );
8 use Traditions::Graph::Position;
12 (Text?)::Traditions::Graph
16 use Traditions::Graph;
18 my $text = Traditions::Graph->new( 'GraphML' => '/my/graphml/file.xml' );
19 my $text = Traditions::Graph->new( 'TEI' => '/my/tei/file.xml' );
20 my $text = Traditions::Graph->new( 'CSV' => '/my/csv/file.csv',
21 'base' => '/my/basefile.txt' );
22 my $text = Traditions::Graph->new( 'CTE' => '/my/cte/file.txt',
23 'base' => '/my/basefile.txt' );
25 my $svg_string = $text->as_svg();
27 my $lemma_nodes = $text->active_nodes();
28 $text->toggle_node( 'some_word' );
32 A text tradition is the representation of our knowledge of a text that
33 has been passed down via manuscript copies from a time before printing
34 presses. Each text has a number of witnesses, that is, manuscripts
35 that bear a version of the text. The tradition is the aggregation of
36 these witnesses, which is to say, the collation of the text.
38 This module takes a text collation and represents it as a horizontal
39 directed graph, suitable for SVG rendering and for analysis of various
40 forms. Since this module was written by a medievalist, it also
41 provides a facility for making a critical text reconstruction by
42 choosing certain variants to be 'lemma' text - that is, text which
43 should be considered the 'standard' reading.
45 Although the graph is a very good way to render text collation, and is
46 visually very easy for a human to interpret, it doesn't have any
47 inherent information about which nodes 'go together' - that is, which
48 text readings appear in the same place as other readings. This module
49 therefore calculates 'positions' on the graph, thus holding some
50 information about which readings can and can't be substituted for
59 Constructor. Takes a source collation file from which to construct
60 the initial graph. This file can be TEI (parallel segmentation) XML,
61 CSV in a format yet to be documented, GraphML as documented (someday)
62 by CollateX, or a Classical Text Editor apparatus. For CSV and
63 Classical Text Editor files, the user must also supply a base text to
64 which the line numbering in the collation file refers.
70 my $class = ref( $proto ) || $proto;
71 my %opts = ( 'on_color' => 'yellow',
72 'off_color' => 'white',
76 # opts can be: GraphML, base+CSV, base+CTE, TEI. We need
78 my @formats = grep { /^(GraphML|CSV|CTE|TEI)$/ } keys( %opts );
79 my $format = shift( @formats );
81 warn "No data given to create a graph: need GraphML, CSV, or TEI";
84 if( $format =~ /^(CSV|CTE)$/ && !exists $opts{'base'} ) {
85 warn "Cannot make a graph from $format without a base text";
89 # Make a graph object.
90 my $collation_graph = Graph::Easy->new();
91 $collation_graph->set_attribute( 'node', 'shape', 'ellipse' );
92 # Starting point for all texts
93 my $last_node = $collation_graph->add_node( '#START#' );
95 $self->{'graph'} = $collation_graph;
96 bless( $self, $class );
99 my $mod = "Traditions::Parser::$format";
101 my @args = ( $opts{ $format } );
102 if( $format =~ /^(CSV|CTE)$/ ) {
103 push( @args, $opts{'base'} );
105 $mod->can('parse')->( $self, @args );
111 ### Graph::Easy object accessor methods
114 return $self->{'graph'}->node( @_ );
119 return $self->{'graph'}->edge( @_ );
124 return $self->{'graph'}->add_node( @_ );
129 return $self->{'graph'}->add_edge( @_ );
134 return $self->{'graph'}->del_node( @_ );
139 return $self->{'graph'}->del_edge( @_ );
144 return $self->{'graph'}->nodes( @_ );
149 return $self->{'graph'}->edges( @_ );
154 return $self->{'graph'}->merge_nodes( @_ );
157 ### Helper methods for navigating the tree
160 # Return the beginning node of the graph.
162 my( $new_start ) = @_;
164 $self->{'graph'}->rename_node( $new_start, '#START#' );
166 return $self->{'graph'}->node('#START#');
169 sub set_identical_nodes {
170 my( $self, $node_hash ) = @_;
171 $self->{'identical_nodes'} = $node_hash;
175 # Return the successor via the corresponding edge.
176 my( $self, $node, $edge ) = @_;
177 $edge = "base text" unless $edge;
178 my @next_edges = $node->outgoing();
179 return undef unless scalar( @next_edges );
181 foreach my $e ( @next_edges ) {
182 next unless $e->label() eq $edge;
186 warn "Could not find node connected to edge $edge";
191 # Return the predecessor via the corresponding edge.
192 my( $self, $node, $edge ) = @_;
193 $edge = "base text" unless $edge;
194 my @prior_edges = $node->incoming();
195 return undef unless scalar( @prior_edges );
197 foreach my $e ( @prior_edges ) {
198 next unless $e->label() eq $edge;
202 warn "Could not find node connected from edge $edge";
207 my( $self, $start, $end, $label ) = @_;
208 # TODO make label able to follow a single MS
209 unless( ref( $start ) eq 'Graph::Easy::Node'
210 && ref( $end ) eq 'Graph::Easy::Node' ) {
211 warn "Called node_sequence without two nodes!";
214 $label = 'base text' unless $label;
215 my @nodes = ( $start );
218 while( $n ne $end ) {
219 if( exists( $seen{$n->name()} ) ) {
220 warn "Detected loop at " . $n->name();
223 $seen{$n->name()} = 1;
225 my @edges = $n->outgoing();
226 my @relevant_edges = grep { $_->label =~ /^$label$/ } @edges;
227 warn "Did not find an edge $label from node " . $n->label
228 unless scalar @relevant_edges;
229 warn "Found more than one edge $label from node " . $n->label
230 unless scalar @relevant_edges == 1;
231 my $next = $relevant_edges[0]->to();
232 push( @nodes, $next );
235 # Check that the last node is our end node.
236 my $last = $nodes[$#nodes];
237 warn "Last node found from " . $start->label() .
238 " via path $label is not the end!"
239 unless $last eq $end;
245 my( $self, $start, $end, $label ) = @_;
247 my @nodes = $self->node_sequence( $start, $end, $label );
248 my @words = map { $_->label() } @nodes;
249 return join( ' ', @words );
252 ## Output. We use GraphViz for the layout because it handles large
253 ## graphs better than Graph::Easy does natively.
256 my( $self, $recalc ) = @_;
257 return $self->{'svg'} if( exists $self->{'svg'} && !$recalc );
259 $self->{'graphviz'} = $self->{'graph'}->as_graphviz()
260 unless( exists $self->{'graphviz'} && !$recalc );
262 my @cmd = qw/dot -Tsvg/;
264 my $in = $self->{'graphviz'};
265 run( \@cmd, \$in, ">", binary(), \$svg );
266 $self->{'svg'} = $svg;
270 ## Methods for lemmatizing a text.
272 sub init_lemmatizer {
274 # Initialize the 'lemma' hash, going through all the nodes and seeing
275 # which ones are common nodes. This should only be run once.
277 return if( $self->{'lemmatizer_initialized'} );
278 my @active_names = map { $_->name } grep { $self->is_common( $_ ) }
280 $self->{'positions'}->init_lemmatizer( @active_names );
281 $self->{'lemmatizer_initialized'} = 1;
286 my( $self, $common_nodes, $paths ) = @_;
287 my $positions = Traditions::Graph::Position->new( $common_nodes, $paths );
288 $self->{'positions'} = $positions;
291 # Takes a list of nodes that have just been turned off, and returns a
292 # set of tuples of the form ['node', 'state'] that indicates what
293 # changes need to be made to the graph.
294 # A state of 1 means 'turn on this node'
295 # A state of 0 means 'turn off this node'
296 # A state of undef means 'an ellipsis belongs in the text here because
297 # no decision has been made'
299 my( $self, @toggled_off_nodes ) = @_;
301 # In case this is the first run
302 $self->init_lemmatizer();
303 # First get the positions of those nodes which have been
305 my $positions_off = {};
306 map { $positions_off->{ $self->{'positions'}->node_position( $_ ) } = $_ }
310 # Now for each position, we have to see if a node is on, and we
311 # have to see if a node has been turned off.
313 foreach my $pos ( $self->{'positions'}->all() ) {
314 # Find the state of this position. If there is an active node,
315 # its name will be the state; otherwise the state will be 0
316 # (nothing at this position) or undef (ellipsis at this position)
317 my $active = $self->{'positions'}->state( $pos );
319 # Is there a formerly active node that was toggled off?
320 if( exists( $positions_off->{$pos} ) ) {
321 my $off_node = $positions_off->{$pos};
322 if( $active && $active ne $off_node) {
323 push( @answer, [ $off_node, 0 ], [ $active, 1 ] );
325 push( @answer, [ $off_node, $active ] );
328 # No formerly active node, so we just see if there is a currently
331 # Push the active node, whatever it is.
332 push( @answer, [ $active, 1 ] );
334 # Push the state that is there. Arbitrarily use the first node
336 my @pos_nodes = $self->{'positions'}->nodes_at_position( $pos );
338 [ $pos_nodes[0], $self->{'positions'}->state( $pos ) ] );
345 # A couple of helpers. TODO These should be gathered in the same place
349 my( $self, $node ) = @_;
350 $node = $self->_nodeobj( $node );
351 return $node->get_attribute('class') eq 'common';
355 my( $self, $node ) = @_;
356 unless( ref $node eq 'Graph::Easy::Node' ) {
357 $node = $self->node( $node );
362 # toggle_node takes a node name, and either lemmatizes or de-lemmatizes it.
363 # Returns a list of nodes that are de-lemmatized as a result of the toggle.
366 my( $self, $node ) = @_;
368 # In case this is being called for the first time.
369 $self->init_lemmatizer();
371 if( $self->is_common( $node ) ) {
372 # Do nothing, it's a common node.
376 my $pos = $self->{'positions'}->node_position( $node );
377 my $old_state = $self->{'positions'}->state( $pos );
379 if( $old_state && $old_state eq $node ) {
380 # Turn off the node. We turn on no others by default.
381 push( @nodes_off, $node );
384 $self->{'positions'}->set_state( $pos, $node );
385 # Any other 'on' nodes in the same position should be off.
386 push( @nodes_off, $self->colocated_nodes( $node ) );
387 # Any node that is an identical transposed one should be off.
388 push( @nodes_off, $self->identical_nodes( $node ) )
389 if $self->identical_nodes( $node );
391 @nodes_off = unique_list( @nodes_off );
393 # Turn off the nodes that need to be turned off.
394 my @nodes_turned_off;
395 foreach my $n ( @nodes_off ) {
396 my $npos = $self->{'positions'}->node_position( $n );
397 my $state = $self->{'positions'}->state( $npos );
398 if( $state && $state eq $n ) {
399 # this node is still on
400 push( @nodes_turned_off, $n );
401 my $new_state = undef;
403 # This is the node that was clicked, so if there are no
404 # other nodes there, turn off the position. In all other
405 # cases, restore the ellipsis.
406 my @all_n = $self->{'positions'}->nodes_at_position( $pos );
407 $new_state = 0 if scalar( @all_n ) == 1;
409 $self->{'positions'}->set_state( $npos, $new_state );
410 } elsif( $old_state && $old_state eq $n ) {
411 # another node has already been turned on here
412 push( @nodes_turned_off, $n );
413 } # else some other node was on anyway, so pass.
415 return @nodes_turned_off;
418 sub colocated_nodes {
420 return $self->{'positions'}->colocated_nodes( @_ );
423 sub identical_nodes {
424 my( $self, $node ) = @_;
425 return undef unless exists $self->{'identical_nodes'} &&
426 exists $self->{'identical_nodes'}->{$node};
427 return $self->{'identical_nodes'}->{$node};
431 my( $self, $node_id ) = @_;
432 # This is the label of the given node.
433 return $self->node( $node_id )->label();
436 sub text_for_witness {
437 my( $self, $wit ) = @_;
439 my @nodes = $self->{'positions'}->witness_path( $wit );
440 my @words = map { $self->node( $_ )->label() } @nodes;
441 return join( ' ', @words );
447 map { $h{$_} = 1 } @list;