1 package Text::Tradition::Graph;
6 use IPC::Run qw( run binary );
8 use Text::Tradition::Graph::Position;
12 Text::Tradition::Graph
16 use Text::Tradition::Graph;
18 my $text = Text::Tradition::Graph->new( 'GraphML' => '/my/graphml/file.xml' );
19 my $text = Text::Tradition::Graph->new( 'TEI' => '/my/tei/file.xml' );
20 my $text = Text::Tradition::Graph->new( 'CSV' => '/my/csv/file.csv',
21 'base' => '/my/basefile.txt' );
22 my $text = Text::Tradition::Graph->new( 'CTE' => '/my/cte/file.txt',
23 'base' => '/my/basefile.txt' );
25 my $svg_string = $text->as_svg();
27 my $lemma_nodes = $text->active_nodes();
28 $text->toggle_node( 'some_word' );
32 A text tradition is the representation of our knowledge of a text that
33 has been passed down via manuscript copies from a time before printing
34 presses. Each text has a number of witnesses, that is, manuscripts
35 that bear a version of the text. The tradition is the aggregation of
36 these witnesses, which is to say, the collation of the text.
38 This module takes a text collation and represents it as a horizontal
39 directed graph, suitable for SVG rendering and for analysis of various
40 forms. Since this module was written by a medievalist, it also
41 provides a facility for making a critical text reconstruction by
42 choosing certain variants to be 'lemma' text - that is, text which
43 should be considered the 'standard' reading.
45 Although the graph is a very good way to render text collation, and is
46 visually very easy for a human to interpret, it doesn't have any
47 inherent information about which nodes 'go together' - that is, which
48 text readings appear in the same place as other readings. This module
49 therefore calculates 'positions' on the graph, thus holding some
50 information about which readings can and can't be substituted for
59 Constructor. Takes a source collation file from which to construct
60 the initial graph. This file can be TEI (parallel segmentation) XML,
61 CSV in a format yet to be documented, GraphML as documented (someday)
62 by CollateX, or a Classical Text Editor apparatus. For CSV and
63 Classical Text Editor files, the user must also supply a base text to
64 which the line numbering in the collation file refers.
70 my $class = ref( $proto ) || $proto;
71 my %opts = ( 'on_color' => 'yellow',
72 'off_color' => 'white',
76 # opts can be: GraphML, base+CSV, base+CTE, TEI. We need
78 my @formats = grep { /^(GraphML|CSV|CTE|TEI)$/ } keys( %opts );
79 my $format = shift( @formats );
81 warn "No data given to create a graph: need GraphML, CSV, or TEI";
84 if( $format =~ /^(CSV|CTE)$/ && !exists $opts{'base'} ) {
85 warn "Cannot make a graph from $format without a base text";
89 # Make a graph object.
90 my $collation_graph = Graph::Easy->new();
91 $collation_graph->set_attribute( 'node', 'shape', 'ellipse' );
92 # Starting point for all texts
93 my $last_node = $collation_graph->add_node( '#START#' );
95 $self->{'graph'} = $collation_graph;
96 bless( $self, $class );
99 my $mod = "Text::Tradition::Parser::$format";
101 my @args = ( $opts{ $format } );
102 if( $format =~ /^(CSV|CTE)$/ ) {
103 push( @args, $opts{'base'} );
105 $mod->can('parse')->( $self, @args );
111 ### Graph::Easy object accessor methods
114 return $self->{'graph'}->node( @_ );
119 return $self->{'graph'}->edge( @_ );
122 # Not only adds the node, but also initializes internal data
126 my $node = $self->{'graph'}->add_node( @_ );
127 $self->{'identical_nodes'}->{ $node->name() } = [ $node->name() ];
133 return $self->{'graph'}->add_edge( @_ );
140 # Delete this node out of any relevant transposition pool.
141 if( ref $node eq 'Graph::Easy::Node' ) {
142 $node = $node->name();
144 my @ident = $self->identical_nodes( $node );
147 my $pool = $self->{'identical_nodes'}->{ $ident[0] };
148 foreach my $i ( 0 .. scalar(@$pool)-1 ) {
149 if( $pool->[$i] eq $node ) {
150 splice( @$pool, $i, 1 );
155 delete $self->{'identical_nodes'}->{ $node };
157 # Now delete the node.
158 return $self->{'graph'}->del_node( @_ );
163 return $self->{'graph'}->del_edge( @_ );
168 return $self->{'graph'}->nodes( @_ );
173 return $self->{'graph'}->edges( @_ );
178 return $self->{'graph'}->merge_nodes( @_ );
181 ### Helper methods for navigating the tree
184 # Return the beginning node of the graph.
186 my( $new_start ) = @_;
188 # Fix the node transposition data
189 delete $self->{'identical_nodes'}->{ $new_start->name() };
190 $self->{'identical_nodes'}->{'#START#'} = [ '#START#' ];
191 $self->{'graph'}->rename_node( $new_start, '#START#' );
193 return $self->{'graph'}->node('#START#');
196 # Record that nodes A and B are really the same (transposed) node.
197 # We do this by maintaining some pools of transposed nodes, and
198 # we have a lookup hash so that each member of that
199 sub set_identical_node {
200 my( $self, $node, $same_node ) = @_;
201 my $pool = $self->{'identical_nodes'}->{ $node };
202 my $same_pool = $self->{'identical_nodes'}->{ $same_node };
207 foreach( @$same_pool ) {
208 push( @$pool, $_ ) unless $poolhash{$_};
211 $self->{'identical_nodes'}->{ $same_node } = $pool;
214 sub identical_nodes {
215 my( $self, $node ) = @_;
216 my @others = grep { $_ !~ /^$node$/ }
217 @{$self->{'identical_nodes'}->{ $node }};
222 # Return the successor via the corresponding edge.
223 my( $self, $node, $edge ) = @_;
224 $edge = "base text" unless $edge;
225 my @next_edges = $node->outgoing();
226 return undef unless scalar( @next_edges );
228 foreach my $e ( @next_edges ) {
229 next unless $e->label() eq $edge;
233 warn "Could not find node connected to edge $edge";
238 # Return the predecessor via the corresponding edge.
239 my( $self, $node, $edge ) = @_;
240 $edge = "base text" unless $edge;
241 my @prior_edges = $node->incoming();
242 return undef unless scalar( @prior_edges );
244 foreach my $e ( @prior_edges ) {
245 next unless $e->label() eq $edge;
249 warn "Could not find node connected from edge $edge";
254 my( $self, $start, $end, $label ) = @_;
255 # TODO make label able to follow a single MS
256 unless( ref( $start ) eq 'Graph::Easy::Node'
257 && ref( $end ) eq 'Graph::Easy::Node' ) {
258 warn "Called node_sequence without two nodes!";
261 $label = 'base text' unless $label;
262 my @nodes = ( $start );
265 while( $n ne $end ) {
266 if( exists( $seen{$n->name()} ) ) {
267 warn "Detected loop at " . $n->name();
270 $seen{$n->name()} = 1;
272 my @edges = $n->outgoing();
273 my @relevant_edges = grep { $_->label =~ /^$label$/ } @edges;
274 warn "Did not find an edge $label from node " . $n->label
275 unless scalar @relevant_edges;
276 warn "Found more than one edge $label from node " . $n->label
277 unless scalar @relevant_edges == 1;
278 my $next = $relevant_edges[0]->to();
279 push( @nodes, $next );
282 # Check that the last node is our end node.
283 my $last = $nodes[$#nodes];
284 warn "Last node found from " . $start->label() .
285 " via path $label is not the end!"
286 unless $last eq $end;
292 my( $self, $start, $end, $label ) = @_;
294 my @nodes = $self->node_sequence( $start, $end, $label );
295 my @words = map { $_->label() } @nodes;
296 return join( ' ', @words );
299 ## Output. We use GraphViz for the layout because it handles large
300 ## graphs better than Graph::Easy does natively.
303 my( $self, $recalc ) = @_;
304 return $self->{'svg'} if( exists $self->{'svg'} && !$recalc );
306 $self->{'graphviz'} = $self->{'graph'}->as_graphviz()
307 unless( exists $self->{'graphviz'} && !$recalc );
309 my @cmd = qw/dot -Tsvg/;
311 my $in = $self->{'graphviz'};
312 run( \@cmd, \$in, ">", binary(), \$svg );
313 $self->{'svg'} = $svg;
317 ## Methods for lemmatizing a text.
319 sub init_lemmatizer {
321 # Initialize the 'lemma' hash, going through all the nodes and seeing
322 # which ones are common nodes. This should only be run once.
324 return if( $self->{'lemmatizer_initialized'} );
325 my @active_names = map { $_->name } grep { $self->is_common( $_ ) }
327 $self->{'positions'}->init_lemmatizer( @active_names );
328 $self->{'lemmatizer_initialized'} = 1;
333 my( $self, $common_nodes, $paths ) = @_;
334 my $positions = Text::Tradition::Graph::Position->new( $common_nodes, $paths );
335 $self->{'positions'} = $positions;
338 # Takes a list of nodes that have just been turned off, and returns a
339 # set of tuples of the form ['node', 'state'] that indicates what
340 # changes need to be made to the graph.
341 # A state of 1 means 'turn on this node'
342 # A state of 0 means 'turn off this node'
343 # A state of undef means 'an ellipsis belongs in the text here because
344 # no decision has been made'
346 my( $self, @toggled_off_nodes ) = @_;
348 # In case this is the first run
349 $self->init_lemmatizer();
350 # First get the positions of those nodes which have been
352 my $positions_off = {};
353 map { $positions_off->{ $self->{'positions'}->node_position( $_ ) } = $_ }
357 # Now for each position, we have to see if a node is on, and we
358 # have to see if a node has been turned off.
360 foreach my $pos ( $self->{'positions'}->all() ) {
361 # Find the state of this position. If there is an active node,
362 # its name will be the state; otherwise the state will be 0
363 # (nothing at this position) or undef (ellipsis at this position)
364 my $active = $self->{'positions'}->state( $pos );
366 # Is there a formerly active node that was toggled off?
367 if( exists( $positions_off->{$pos} ) ) {
368 my $off_node = $positions_off->{$pos};
369 if( $active && $active ne $off_node) {
370 push( @answer, [ $off_node, 0 ], [ $active, 1 ] );
372 push( @answer, [ $off_node, $active ] );
375 # No formerly active node, so we just see if there is a currently
378 # Push the active node, whatever it is.
379 push( @answer, [ $active, 1 ] );
381 # Push the state that is there. Arbitrarily use the first node
383 my @pos_nodes = $self->{'positions'}->nodes_at_position( $pos );
385 [ $pos_nodes[0], $self->{'positions'}->state( $pos ) ] );
392 # A couple of helpers. TODO These should be gathered in the same place
396 my( $self, $node ) = @_;
397 $node = $self->_nodeobj( $node );
398 return $node->get_attribute('class') eq 'common';
402 my( $self, $node ) = @_;
403 unless( ref $node eq 'Graph::Easy::Node' ) {
404 $node = $self->node( $node );
409 # toggle_node takes a node name, and either lemmatizes or de-lemmatizes it.
410 # Returns a list of nodes that are de-lemmatized as a result of the toggle.
413 my( $self, $node ) = @_;
415 # In case this is being called for the first time.
416 $self->init_lemmatizer();
418 if( $self->is_common( $node ) ) {
419 # Do nothing, it's a common node.
423 my $pos = $self->{'positions'}->node_position( $node );
424 my $old_state = $self->{'positions'}->state( $pos );
426 if( $old_state && $old_state eq $node ) {
427 # Turn off the node. We turn on no others by default.
428 push( @nodes_off, $node );
431 $self->{'positions'}->set_state( $pos, $node );
432 # Any other 'on' nodes in the same position should be off.
433 push( @nodes_off, $self->colocated_nodes( $node ) );
434 # Any node that is an identical transposed one should be off.
435 push( @nodes_off, $self->identical_nodes( $node ) )
436 if $self->identical_nodes( $node );
438 @nodes_off = unique_list( @nodes_off );
440 # Turn off the nodes that need to be turned off.
441 my @nodes_turned_off;
442 foreach my $n ( @nodes_off ) {
443 my $npos = $self->{'positions'}->node_position( $n );
444 my $state = $self->{'positions'}->state( $npos );
445 if( $state && $state eq $n ) {
446 # this node is still on
447 push( @nodes_turned_off, $n );
448 my $new_state = undef;
450 # This is the node that was clicked, so if there are no
451 # other nodes there, turn off the position. In all other
452 # cases, restore the ellipsis.
453 my @all_n = $self->{'positions'}->nodes_at_position( $pos );
454 $new_state = 0 if scalar( @all_n ) == 1;
456 $self->{'positions'}->set_state( $npos, $new_state );
457 } elsif( $old_state && $old_state eq $n ) {
458 # another node has already been turned on here
459 push( @nodes_turned_off, $n );
460 } # else some other node was on anyway, so pass.
462 return @nodes_turned_off;
465 sub colocated_nodes {
467 return $self->{'positions'}->colocated_nodes( @_ );
471 my( $self, $node_id ) = @_;
472 # This is the label of the given node.
473 return $self->node( $node_id )->label();
476 sub text_for_witness {
477 my( $self, $wit ) = @_;
479 my @nodes = $self->{'positions'}->witness_path( $wit );
480 my @words = map { $self->node( $_ )->label() } @nodes;
481 return join( ' ', @words );
487 map { $h{$_} = 1 } @list;