Change namespace
[scpubgit/stemmatology.git] / lib / Text / Tradition / Graph.pm
CommitLineData
e58153d6 1package Text::Tradition::Graph;
b49c4318 2
3use strict;
4use warnings;
5use Graph::Easy;
6use IPC::Run qw( run binary );
7use Module::Load;
e58153d6 8use Text::Tradition::Graph::Position;
a25d4374 9
10=head1 NAME
11
e58153d6 12Text::Tradition::Graph
a25d4374 13
14=head1 SYNOPSIS
15
e58153d6 16use Text::Tradition::Graph;
a25d4374 17
e58153d6 18my $text = Text::Tradition::Graph->new( 'GraphML' => '/my/graphml/file.xml' );
19my $text = Text::Tradition::Graph->new( 'TEI' => '/my/tei/file.xml' );
20my $text = Text::Tradition::Graph->new( 'CSV' => '/my/csv/file.csv',
a25d4374 21 'base' => '/my/basefile.txt' );
e58153d6 22my $text = Text::Tradition::Graph->new( 'CTE' => '/my/cte/file.txt',
a25d4374 23 'base' => '/my/basefile.txt' );
24
25my $svg_string = $text->as_svg();
26
27my $lemma_nodes = $text->active_nodes();
28$text->toggle_node( 'some_word' );
29
30=head1 DESCRIPTION
31
32A text tradition is the representation of our knowledge of a text that
33has been passed down via manuscript copies from a time before printing
34presses. Each text has a number of witnesses, that is, manuscripts
35that bear a version of the text. The tradition is the aggregation of
36these witnesses, which is to say, the collation of the text.
37
38This module takes a text collation and represents it as a horizontal
39directed graph, suitable for SVG rendering and for analysis of various
40forms. Since this module was written by a medievalist, it also
41provides a facility for making a critical text reconstruction by
42choosing certain variants to be 'lemma' text - that is, text which
43should be considered the 'standard' reading.
44
45Although the graph is a very good way to render text collation, and is
46visually very easy for a human to interpret, it doesn't have any
47inherent information about which nodes 'go together' - that is, which
48text readings appear in the same place as other readings. This module
49therefore calculates 'positions' on the graph, thus holding some
50information about which readings can and can't be substituted for
51others.
52
53=head1 METHODS
54
55=over 4
56
57=item B<new>
58
59Constructor. Takes a source collation file from which to construct
60the initial graph. This file can be TEI (parallel segmentation) XML,
61CSV in a format yet to be documented, GraphML as documented (someday)
62by CollateX, or a Classical Text Editor apparatus. For CSV and
63Classical Text Editor files, the user must also supply a base text to
64which the line numbering in the collation file refers.
65
66=cut
b49c4318 67
68sub new {
69 my $proto = shift;
70 my $class = ref( $proto ) || $proto;
71 my %opts = ( 'on_color' => 'yellow',
72 'off_color' => 'white',
73 @_ );
74 my $self = {};
75
76 # opts can be: GraphML, base+CSV, base+CTE, TEI. We need
77 # something to parse.
78 my @formats = grep { /^(GraphML|CSV|CTE|TEI)$/ } keys( %opts );
79 my $format = shift( @formats );
80 unless( $format ) {
81 warn "No data given to create a graph: need GraphML, CSV, or TEI";
82 return;
83 }
84 if( $format =~ /^(CSV|CTE)$/ && !exists $opts{'base'} ) {
85 warn "Cannot make a graph from $format without a base text";
86 return;
87 }
88
89 # Make a graph object.
90 my $collation_graph = Graph::Easy->new();
91 $collation_graph->set_attribute( 'node', 'shape', 'ellipse' );
92 # Starting point for all texts
93 my $last_node = $collation_graph->add_node( '#START#' );
94
95 $self->{'graph'} = $collation_graph;
96 bless( $self, $class );
97
98 # Now do the parsing.
e58153d6 99 my $mod = "Text::Tradition::Parser::$format";
b49c4318 100 load( $mod );
101 my @args = ( $opts{ $format } );
102 if( $format =~ /^(CSV|CTE)$/ ) {
103 push( @args, $opts{'base'} );
104 }
105 $mod->can('parse')->( $self, @args );
106
107 return $self;
108}
109
110
111### Graph::Easy object accessor methods
112sub node {
113 my $self = shift;
114 return $self->{'graph'}->node( @_ );
115}
116
117sub edge {
118 my $self = shift;
119 return $self->{'graph'}->edge( @_ );
120}
121
c2d16875 122# Not only adds the node, but also initializes internal data
123# about the node.
b49c4318 124sub add_node {
125 my $self = shift;
c2d16875 126 my $node = $self->{'graph'}->add_node( @_ );
127 $self->{'identical_nodes'}->{ $node->name() } = [ $node->name() ];
128 return $node;
b49c4318 129}
130
131sub add_edge {
132 my $self = shift;
133 return $self->{'graph'}->add_edge( @_ );
134}
135
136sub del_node {
137 my $self = shift;
c2d16875 138 my $node = $_[0];
139
140 # Delete this node out of any relevant transposition pool.
141 if( ref $node eq 'Graph::Easy::Node' ) {
142 $node = $node->name();
143 }
144 my @ident = $self->identical_nodes( $node );
145 if( @ident ) {
146 # Get the pool.
147 my $pool = $self->{'identical_nodes'}->{ $ident[0] };
148 foreach my $i ( 0 .. scalar(@$pool)-1 ) {
149 if( $pool->[$i] eq $node ) {
150 splice( @$pool, $i, 1 );
151 last;
152 }
153 }
154 }
155 delete $self->{'identical_nodes'}->{ $node };
156
157 # Now delete the node.
b49c4318 158 return $self->{'graph'}->del_node( @_ );
159}
160
161sub del_edge {
162 my $self = shift;
163 return $self->{'graph'}->del_edge( @_ );
164}
165
166sub nodes {
167 my $self = shift;
168 return $self->{'graph'}->nodes( @_ );
169}
170
171sub edges {
172 my $self = shift;
173 return $self->{'graph'}->edges( @_ );
174}
175
176sub merge_nodes {
177 my $self = shift;
178 return $self->{'graph'}->merge_nodes( @_ );
179}
180
181### Helper methods for navigating the tree
182
183sub start {
184 # Return the beginning node of the graph.
185 my $self = shift;
186 my( $new_start ) = @_;
187 if( $new_start ) {
c2d16875 188 # Fix the node transposition data
189 delete $self->{'identical_nodes'}->{ $new_start->name() };
190 $self->{'identical_nodes'}->{'#START#'} = [ '#START#' ];
b49c4318 191 $self->{'graph'}->rename_node( $new_start, '#START#' );
192 }
193 return $self->{'graph'}->node('#START#');
194}
195
c2d16875 196# Record that nodes A and B are really the same (transposed) node.
197# We do this by maintaining some pools of transposed nodes, and
198# we have a lookup hash so that each member of that
199sub set_identical_node {
200 my( $self, $node, $same_node ) = @_;
201 my $pool = $self->{'identical_nodes'}->{ $node };
202 my $same_pool = $self->{'identical_nodes'}->{ $same_node };
203 my %poolhash;
204 foreach ( @$pool ) {
205 $poolhash{$_} = 1;
206 }
207 foreach( @$same_pool ) {
208 push( @$pool, $_ ) unless $poolhash{$_};
209 }
210
211 $self->{'identical_nodes'}->{ $same_node } = $pool;
212}
213
214sub identical_nodes {
215 my( $self, $node ) = @_;
216 my @others = grep { $_ !~ /^$node$/ }
217 @{$self->{'identical_nodes'}->{ $node }};
218 return @others;
b49c4318 219}
220
221sub next_word {
222 # Return the successor via the corresponding edge.
223 my( $self, $node, $edge ) = @_;
224 $edge = "base text" unless $edge;
225 my @next_edges = $node->outgoing();
226 return undef unless scalar( @next_edges );
227
228 foreach my $e ( @next_edges ) {
229 next unless $e->label() eq $edge;
230 return $e->to();
231 }
232
233 warn "Could not find node connected to edge $edge";
234 return undef;
235}
236
237sub prior_word {
238 # Return the predecessor via the corresponding edge.
239 my( $self, $node, $edge ) = @_;
240 $edge = "base text" unless $edge;
241 my @prior_edges = $node->incoming();
242 return undef unless scalar( @prior_edges );
243
244 foreach my $e ( @prior_edges ) {
245 next unless $e->label() eq $edge;
246 return $e->from();
247 }
248
249 warn "Could not find node connected from edge $edge";
250 return undef;
251}
252
253sub node_sequence {
254 my( $self, $start, $end, $label ) = @_;
255 # TODO make label able to follow a single MS
256 unless( ref( $start ) eq 'Graph::Easy::Node'
257 && ref( $end ) eq 'Graph::Easy::Node' ) {
258 warn "Called node_sequence without two nodes!";
259 return ();
260 }
261 $label = 'base text' unless $label;
262 my @nodes = ( $start );
263 my %seen;
264 my $n = $start;
265 while( $n ne $end ) {
266 if( exists( $seen{$n->name()} ) ) {
267 warn "Detected loop at " . $n->name();
268 last;
269 }
270 $seen{$n->name()} = 1;
271
272 my @edges = $n->outgoing();
273 my @relevant_edges = grep { $_->label =~ /^$label$/ } @edges;
274 warn "Did not find an edge $label from node " . $n->label
275 unless scalar @relevant_edges;
276 warn "Found more than one edge $label from node " . $n->label
277 unless scalar @relevant_edges == 1;
278 my $next = $relevant_edges[0]->to();
279 push( @nodes, $next );
280 $n = $next;
281 }
282 # Check that the last node is our end node.
283 my $last = $nodes[$#nodes];
284 warn "Last node found from " . $start->label() .
285 " via path $label is not the end!"
286 unless $last eq $end;
287
288 return @nodes;
289}
290
291sub string_lemma {
292 my( $self, $start, $end, $label ) = @_;
293
294 my @nodes = $self->node_sequence( $start, $end, $label );
295 my @words = map { $_->label() } @nodes;
296 return join( ' ', @words );
297}
298
299## Output. We use GraphViz for the layout because it handles large
300## graphs better than Graph::Easy does natively.
301
302sub as_svg {
303 my( $self, $recalc ) = @_;
304 return $self->{'svg'} if( exists $self->{'svg'} && !$recalc );
305
306 $self->{'graphviz'} = $self->{'graph'}->as_graphviz()
307 unless( exists $self->{'graphviz'} && !$recalc );
308
309 my @cmd = qw/dot -Tsvg/;
310 my( $svg, $err );
311 my $in = $self->{'graphviz'};
312 run( \@cmd, \$in, ">", binary(), \$svg );
313 $self->{'svg'} = $svg;
314 return $svg;
315}
316
a25d4374 317## Methods for lemmatizing a text.
b49c4318 318
a25d4374 319sub init_lemmatizer {
320 my $self = shift;
321 # Initialize the 'lemma' hash, going through all the nodes and seeing
58a3c424 322 # which ones are common nodes. This should only be run once.
a25d4374 323
58a3c424 324 return if( $self->{'lemmatizer_initialized'} );
325 my @active_names = map { $_->name } grep { $self->is_common( $_ ) }
326 $self->nodes();
327 $self->{'positions'}->init_lemmatizer( @active_names );
328 $self->{'lemmatizer_initialized'} = 1;
a25d4374 329
a25d4374 330}
331
332sub make_positions {
333 my( $self, $common_nodes, $paths ) = @_;
e58153d6 334 my $positions = Text::Tradition::Graph::Position->new( $common_nodes, $paths );
a25d4374 335 $self->{'positions'} = $positions;
336}
337
338# Takes a list of nodes that have just been turned off, and returns a
339# set of tuples of the form ['node', 'state'] that indicates what
340# changes need to be made to the graph.
341# A state of 1 means 'turn on this node'
342# A state of 0 means 'turn off this node'
343# A state of undef means 'an ellipsis belongs in the text here because
344# no decision has been made'
b49c4318 345sub active_nodes {
346 my( $self, @toggled_off_nodes ) = @_;
a25d4374 347
348 # In case this is the first run
349 $self->init_lemmatizer();
350 # First get the positions of those nodes which have been
351 # toggled off.
b49c4318 352 my $positions_off = {};
a25d4374 353 map { $positions_off->{ $self->{'positions'}->node_position( $_ ) } = $_ }
354 @toggled_off_nodes;
355
b49c4318 356
357 # Now for each position, we have to see if a node is on, and we
358 # have to see if a node has been turned off.
359 my @answer;
a25d4374 360 foreach my $pos ( $self->{'positions'}->all() ) {
58a3c424 361 # Find the state of this position. If there is an active node,
362 # its name will be the state; otherwise the state will be 0
363 # (nothing at this position) or undef (ellipsis at this position)
364 my $active = $self->{'positions'}->state( $pos );
a25d4374 365
b49c4318 366 # Is there a formerly active node that was toggled off?
367 if( exists( $positions_off->{$pos} ) ) {
a25d4374 368 my $off_node = $positions_off->{$pos};
58a3c424 369 if( $active && $active ne $off_node) {
b49c4318 370 push( @answer, [ $off_node, 0 ], [ $active, 1 ] );
b49c4318 371 } else {
58a3c424 372 push( @answer, [ $off_node, $active ] );
b49c4318 373 }
58a3c424 374
b49c4318 375 # No formerly active node, so we just see if there is a currently
376 # active one.
377 } elsif( $active ) {
378 # Push the active node, whatever it is.
379 push( @answer, [ $active, 1 ] );
380 } else {
58a3c424 381 # Push the state that is there. Arbitrarily use the first node
382 # at that position.
383 my @pos_nodes = $self->{'positions'}->nodes_at_position( $pos );
384 push( @answer,
385 [ $pos_nodes[0], $self->{'positions'}->state( $pos ) ] );
b49c4318 386 }
387 }
58a3c424 388
b49c4318 389 return @answer;
390}
391
a25d4374 392# A couple of helpers. TODO These should be gathered in the same place
393# eventually
b49c4318 394
a25d4374 395sub is_common {
396 my( $self, $node ) = @_;
397 $node = $self->_nodeobj( $node );
398 return $node->get_attribute('class') eq 'common';
b49c4318 399}
400
a25d4374 401sub _nodeobj {
402 my( $self, $node ) = @_;
403 unless( ref $node eq 'Graph::Easy::Node' ) {
404 $node = $self->node( $node );
b49c4318 405 }
a25d4374 406 return $node;
b49c4318 407}
408
a25d4374 409# toggle_node takes a node name, and either lemmatizes or de-lemmatizes it.
410# Returns a list of nodes that are de-lemmatized as a result of the toggle.
b49c4318 411
b49c4318 412sub toggle_node {
a25d4374 413 my( $self, $node ) = @_;
414
415 # In case this is being called for the first time.
416 $self->init_lemmatizer();
b49c4318 417
a25d4374 418 if( $self->is_common( $node ) ) {
419 # Do nothing, it's a common node.
420 return;
421 }
58a3c424 422
423 my $pos = $self->{'positions'}->node_position( $node );
424 my $old_state = $self->{'positions'}->state( $pos );
b49c4318 425 my @nodes_off;
58a3c424 426 if( $old_state && $old_state eq $node ) {
427 # Turn off the node. We turn on no others by default.
428 push( @nodes_off, $node );
429 } else {
b49c4318 430 # Turn on the node.
58a3c424 431 $self->{'positions'}->set_state( $pos, $node );
432 # Any other 'on' nodes in the same position should be off.
a25d4374 433 push( @nodes_off, $self->colocated_nodes( $node ) );
58a3c424 434 # Any node that is an identical transposed one should be off.
a25d4374 435 push( @nodes_off, $self->identical_nodes( $node ) )
436 if $self->identical_nodes( $node );
b49c4318 437 }
a25d4374 438 @nodes_off = unique_list( @nodes_off );
b49c4318 439
440 # Turn off the nodes that need to be turned off.
58a3c424 441 my @nodes_turned_off;
442 foreach my $n ( @nodes_off ) {
443 my $npos = $self->{'positions'}->node_position( $n );
444 my $state = $self->{'positions'}->state( $npos );
445 if( $state && $state eq $n ) {
446 # this node is still on
447 push( @nodes_turned_off, $n );
448 my $new_state = undef;
449 if( $n eq $node ) {
450 # This is the node that was clicked, so if there are no
451 # other nodes there, turn off the position. In all other
452 # cases, restore the ellipsis.
453 my @all_n = $self->{'positions'}->nodes_at_position( $pos );
454 $new_state = 0 if scalar( @all_n ) == 1;
455 }
456 $self->{'positions'}->set_state( $npos, $new_state );
457 } elsif( $old_state && $old_state eq $n ) {
458 # another node has already been turned on here
459 push( @nodes_turned_off, $n );
460 } # else some other node was on anyway, so pass.
461 }
462 return @nodes_turned_off;
b49c4318 463}
464
b49c4318 465sub colocated_nodes {
a25d4374 466 my $self = shift;
467 return $self->{'positions'}->colocated_nodes( @_ );
b49c4318 468}
469
a25d4374 470sub text_of_node {
471 my( $self, $node_id ) = @_;
472 # This is the label of the given node.
473 return $self->node( $node_id )->label();
b49c4318 474}
475
476sub text_for_witness {
477 my( $self, $wit ) = @_;
b49c4318 478
a25d4374 479 my @nodes = $self->{'positions'}->witness_path( $wit );
480 my @words = map { $self->node( $_ )->label() } @nodes;
b49c4318 481 return join( ' ', @words );
482}
483
a25d4374 484sub unique_list {
485 my( @list ) = @_;
486 my %h;
487 map { $h{$_} = 1 } @list;
488 return keys( %h );
b49c4318 489}
a25d4374 490
b49c4318 4911;
a25d4374 492