Commit | Line | Data |
b49c4318 |
1 | package Traditions::Graph; |
2 | |
3 | use strict; |
4 | use warnings; |
5 | use Graph::Easy; |
6 | use IPC::Run qw( run binary ); |
7 | use Module::Load; |
a25d4374 |
8 | use Traditions::Graph::Position; |
9 | |
10 | =head1 NAME |
11 | |
12 | (Text?)::Traditions::Graph |
13 | |
14 | =head1 SYNOPSIS |
15 | |
16 | use Traditions::Graph; |
17 | |
18 | my $text = Traditions::Graph->new( 'GraphML' => '/my/graphml/file.xml' ); |
19 | my $text = Traditions::Graph->new( 'TEI' => '/my/tei/file.xml' ); |
20 | my $text = Traditions::Graph->new( 'CSV' => '/my/csv/file.csv', |
21 | 'base' => '/my/basefile.txt' ); |
22 | my $text = Traditions::Graph->new( 'CTE' => '/my/cte/file.txt', |
23 | 'base' => '/my/basefile.txt' ); |
24 | |
25 | my $svg_string = $text->as_svg(); |
26 | |
27 | my $lemma_nodes = $text->active_nodes(); |
28 | $text->toggle_node( 'some_word' ); |
29 | |
30 | =head1 DESCRIPTION |
31 | |
32 | A text tradition is the representation of our knowledge of a text that |
33 | has been passed down via manuscript copies from a time before printing |
34 | presses. Each text has a number of witnesses, that is, manuscripts |
35 | that bear a version of the text. The tradition is the aggregation of |
36 | these witnesses, which is to say, the collation of the text. |
37 | |
38 | This module takes a text collation and represents it as a horizontal |
39 | directed graph, suitable for SVG rendering and for analysis of various |
40 | forms. Since this module was written by a medievalist, it also |
41 | provides a facility for making a critical text reconstruction by |
42 | choosing certain variants to be 'lemma' text - that is, text which |
43 | should be considered the 'standard' reading. |
44 | |
45 | Although the graph is a very good way to render text collation, and is |
46 | visually very easy for a human to interpret, it doesn't have any |
47 | inherent information about which nodes 'go together' - that is, which |
48 | text readings appear in the same place as other readings. This module |
49 | therefore calculates 'positions' on the graph, thus holding some |
50 | information about which readings can and can't be substituted for |
51 | others. |
52 | |
53 | =head1 METHODS |
54 | |
55 | =over 4 |
56 | |
57 | =item B<new> |
58 | |
59 | Constructor. Takes a source collation file from which to construct |
60 | the initial graph. This file can be TEI (parallel segmentation) XML, |
61 | CSV in a format yet to be documented, GraphML as documented (someday) |
62 | by CollateX, or a Classical Text Editor apparatus. For CSV and |
63 | Classical Text Editor files, the user must also supply a base text to |
64 | which the line numbering in the collation file refers. |
65 | |
66 | =cut |
b49c4318 |
67 | |
68 | sub new { |
69 | my $proto = shift; |
70 | my $class = ref( $proto ) || $proto; |
71 | my %opts = ( 'on_color' => 'yellow', |
72 | 'off_color' => 'white', |
73 | @_ ); |
74 | my $self = {}; |
75 | |
76 | # opts can be: GraphML, base+CSV, base+CTE, TEI. We need |
77 | # something to parse. |
78 | my @formats = grep { /^(GraphML|CSV|CTE|TEI)$/ } keys( %opts ); |
79 | my $format = shift( @formats ); |
80 | unless( $format ) { |
81 | warn "No data given to create a graph: need GraphML, CSV, or TEI"; |
82 | return; |
83 | } |
84 | if( $format =~ /^(CSV|CTE)$/ && !exists $opts{'base'} ) { |
85 | warn "Cannot make a graph from $format without a base text"; |
86 | return; |
87 | } |
88 | |
89 | # Make a graph object. |
90 | my $collation_graph = Graph::Easy->new(); |
91 | $collation_graph->set_attribute( 'node', 'shape', 'ellipse' ); |
92 | # Starting point for all texts |
93 | my $last_node = $collation_graph->add_node( '#START#' ); |
94 | |
95 | $self->{'graph'} = $collation_graph; |
96 | bless( $self, $class ); |
97 | |
98 | # Now do the parsing. |
99 | my $mod = "Traditions::Parser::$format"; |
100 | load( $mod ); |
101 | my @args = ( $opts{ $format } ); |
102 | if( $format =~ /^(CSV|CTE)$/ ) { |
103 | push( @args, $opts{'base'} ); |
104 | } |
105 | $mod->can('parse')->( $self, @args ); |
106 | |
107 | return $self; |
108 | } |
109 | |
110 | |
111 | ### Graph::Easy object accessor methods |
112 | sub node { |
113 | my $self = shift; |
114 | return $self->{'graph'}->node( @_ ); |
115 | } |
116 | |
117 | sub edge { |
118 | my $self = shift; |
119 | return $self->{'graph'}->edge( @_ ); |
120 | } |
121 | |
c2d16875 |
122 | # Not only adds the node, but also initializes internal data |
123 | # about the node. |
b49c4318 |
124 | sub add_node { |
125 | my $self = shift; |
c2d16875 |
126 | my $node = $self->{'graph'}->add_node( @_ ); |
127 | $self->{'identical_nodes'}->{ $node->name() } = [ $node->name() ]; |
128 | return $node; |
b49c4318 |
129 | } |
130 | |
131 | sub add_edge { |
132 | my $self = shift; |
133 | return $self->{'graph'}->add_edge( @_ ); |
134 | } |
135 | |
136 | sub del_node { |
137 | my $self = shift; |
c2d16875 |
138 | my $node = $_[0]; |
139 | |
140 | # Delete this node out of any relevant transposition pool. |
141 | if( ref $node eq 'Graph::Easy::Node' ) { |
142 | $node = $node->name(); |
143 | } |
144 | my @ident = $self->identical_nodes( $node ); |
145 | if( @ident ) { |
146 | # Get the pool. |
147 | my $pool = $self->{'identical_nodes'}->{ $ident[0] }; |
148 | foreach my $i ( 0 .. scalar(@$pool)-1 ) { |
149 | if( $pool->[$i] eq $node ) { |
150 | splice( @$pool, $i, 1 ); |
151 | last; |
152 | } |
153 | } |
154 | } |
155 | delete $self->{'identical_nodes'}->{ $node }; |
156 | |
157 | # Now delete the node. |
b49c4318 |
158 | return $self->{'graph'}->del_node( @_ ); |
159 | } |
160 | |
161 | sub del_edge { |
162 | my $self = shift; |
163 | return $self->{'graph'}->del_edge( @_ ); |
164 | } |
165 | |
166 | sub nodes { |
167 | my $self = shift; |
168 | return $self->{'graph'}->nodes( @_ ); |
169 | } |
170 | |
171 | sub edges { |
172 | my $self = shift; |
173 | return $self->{'graph'}->edges( @_ ); |
174 | } |
175 | |
176 | sub merge_nodes { |
177 | my $self = shift; |
178 | return $self->{'graph'}->merge_nodes( @_ ); |
179 | } |
180 | |
181 | ### Helper methods for navigating the tree |
182 | |
183 | sub start { |
184 | # Return the beginning node of the graph. |
185 | my $self = shift; |
186 | my( $new_start ) = @_; |
187 | if( $new_start ) { |
c2d16875 |
188 | # Fix the node transposition data |
189 | delete $self->{'identical_nodes'}->{ $new_start->name() }; |
190 | $self->{'identical_nodes'}->{'#START#'} = [ '#START#' ]; |
b49c4318 |
191 | $self->{'graph'}->rename_node( $new_start, '#START#' ); |
192 | } |
193 | return $self->{'graph'}->node('#START#'); |
194 | } |
195 | |
c2d16875 |
196 | # Record that nodes A and B are really the same (transposed) node. |
197 | # We do this by maintaining some pools of transposed nodes, and |
198 | # we have a lookup hash so that each member of that |
199 | sub set_identical_node { |
200 | my( $self, $node, $same_node ) = @_; |
201 | my $pool = $self->{'identical_nodes'}->{ $node }; |
202 | my $same_pool = $self->{'identical_nodes'}->{ $same_node }; |
203 | my %poolhash; |
204 | foreach ( @$pool ) { |
205 | $poolhash{$_} = 1; |
206 | } |
207 | foreach( @$same_pool ) { |
208 | push( @$pool, $_ ) unless $poolhash{$_}; |
209 | } |
210 | |
211 | $self->{'identical_nodes'}->{ $same_node } = $pool; |
212 | } |
213 | |
214 | sub identical_nodes { |
215 | my( $self, $node ) = @_; |
216 | my @others = grep { $_ !~ /^$node$/ } |
217 | @{$self->{'identical_nodes'}->{ $node }}; |
218 | return @others; |
b49c4318 |
219 | } |
220 | |
221 | sub next_word { |
222 | # Return the successor via the corresponding edge. |
223 | my( $self, $node, $edge ) = @_; |
224 | $edge = "base text" unless $edge; |
225 | my @next_edges = $node->outgoing(); |
226 | return undef unless scalar( @next_edges ); |
227 | |
228 | foreach my $e ( @next_edges ) { |
229 | next unless $e->label() eq $edge; |
230 | return $e->to(); |
231 | } |
232 | |
233 | warn "Could not find node connected to edge $edge"; |
234 | return undef; |
235 | } |
236 | |
237 | sub prior_word { |
238 | # Return the predecessor via the corresponding edge. |
239 | my( $self, $node, $edge ) = @_; |
240 | $edge = "base text" unless $edge; |
241 | my @prior_edges = $node->incoming(); |
242 | return undef unless scalar( @prior_edges ); |
243 | |
244 | foreach my $e ( @prior_edges ) { |
245 | next unless $e->label() eq $edge; |
246 | return $e->from(); |
247 | } |
248 | |
249 | warn "Could not find node connected from edge $edge"; |
250 | return undef; |
251 | } |
252 | |
253 | sub node_sequence { |
254 | my( $self, $start, $end, $label ) = @_; |
255 | # TODO make label able to follow a single MS |
256 | unless( ref( $start ) eq 'Graph::Easy::Node' |
257 | && ref( $end ) eq 'Graph::Easy::Node' ) { |
258 | warn "Called node_sequence without two nodes!"; |
259 | return (); |
260 | } |
261 | $label = 'base text' unless $label; |
262 | my @nodes = ( $start ); |
263 | my %seen; |
264 | my $n = $start; |
265 | while( $n ne $end ) { |
266 | if( exists( $seen{$n->name()} ) ) { |
267 | warn "Detected loop at " . $n->name(); |
268 | last; |
269 | } |
270 | $seen{$n->name()} = 1; |
271 | |
272 | my @edges = $n->outgoing(); |
273 | my @relevant_edges = grep { $_->label =~ /^$label$/ } @edges; |
274 | warn "Did not find an edge $label from node " . $n->label |
275 | unless scalar @relevant_edges; |
276 | warn "Found more than one edge $label from node " . $n->label |
277 | unless scalar @relevant_edges == 1; |
278 | my $next = $relevant_edges[0]->to(); |
279 | push( @nodes, $next ); |
280 | $n = $next; |
281 | } |
282 | # Check that the last node is our end node. |
283 | my $last = $nodes[$#nodes]; |
284 | warn "Last node found from " . $start->label() . |
285 | " via path $label is not the end!" |
286 | unless $last eq $end; |
287 | |
288 | return @nodes; |
289 | } |
290 | |
291 | sub string_lemma { |
292 | my( $self, $start, $end, $label ) = @_; |
293 | |
294 | my @nodes = $self->node_sequence( $start, $end, $label ); |
295 | my @words = map { $_->label() } @nodes; |
296 | return join( ' ', @words ); |
297 | } |
298 | |
299 | ## Output. We use GraphViz for the layout because it handles large |
300 | ## graphs better than Graph::Easy does natively. |
301 | |
302 | sub as_svg { |
303 | my( $self, $recalc ) = @_; |
304 | return $self->{'svg'} if( exists $self->{'svg'} && !$recalc ); |
305 | |
306 | $self->{'graphviz'} = $self->{'graph'}->as_graphviz() |
307 | unless( exists $self->{'graphviz'} && !$recalc ); |
308 | |
309 | my @cmd = qw/dot -Tsvg/; |
310 | my( $svg, $err ); |
311 | my $in = $self->{'graphviz'}; |
312 | run( \@cmd, \$in, ">", binary(), \$svg ); |
313 | $self->{'svg'} = $svg; |
314 | return $svg; |
315 | } |
316 | |
a25d4374 |
317 | ## Methods for lemmatizing a text. |
b49c4318 |
318 | |
a25d4374 |
319 | sub init_lemmatizer { |
320 | my $self = shift; |
321 | # Initialize the 'lemma' hash, going through all the nodes and seeing |
58a3c424 |
322 | # which ones are common nodes. This should only be run once. |
a25d4374 |
323 | |
58a3c424 |
324 | return if( $self->{'lemmatizer_initialized'} ); |
325 | my @active_names = map { $_->name } grep { $self->is_common( $_ ) } |
326 | $self->nodes(); |
327 | $self->{'positions'}->init_lemmatizer( @active_names ); |
328 | $self->{'lemmatizer_initialized'} = 1; |
a25d4374 |
329 | |
a25d4374 |
330 | } |
331 | |
332 | sub make_positions { |
333 | my( $self, $common_nodes, $paths ) = @_; |
334 | my $positions = Traditions::Graph::Position->new( $common_nodes, $paths ); |
335 | $self->{'positions'} = $positions; |
336 | } |
337 | |
338 | # Takes a list of nodes that have just been turned off, and returns a |
339 | # set of tuples of the form ['node', 'state'] that indicates what |
340 | # changes need to be made to the graph. |
341 | # A state of 1 means 'turn on this node' |
342 | # A state of 0 means 'turn off this node' |
343 | # A state of undef means 'an ellipsis belongs in the text here because |
344 | # no decision has been made' |
b49c4318 |
345 | sub active_nodes { |
346 | my( $self, @toggled_off_nodes ) = @_; |
a25d4374 |
347 | |
348 | # In case this is the first run |
349 | $self->init_lemmatizer(); |
350 | # First get the positions of those nodes which have been |
351 | # toggled off. |
b49c4318 |
352 | my $positions_off = {}; |
a25d4374 |
353 | map { $positions_off->{ $self->{'positions'}->node_position( $_ ) } = $_ } |
354 | @toggled_off_nodes; |
355 | |
b49c4318 |
356 | |
357 | # Now for each position, we have to see if a node is on, and we |
358 | # have to see if a node has been turned off. |
359 | my @answer; |
a25d4374 |
360 | foreach my $pos ( $self->{'positions'}->all() ) { |
58a3c424 |
361 | # Find the state of this position. If there is an active node, |
362 | # its name will be the state; otherwise the state will be 0 |
363 | # (nothing at this position) or undef (ellipsis at this position) |
364 | my $active = $self->{'positions'}->state( $pos ); |
a25d4374 |
365 | |
b49c4318 |
366 | # Is there a formerly active node that was toggled off? |
367 | if( exists( $positions_off->{$pos} ) ) { |
a25d4374 |
368 | my $off_node = $positions_off->{$pos}; |
58a3c424 |
369 | if( $active && $active ne $off_node) { |
b49c4318 |
370 | push( @answer, [ $off_node, 0 ], [ $active, 1 ] ); |
b49c4318 |
371 | } else { |
58a3c424 |
372 | push( @answer, [ $off_node, $active ] ); |
b49c4318 |
373 | } |
58a3c424 |
374 | |
b49c4318 |
375 | # No formerly active node, so we just see if there is a currently |
376 | # active one. |
377 | } elsif( $active ) { |
378 | # Push the active node, whatever it is. |
379 | push( @answer, [ $active, 1 ] ); |
380 | } else { |
58a3c424 |
381 | # Push the state that is there. Arbitrarily use the first node |
382 | # at that position. |
383 | my @pos_nodes = $self->{'positions'}->nodes_at_position( $pos ); |
384 | push( @answer, |
385 | [ $pos_nodes[0], $self->{'positions'}->state( $pos ) ] ); |
b49c4318 |
386 | } |
387 | } |
58a3c424 |
388 | |
b49c4318 |
389 | return @answer; |
390 | } |
391 | |
a25d4374 |
392 | # A couple of helpers. TODO These should be gathered in the same place |
393 | # eventually |
b49c4318 |
394 | |
a25d4374 |
395 | sub is_common { |
396 | my( $self, $node ) = @_; |
397 | $node = $self->_nodeobj( $node ); |
398 | return $node->get_attribute('class') eq 'common'; |
b49c4318 |
399 | } |
400 | |
a25d4374 |
401 | sub _nodeobj { |
402 | my( $self, $node ) = @_; |
403 | unless( ref $node eq 'Graph::Easy::Node' ) { |
404 | $node = $self->node( $node ); |
b49c4318 |
405 | } |
a25d4374 |
406 | return $node; |
b49c4318 |
407 | } |
408 | |
a25d4374 |
409 | # toggle_node takes a node name, and either lemmatizes or de-lemmatizes it. |
410 | # Returns a list of nodes that are de-lemmatized as a result of the toggle. |
b49c4318 |
411 | |
b49c4318 |
412 | sub toggle_node { |
a25d4374 |
413 | my( $self, $node ) = @_; |
414 | |
415 | # In case this is being called for the first time. |
416 | $self->init_lemmatizer(); |
b49c4318 |
417 | |
a25d4374 |
418 | if( $self->is_common( $node ) ) { |
419 | # Do nothing, it's a common node. |
420 | return; |
421 | } |
58a3c424 |
422 | |
423 | my $pos = $self->{'positions'}->node_position( $node ); |
424 | my $old_state = $self->{'positions'}->state( $pos ); |
b49c4318 |
425 | my @nodes_off; |
58a3c424 |
426 | if( $old_state && $old_state eq $node ) { |
427 | # Turn off the node. We turn on no others by default. |
428 | push( @nodes_off, $node ); |
429 | } else { |
b49c4318 |
430 | # Turn on the node. |
58a3c424 |
431 | $self->{'positions'}->set_state( $pos, $node ); |
432 | # Any other 'on' nodes in the same position should be off. |
a25d4374 |
433 | push( @nodes_off, $self->colocated_nodes( $node ) ); |
58a3c424 |
434 | # Any node that is an identical transposed one should be off. |
a25d4374 |
435 | push( @nodes_off, $self->identical_nodes( $node ) ) |
436 | if $self->identical_nodes( $node ); |
b49c4318 |
437 | } |
a25d4374 |
438 | @nodes_off = unique_list( @nodes_off ); |
b49c4318 |
439 | |
440 | # Turn off the nodes that need to be turned off. |
58a3c424 |
441 | my @nodes_turned_off; |
442 | foreach my $n ( @nodes_off ) { |
443 | my $npos = $self->{'positions'}->node_position( $n ); |
444 | my $state = $self->{'positions'}->state( $npos ); |
445 | if( $state && $state eq $n ) { |
446 | # this node is still on |
447 | push( @nodes_turned_off, $n ); |
448 | my $new_state = undef; |
449 | if( $n eq $node ) { |
450 | # This is the node that was clicked, so if there are no |
451 | # other nodes there, turn off the position. In all other |
452 | # cases, restore the ellipsis. |
453 | my @all_n = $self->{'positions'}->nodes_at_position( $pos ); |
454 | $new_state = 0 if scalar( @all_n ) == 1; |
455 | } |
456 | $self->{'positions'}->set_state( $npos, $new_state ); |
457 | } elsif( $old_state && $old_state eq $n ) { |
458 | # another node has already been turned on here |
459 | push( @nodes_turned_off, $n ); |
460 | } # else some other node was on anyway, so pass. |
461 | } |
462 | return @nodes_turned_off; |
b49c4318 |
463 | } |
464 | |
b49c4318 |
465 | sub colocated_nodes { |
a25d4374 |
466 | my $self = shift; |
467 | return $self->{'positions'}->colocated_nodes( @_ ); |
b49c4318 |
468 | } |
469 | |
a25d4374 |
470 | sub text_of_node { |
471 | my( $self, $node_id ) = @_; |
472 | # This is the label of the given node. |
473 | return $self->node( $node_id )->label(); |
b49c4318 |
474 | } |
475 | |
476 | sub text_for_witness { |
477 | my( $self, $wit ) = @_; |
b49c4318 |
478 | |
a25d4374 |
479 | my @nodes = $self->{'positions'}->witness_path( $wit ); |
480 | my @words = map { $self->node( $_ )->label() } @nodes; |
b49c4318 |
481 | return join( ' ', @words ); |
482 | } |
483 | |
a25d4374 |
484 | sub unique_list { |
485 | my( @list ) = @_; |
486 | my %h; |
487 | map { $h{$_} = 1 } @list; |
488 | return keys( %h ); |
b49c4318 |
489 | } |
a25d4374 |
490 | |
b49c4318 |
491 | 1; |
a25d4374 |
492 | |