Move some active node logic into the positions library
[scpubgit/stemmatology.git] / lib / Traditions / Graph.pm
CommitLineData
b49c4318 1package Traditions::Graph;
2
3use strict;
4use warnings;
5use Graph::Easy;
6use IPC::Run qw( run binary );
7use Module::Load;
a25d4374 8use Traditions::Graph::Position;
9
10=head1 NAME
11
12(Text?)::Traditions::Graph
13
14=head1 SYNOPSIS
15
16use Traditions::Graph;
17
18my $text = Traditions::Graph->new( 'GraphML' => '/my/graphml/file.xml' );
19my $text = Traditions::Graph->new( 'TEI' => '/my/tei/file.xml' );
20my $text = Traditions::Graph->new( 'CSV' => '/my/csv/file.csv',
21 'base' => '/my/basefile.txt' );
22my $text = Traditions::Graph->new( 'CTE' => '/my/cte/file.txt',
23 'base' => '/my/basefile.txt' );
24
25my $svg_string = $text->as_svg();
26
27my $lemma_nodes = $text->active_nodes();
28$text->toggle_node( 'some_word' );
29
30=head1 DESCRIPTION
31
32A text tradition is the representation of our knowledge of a text that
33has been passed down via manuscript copies from a time before printing
34presses. Each text has a number of witnesses, that is, manuscripts
35that bear a version of the text. The tradition is the aggregation of
36these witnesses, which is to say, the collation of the text.
37
38This module takes a text collation and represents it as a horizontal
39directed graph, suitable for SVG rendering and for analysis of various
40forms. Since this module was written by a medievalist, it also
41provides a facility for making a critical text reconstruction by
42choosing certain variants to be 'lemma' text - that is, text which
43should be considered the 'standard' reading.
44
45Although the graph is a very good way to render text collation, and is
46visually very easy for a human to interpret, it doesn't have any
47inherent information about which nodes 'go together' - that is, which
48text readings appear in the same place as other readings. This module
49therefore calculates 'positions' on the graph, thus holding some
50information about which readings can and can't be substituted for
51others.
52
53=head1 METHODS
54
55=over 4
56
57=item B<new>
58
59Constructor. Takes a source collation file from which to construct
60the initial graph. This file can be TEI (parallel segmentation) XML,
61CSV in a format yet to be documented, GraphML as documented (someday)
62by CollateX, or a Classical Text Editor apparatus. For CSV and
63Classical Text Editor files, the user must also supply a base text to
64which the line numbering in the collation file refers.
65
66=cut
b49c4318 67
68sub new {
69 my $proto = shift;
70 my $class = ref( $proto ) || $proto;
71 my %opts = ( 'on_color' => 'yellow',
72 'off_color' => 'white',
73 @_ );
74 my $self = {};
75
76 # opts can be: GraphML, base+CSV, base+CTE, TEI. We need
77 # something to parse.
78 my @formats = grep { /^(GraphML|CSV|CTE|TEI)$/ } keys( %opts );
79 my $format = shift( @formats );
80 unless( $format ) {
81 warn "No data given to create a graph: need GraphML, CSV, or TEI";
82 return;
83 }
84 if( $format =~ /^(CSV|CTE)$/ && !exists $opts{'base'} ) {
85 warn "Cannot make a graph from $format without a base text";
86 return;
87 }
88
89 # Make a graph object.
90 my $collation_graph = Graph::Easy->new();
91 $collation_graph->set_attribute( 'node', 'shape', 'ellipse' );
92 # Starting point for all texts
93 my $last_node = $collation_graph->add_node( '#START#' );
94
95 $self->{'graph'} = $collation_graph;
96 bless( $self, $class );
97
98 # Now do the parsing.
99 my $mod = "Traditions::Parser::$format";
100 load( $mod );
101 my @args = ( $opts{ $format } );
102 if( $format =~ /^(CSV|CTE)$/ ) {
103 push( @args, $opts{'base'} );
104 }
105 $mod->can('parse')->( $self, @args );
106
107 return $self;
108}
109
110
111### Graph::Easy object accessor methods
112sub node {
113 my $self = shift;
114 return $self->{'graph'}->node( @_ );
115}
116
117sub edge {
118 my $self = shift;
119 return $self->{'graph'}->edge( @_ );
120}
121
122sub add_node {
123 my $self = shift;
124 return $self->{'graph'}->add_node( @_ );
125}
126
127sub add_edge {
128 my $self = shift;
129 return $self->{'graph'}->add_edge( @_ );
130}
131
132sub del_node {
133 my $self = shift;
134 return $self->{'graph'}->del_node( @_ );
135}
136
137sub del_edge {
138 my $self = shift;
139 return $self->{'graph'}->del_edge( @_ );
140}
141
142sub nodes {
143 my $self = shift;
144 return $self->{'graph'}->nodes( @_ );
145}
146
147sub edges {
148 my $self = shift;
149 return $self->{'graph'}->edges( @_ );
150}
151
152sub merge_nodes {
153 my $self = shift;
154 return $self->{'graph'}->merge_nodes( @_ );
155}
156
157### Helper methods for navigating the tree
158
159sub start {
160 # Return the beginning node of the graph.
161 my $self = shift;
162 my( $new_start ) = @_;
163 if( $new_start ) {
164 $self->{'graph'}->rename_node( $new_start, '#START#' );
165 }
166 return $self->{'graph'}->node('#START#');
167}
168
a25d4374 169sub set_identical_nodes {
170 my( $self, $node_hash ) = @_;
171 $self->{'identical_nodes'} = $node_hash;
b49c4318 172}
173
174sub next_word {
175 # Return the successor via the corresponding edge.
176 my( $self, $node, $edge ) = @_;
177 $edge = "base text" unless $edge;
178 my @next_edges = $node->outgoing();
179 return undef unless scalar( @next_edges );
180
181 foreach my $e ( @next_edges ) {
182 next unless $e->label() eq $edge;
183 return $e->to();
184 }
185
186 warn "Could not find node connected to edge $edge";
187 return undef;
188}
189
190sub prior_word {
191 # Return the predecessor via the corresponding edge.
192 my( $self, $node, $edge ) = @_;
193 $edge = "base text" unless $edge;
194 my @prior_edges = $node->incoming();
195 return undef unless scalar( @prior_edges );
196
197 foreach my $e ( @prior_edges ) {
198 next unless $e->label() eq $edge;
199 return $e->from();
200 }
201
202 warn "Could not find node connected from edge $edge";
203 return undef;
204}
205
206sub node_sequence {
207 my( $self, $start, $end, $label ) = @_;
208 # TODO make label able to follow a single MS
209 unless( ref( $start ) eq 'Graph::Easy::Node'
210 && ref( $end ) eq 'Graph::Easy::Node' ) {
211 warn "Called node_sequence without two nodes!";
212 return ();
213 }
214 $label = 'base text' unless $label;
215 my @nodes = ( $start );
216 my %seen;
217 my $n = $start;
218 while( $n ne $end ) {
219 if( exists( $seen{$n->name()} ) ) {
220 warn "Detected loop at " . $n->name();
221 last;
222 }
223 $seen{$n->name()} = 1;
224
225 my @edges = $n->outgoing();
226 my @relevant_edges = grep { $_->label =~ /^$label$/ } @edges;
227 warn "Did not find an edge $label from node " . $n->label
228 unless scalar @relevant_edges;
229 warn "Found more than one edge $label from node " . $n->label
230 unless scalar @relevant_edges == 1;
231 my $next = $relevant_edges[0]->to();
232 push( @nodes, $next );
233 $n = $next;
234 }
235 # Check that the last node is our end node.
236 my $last = $nodes[$#nodes];
237 warn "Last node found from " . $start->label() .
238 " via path $label is not the end!"
239 unless $last eq $end;
240
241 return @nodes;
242}
243
244sub string_lemma {
245 my( $self, $start, $end, $label ) = @_;
246
247 my @nodes = $self->node_sequence( $start, $end, $label );
248 my @words = map { $_->label() } @nodes;
249 return join( ' ', @words );
250}
251
252## Output. We use GraphViz for the layout because it handles large
253## graphs better than Graph::Easy does natively.
254
255sub as_svg {
256 my( $self, $recalc ) = @_;
257 return $self->{'svg'} if( exists $self->{'svg'} && !$recalc );
258
259 $self->{'graphviz'} = $self->{'graph'}->as_graphviz()
260 unless( exists $self->{'graphviz'} && !$recalc );
261
262 my @cmd = qw/dot -Tsvg/;
263 my( $svg, $err );
264 my $in = $self->{'graphviz'};
265 run( \@cmd, \$in, ">", binary(), \$svg );
266 $self->{'svg'} = $svg;
267 return $svg;
268}
269
a25d4374 270## Methods for lemmatizing a text.
b49c4318 271
a25d4374 272sub init_lemmatizer {
273 my $self = shift;
274 # Initialize the 'lemma' hash, going through all the nodes and seeing
58a3c424 275 # which ones are common nodes. This should only be run once.
a25d4374 276
58a3c424 277 return if( $self->{'lemmatizer_initialized'} );
278 my @active_names = map { $_->name } grep { $self->is_common( $_ ) }
279 $self->nodes();
280 $self->{'positions'}->init_lemmatizer( @active_names );
281 $self->{'lemmatizer_initialized'} = 1;
a25d4374 282
a25d4374 283}
284
285sub make_positions {
286 my( $self, $common_nodes, $paths ) = @_;
287 my $positions = Traditions::Graph::Position->new( $common_nodes, $paths );
288 $self->{'positions'} = $positions;
289}
290
291# Takes a list of nodes that have just been turned off, and returns a
292# set of tuples of the form ['node', 'state'] that indicates what
293# changes need to be made to the graph.
294# A state of 1 means 'turn on this node'
295# A state of 0 means 'turn off this node'
296# A state of undef means 'an ellipsis belongs in the text here because
297# no decision has been made'
b49c4318 298sub active_nodes {
299 my( $self, @toggled_off_nodes ) = @_;
a25d4374 300
301 # In case this is the first run
302 $self->init_lemmatizer();
303 # First get the positions of those nodes which have been
304 # toggled off.
b49c4318 305 my $positions_off = {};
a25d4374 306 map { $positions_off->{ $self->{'positions'}->node_position( $_ ) } = $_ }
307 @toggled_off_nodes;
308
b49c4318 309
310 # Now for each position, we have to see if a node is on, and we
311 # have to see if a node has been turned off.
312 my @answer;
a25d4374 313 foreach my $pos ( $self->{'positions'}->all() ) {
58a3c424 314 # Find the state of this position. If there is an active node,
315 # its name will be the state; otherwise the state will be 0
316 # (nothing at this position) or undef (ellipsis at this position)
317 my $active = $self->{'positions'}->state( $pos );
a25d4374 318
b49c4318 319 # Is there a formerly active node that was toggled off?
320 if( exists( $positions_off->{$pos} ) ) {
a25d4374 321 my $off_node = $positions_off->{$pos};
58a3c424 322 if( $active && $active ne $off_node) {
b49c4318 323 push( @answer, [ $off_node, 0 ], [ $active, 1 ] );
b49c4318 324 } else {
58a3c424 325 push( @answer, [ $off_node, $active ] );
b49c4318 326 }
58a3c424 327
b49c4318 328 # No formerly active node, so we just see if there is a currently
329 # active one.
330 } elsif( $active ) {
331 # Push the active node, whatever it is.
332 push( @answer, [ $active, 1 ] );
333 } else {
58a3c424 334 # Push the state that is there. Arbitrarily use the first node
335 # at that position.
336 my @pos_nodes = $self->{'positions'}->nodes_at_position( $pos );
337 push( @answer,
338 [ $pos_nodes[0], $self->{'positions'}->state( $pos ) ] );
b49c4318 339 }
340 }
58a3c424 341
b49c4318 342 return @answer;
343}
344
a25d4374 345# A couple of helpers. TODO These should be gathered in the same place
346# eventually
b49c4318 347
a25d4374 348sub is_common {
349 my( $self, $node ) = @_;
350 $node = $self->_nodeobj( $node );
351 return $node->get_attribute('class') eq 'common';
b49c4318 352}
353
a25d4374 354sub _nodeobj {
355 my( $self, $node ) = @_;
356 unless( ref $node eq 'Graph::Easy::Node' ) {
357 $node = $self->node( $node );
b49c4318 358 }
a25d4374 359 return $node;
b49c4318 360}
361
a25d4374 362# toggle_node takes a node name, and either lemmatizes or de-lemmatizes it.
363# Returns a list of nodes that are de-lemmatized as a result of the toggle.
b49c4318 364
b49c4318 365sub toggle_node {
a25d4374 366 my( $self, $node ) = @_;
367
368 # In case this is being called for the first time.
369 $self->init_lemmatizer();
b49c4318 370
a25d4374 371 if( $self->is_common( $node ) ) {
372 # Do nothing, it's a common node.
373 return;
374 }
58a3c424 375
376 my $pos = $self->{'positions'}->node_position( $node );
377 my $old_state = $self->{'positions'}->state( $pos );
b49c4318 378 my @nodes_off;
58a3c424 379 if( $old_state && $old_state eq $node ) {
380 # Turn off the node. We turn on no others by default.
381 push( @nodes_off, $node );
382 } else {
b49c4318 383 # Turn on the node.
58a3c424 384 $self->{'positions'}->set_state( $pos, $node );
385 # Any other 'on' nodes in the same position should be off.
a25d4374 386 push( @nodes_off, $self->colocated_nodes( $node ) );
58a3c424 387 # Any node that is an identical transposed one should be off.
a25d4374 388 push( @nodes_off, $self->identical_nodes( $node ) )
389 if $self->identical_nodes( $node );
b49c4318 390 }
a25d4374 391 @nodes_off = unique_list( @nodes_off );
b49c4318 392
393 # Turn off the nodes that need to be turned off.
58a3c424 394 my @nodes_turned_off;
395 foreach my $n ( @nodes_off ) {
396 my $npos = $self->{'positions'}->node_position( $n );
397 my $state = $self->{'positions'}->state( $npos );
398 if( $state && $state eq $n ) {
399 # this node is still on
400 push( @nodes_turned_off, $n );
401 my $new_state = undef;
402 if( $n eq $node ) {
403 # This is the node that was clicked, so if there are no
404 # other nodes there, turn off the position. In all other
405 # cases, restore the ellipsis.
406 my @all_n = $self->{'positions'}->nodes_at_position( $pos );
407 $new_state = 0 if scalar( @all_n ) == 1;
408 }
409 $self->{'positions'}->set_state( $npos, $new_state );
410 } elsif( $old_state && $old_state eq $n ) {
411 # another node has already been turned on here
412 push( @nodes_turned_off, $n );
413 } # else some other node was on anyway, so pass.
414 }
415 return @nodes_turned_off;
b49c4318 416}
417
b49c4318 418sub colocated_nodes {
a25d4374 419 my $self = shift;
420 return $self->{'positions'}->colocated_nodes( @_ );
b49c4318 421}
422
423sub identical_nodes {
424 my( $self, $node ) = @_;
a25d4374 425 return undef unless exists $self->{'identical_nodes'} &&
426 exists $self->{'identical_nodes'}->{$node};
427 return $self->{'identical_nodes'}->{$node};
428}
429
430sub text_of_node {
431 my( $self, $node_id ) = @_;
432 # This is the label of the given node.
433 return $self->node( $node_id )->label();
b49c4318 434}
435
436sub text_for_witness {
437 my( $self, $wit ) = @_;
b49c4318 438
a25d4374 439 my @nodes = $self->{'positions'}->witness_path( $wit );
440 my @words = map { $self->node( $_ )->label() } @nodes;
b49c4318 441 return join( ' ', @words );
442}
443
a25d4374 444sub unique_list {
445 my( @list ) = @_;
446 my %h;
447 map { $h{$_} = 1 } @list;
448 return keys( %h );
b49c4318 449}
a25d4374 450
b49c4318 4511;
a25d4374 452