Add some documentation
[scpubgit/stemmatology.git] / lib / Text / Tradition / Graph.pm
CommitLineData
e58153d6 1package Text::Tradition::Graph;
b49c4318 2
3use strict;
4use warnings;
5use Graph::Easy;
6use IPC::Run qw( run binary );
7use Module::Load;
e58153d6 8use Text::Tradition::Graph::Position;
a25d4374 9
10=head1 NAME
11
e58153d6 12Text::Tradition::Graph
a25d4374 13
14=head1 SYNOPSIS
15
2ceca8c3 16 use Text::Tradition::Graph;
a25d4374 17
2ceca8c3 18 my $text = Text::Tradition::Graph->new( 'GraphML' => '/my/graphml/file.xml' );
19 my $text = Text::Tradition::Graph->new( 'TEI' => '/my/tei/file.xml' );
20 my $text = Text::Tradition::Graph->new( 'CSV' => '/my/csv/file.csv',
21 'base' => '/my/basefile.txt' );
22 my $text = Text::Tradition::Graph->new( 'CTE' => '/my/cte/file.txt',
23 'base' => '/my/basefile.txt' );
a25d4374 24
2ceca8c3 25 my $svg_string = $text->as_svg();
a25d4374 26
2ceca8c3 27 my $lemma_nodes = $text->active_nodes();
28 $text->toggle_node( 'some_word' );
a25d4374 29
30=head1 DESCRIPTION
31
32A text tradition is the representation of our knowledge of a text that
33has been passed down via manuscript copies from a time before printing
34presses. Each text has a number of witnesses, that is, manuscripts
35that bear a version of the text. The tradition is the aggregation of
36these witnesses, which is to say, the collation of the text.
37
38This module takes a text collation and represents it as a horizontal
39directed graph, suitable for SVG rendering and for analysis of various
40forms. Since this module was written by a medievalist, it also
41provides a facility for making a critical text reconstruction by
42choosing certain variants to be 'lemma' text - that is, text which
43should be considered the 'standard' reading.
44
45Although the graph is a very good way to render text collation, and is
46visually very easy for a human to interpret, it doesn't have any
47inherent information about which nodes 'go together' - that is, which
48text readings appear in the same place as other readings. This module
49therefore calculates 'positions' on the graph, thus holding some
50information about which readings can and can't be substituted for
51others.
52
53=head1 METHODS
54
55=over 4
56
57=item B<new>
58
59Constructor. Takes a source collation file from which to construct
60the initial graph. This file can be TEI (parallel segmentation) XML,
2ceca8c3 61CSV in a format yet to be documented, GraphML as documented by the
62CollateX tool (L<http://gregor.middell.net/collatex/>), or a Classical
63Text Editor apparatus. For CSV and Classical Text Editor files, the
64user must also supply a base text to which the line numbering in the
65collation file refers.
66
6720/04/2011 Currently only CSV and GraphML are really supported.
a25d4374 68
69=cut
b49c4318 70
71sub new {
72 my $proto = shift;
73 my $class = ref( $proto ) || $proto;
74 my %opts = ( 'on_color' => 'yellow',
75 'off_color' => 'white',
76 @_ );
77 my $self = {};
78
79 # opts can be: GraphML, base+CSV, base+CTE, TEI. We need
80 # something to parse.
81 my @formats = grep { /^(GraphML|CSV|CTE|TEI)$/ } keys( %opts );
82 my $format = shift( @formats );
83 unless( $format ) {
84 warn "No data given to create a graph: need GraphML, CSV, or TEI";
85 return;
86 }
87 if( $format =~ /^(CSV|CTE)$/ && !exists $opts{'base'} ) {
88 warn "Cannot make a graph from $format without a base text";
89 return;
90 }
91
92 # Make a graph object.
93 my $collation_graph = Graph::Easy->new();
94 $collation_graph->set_attribute( 'node', 'shape', 'ellipse' );
95 # Starting point for all texts
96 my $last_node = $collation_graph->add_node( '#START#' );
97
98 $self->{'graph'} = $collation_graph;
99 bless( $self, $class );
100
101 # Now do the parsing.
e58153d6 102 my $mod = "Text::Tradition::Parser::$format";
b49c4318 103 load( $mod );
104 my @args = ( $opts{ $format } );
105 if( $format =~ /^(CSV|CTE)$/ ) {
106 push( @args, $opts{'base'} );
107 }
108 $mod->can('parse')->( $self, @args );
109
110 return $self;
111}
112
2ceca8c3 113=item B<make_positions>
114
115$graph->make_positions( $common_nodes, $paths )
116
117Create an associated Graph::Positions object that records the position
118of each node in the graph. This method call is probably in the wrong
119place and will move.
120
121=cut
122
123sub make_positions {
124 my( $self, $common_nodes, $paths ) = @_;
125 my $positions = Text::Tradition::Graph::Position->new( $common_nodes, $paths );
126 $self->{'positions'} = $positions;
127}
128
129=back
130
131=head2 Graph::Easy object accessor methods
132
133See the Graph::Easy documentation for descriptions of these functions.
134
135=over
136
137=item B<node>
138
139=cut
b49c4318 140
b49c4318 141sub node {
142 my $self = shift;
143 return $self->{'graph'}->node( @_ );
144}
145
2ceca8c3 146=item B<edge>
147
148=cut
149
b49c4318 150sub edge {
151 my $self = shift;
152 return $self->{'graph'}->edge( @_ );
153}
154
2ceca8c3 155=item B<add_node>
156
157=cut
158
c2d16875 159# Not only adds the node, but also initializes internal data
160# about the node.
2ceca8c3 161
b49c4318 162sub add_node {
163 my $self = shift;
c2d16875 164 my $node = $self->{'graph'}->add_node( @_ );
165 $self->{'identical_nodes'}->{ $node->name() } = [ $node->name() ];
166 return $node;
b49c4318 167}
168
2ceca8c3 169=item B<add_edge>
170
171=cut
172
b49c4318 173sub add_edge {
174 my $self = shift;
175 return $self->{'graph'}->add_edge( @_ );
176}
177
2ceca8c3 178=item B<del_node>
179
180=cut
181
b49c4318 182sub del_node {
183 my $self = shift;
c2d16875 184 my $node = $_[0];
185
186 # Delete this node out of any relevant transposition pool.
187 if( ref $node eq 'Graph::Easy::Node' ) {
188 $node = $node->name();
189 }
190 my @ident = $self->identical_nodes( $node );
191 if( @ident ) {
192 # Get the pool.
193 my $pool = $self->{'identical_nodes'}->{ $ident[0] };
194 foreach my $i ( 0 .. scalar(@$pool)-1 ) {
195 if( $pool->[$i] eq $node ) {
196 splice( @$pool, $i, 1 );
197 last;
198 }
199 }
200 }
201 delete $self->{'identical_nodes'}->{ $node };
202
203 # Now delete the node.
b49c4318 204 return $self->{'graph'}->del_node( @_ );
205}
206
2ceca8c3 207=item B<del_edge>
208
209=cut
210
b49c4318 211sub del_edge {
212 my $self = shift;
213 return $self->{'graph'}->del_edge( @_ );
214}
215
2ceca8c3 216=item B<nodes>
217
218=cut
219
b49c4318 220sub nodes {
221 my $self = shift;
222 return $self->{'graph'}->nodes( @_ );
223}
224
2ceca8c3 225=item B<edges>
226
227=cut
228
b49c4318 229sub edges {
230 my $self = shift;
231 return $self->{'graph'}->edges( @_ );
232}
233
2ceca8c3 234=item B<merge_nodes>
235
236=cut
237
b49c4318 238sub merge_nodes {
239 my $self = shift;
240 return $self->{'graph'}->merge_nodes( @_ );
241}
242
243### Helper methods for navigating the tree
244
2ceca8c3 245=back
246
247=head2 Graph navigation methods
248
249=over
250
251=item B<start>
252
253my $node = $graph->start();
254
255Returns the beginning node of the graph.
256
257=cut
258
b49c4318 259sub start {
260 # Return the beginning node of the graph.
261 my $self = shift;
262 my( $new_start ) = @_;
263 if( $new_start ) {
c2d16875 264 # Fix the node transposition data
265 delete $self->{'identical_nodes'}->{ $new_start->name() };
266 $self->{'identical_nodes'}->{'#START#'} = [ '#START#' ];
b49c4318 267 $self->{'graph'}->rename_node( $new_start, '#START#' );
268 }
269 return $self->{'graph'}->node('#START#');
270}
271
2ceca8c3 272=item B<next_word>
c2d16875 273
2ceca8c3 274my $next_node = $graph->next_word( $node, $path );
c2d16875 275
2ceca8c3 276Returns the node that follows the given node along the given witness
277path. TODO These are badly named.
278
279=cut
b49c4318 280
281sub next_word {
282 # Return the successor via the corresponding edge.
283 my( $self, $node, $edge ) = @_;
284 $edge = "base text" unless $edge;
285 my @next_edges = $node->outgoing();
286 return undef unless scalar( @next_edges );
287
288 foreach my $e ( @next_edges ) {
289 next unless $e->label() eq $edge;
290 return $e->to();
291 }
292
293 warn "Could not find node connected to edge $edge";
294 return undef;
295}
296
2ceca8c3 297=item B<prior_word>
298
299my $prior_node = $graph->prior_word( $node, $path );
300
301Returns the node that precedes the given node along the given witness
302path. TODO These are badly named.
303
304=cut
305
b49c4318 306sub prior_word {
307 # Return the predecessor via the corresponding edge.
308 my( $self, $node, $edge ) = @_;
309 $edge = "base text" unless $edge;
310 my @prior_edges = $node->incoming();
311 return undef unless scalar( @prior_edges );
312
313 foreach my $e ( @prior_edges ) {
314 next unless $e->label() eq $edge;
315 return $e->from();
316 }
317
318 warn "Could not find node connected from edge $edge";
319 return undef;
320}
321
2ceca8c3 322=item B<node_sequence>
323
324my @nodes = $graph->node_sequence( $first, $last, $path );
325
326Returns the ordered list of nodes, starting with $first and ending
327with $last, along the given witness path.
328
329=cut
330
b49c4318 331sub node_sequence {
332 my( $self, $start, $end, $label ) = @_;
333 # TODO make label able to follow a single MS
334 unless( ref( $start ) eq 'Graph::Easy::Node'
335 && ref( $end ) eq 'Graph::Easy::Node' ) {
336 warn "Called node_sequence without two nodes!";
337 return ();
338 }
339 $label = 'base text' unless $label;
340 my @nodes = ( $start );
341 my %seen;
342 my $n = $start;
343 while( $n ne $end ) {
344 if( exists( $seen{$n->name()} ) ) {
345 warn "Detected loop at " . $n->name();
346 last;
347 }
348 $seen{$n->name()} = 1;
349
350 my @edges = $n->outgoing();
351 my @relevant_edges = grep { $_->label =~ /^$label$/ } @edges;
352 warn "Did not find an edge $label from node " . $n->label
353 unless scalar @relevant_edges;
354 warn "Found more than one edge $label from node " . $n->label
355 unless scalar @relevant_edges == 1;
356 my $next = $relevant_edges[0]->to();
357 push( @nodes, $next );
358 $n = $next;
359 }
360 # Check that the last node is our end node.
361 my $last = $nodes[$#nodes];
362 warn "Last node found from " . $start->label() .
363 " via path $label is not the end!"
364 unless $last eq $end;
365
366 return @nodes;
367}
368
2ceca8c3 369=item B<string_lemma>
370
371my $text = $graph->string_lemma( $first, $last, $path );
372
373Returns the whitespace-separated text, starting with $first and ending
374with $last, represented in the graph along the given path.
375
376=cut
377
b49c4318 378sub string_lemma {
379 my( $self, $start, $end, $label ) = @_;
380
381 my @nodes = $self->node_sequence( $start, $end, $label );
382 my @words = map { $_->label() } @nodes;
383 return join( ' ', @words );
384}
385
2ceca8c3 386=back
387
388=head2 Transposition handling methods
389
390These should really move to their own module. For use when the graph
391has split transposed nodes in order to avoid edges that travel
392backward.
393
394=over
395
396=item B<set_identical_node>
397
398$graph->set_identical_node( $node, $other_node )
399
400Tell the graph that these two nodes contain the same (transposed) reading.
401
402=cut
403
404sub set_identical_node {
405 my( $self, $node, $same_node ) = @_;
406 my $pool = $self->{'identical_nodes'}->{ $node };
407 my $same_pool = $self->{'identical_nodes'}->{ $same_node };
408 my %poolhash;
409 foreach ( @$pool ) {
410 $poolhash{$_} = 1;
411 }
412 foreach( @$same_pool ) {
413 push( @$pool, $_ ) unless $poolhash{$_};
414 }
415
416 $self->{'identical_nodes'}->{ $same_node } = $pool;
417}
418
419=item B<set_identical_node>
420
421my @nodes = $graph->identical_nodes( $node )
422
423Get a list of nodes that contain the same (transposed) reading as the
424given node.
425
426=cut
427
428sub identical_nodes {
429 my( $self, $node ) = @_;
430 my @others = grep { $_ !~ /^$node$/ }
431 @{$self->{'identical_nodes'}->{ $node }};
432 return @others;
433}
434
435=back
436
437=head2 Output method(s)
438
439=over
440
441=item B<as_svg>
442
443print $graph->as_svg( $recalculate );
444
445Returns an SVG string that represents the graph. Uses GraphViz to do
446this, because Graph::Easy doesn't cope well with long graphs. Unless
447$recalculate is passed (and is a true value), the method will return a
448cached copy of the SVG after the first call to the method.
449
450=cut
b49c4318 451
452sub as_svg {
453 my( $self, $recalc ) = @_;
454 return $self->{'svg'} if( exists $self->{'svg'} && !$recalc );
455
456 $self->{'graphviz'} = $self->{'graph'}->as_graphviz()
457 unless( exists $self->{'graphviz'} && !$recalc );
458
459 my @cmd = qw/dot -Tsvg/;
460 my( $svg, $err );
461 my $in = $self->{'graphviz'};
462 run( \@cmd, \$in, ">", binary(), \$svg );
463 $self->{'svg'} = $svg;
464 return $svg;
465}
466
2ceca8c3 467=back
468
469=head2 Lemmatization methods
470
471=over
472
473=item B<init_lemmatizer>
474
475=cut
b49c4318 476
a25d4374 477sub init_lemmatizer {
478 my $self = shift;
479 # Initialize the 'lemma' hash, going through all the nodes and seeing
58a3c424 480 # which ones are common nodes. This should only be run once.
a25d4374 481
58a3c424 482 return if( $self->{'lemmatizer_initialized'} );
483 my @active_names = map { $_->name } grep { $self->is_common( $_ ) }
484 $self->nodes();
485 $self->{'positions'}->init_lemmatizer( @active_names );
486 $self->{'lemmatizer_initialized'} = 1;
a25d4374 487
a25d4374 488}
489
a25d4374 490# Takes a list of nodes that have just been turned off, and returns a
491# set of tuples of the form ['node', 'state'] that indicates what
492# changes need to be made to the graph.
493# A state of 1 means 'turn on this node'
494# A state of 0 means 'turn off this node'
495# A state of undef means 'an ellipsis belongs in the text here because
496# no decision has been made'
b49c4318 497sub active_nodes {
498 my( $self, @toggled_off_nodes ) = @_;
a25d4374 499
500 # In case this is the first run
501 $self->init_lemmatizer();
502 # First get the positions of those nodes which have been
503 # toggled off.
b49c4318 504 my $positions_off = {};
a25d4374 505 map { $positions_off->{ $self->{'positions'}->node_position( $_ ) } = $_ }
506 @toggled_off_nodes;
507
b49c4318 508
509 # Now for each position, we have to see if a node is on, and we
510 # have to see if a node has been turned off.
511 my @answer;
a25d4374 512 foreach my $pos ( $self->{'positions'}->all() ) {
58a3c424 513 # Find the state of this position. If there is an active node,
514 # its name will be the state; otherwise the state will be 0
515 # (nothing at this position) or undef (ellipsis at this position)
516 my $active = $self->{'positions'}->state( $pos );
a25d4374 517
b49c4318 518 # Is there a formerly active node that was toggled off?
519 if( exists( $positions_off->{$pos} ) ) {
a25d4374 520 my $off_node = $positions_off->{$pos};
58a3c424 521 if( $active && $active ne $off_node) {
b49c4318 522 push( @answer, [ $off_node, 0 ], [ $active, 1 ] );
b49c4318 523 } else {
58a3c424 524 push( @answer, [ $off_node, $active ] );
b49c4318 525 }
58a3c424 526
b49c4318 527 # No formerly active node, so we just see if there is a currently
528 # active one.
529 } elsif( $active ) {
530 # Push the active node, whatever it is.
531 push( @answer, [ $active, 1 ] );
532 } else {
58a3c424 533 # Push the state that is there. Arbitrarily use the first node
534 # at that position.
535 my @pos_nodes = $self->{'positions'}->nodes_at_position( $pos );
536 push( @answer,
537 [ $pos_nodes[0], $self->{'positions'}->state( $pos ) ] );
b49c4318 538 }
539 }
58a3c424 540
b49c4318 541 return @answer;
542}
543
a25d4374 544# A couple of helpers. TODO These should be gathered in the same place
545# eventually
b49c4318 546
a25d4374 547sub is_common {
548 my( $self, $node ) = @_;
549 $node = $self->_nodeobj( $node );
550 return $node->get_attribute('class') eq 'common';
b49c4318 551}
552
a25d4374 553sub _nodeobj {
554 my( $self, $node ) = @_;
555 unless( ref $node eq 'Graph::Easy::Node' ) {
556 $node = $self->node( $node );
b49c4318 557 }
a25d4374 558 return $node;
b49c4318 559}
560
a25d4374 561# toggle_node takes a node name, and either lemmatizes or de-lemmatizes it.
562# Returns a list of nodes that are de-lemmatized as a result of the toggle.
b49c4318 563
b49c4318 564sub toggle_node {
a25d4374 565 my( $self, $node ) = @_;
566
567 # In case this is being called for the first time.
568 $self->init_lemmatizer();
b49c4318 569
a25d4374 570 if( $self->is_common( $node ) ) {
571 # Do nothing, it's a common node.
572 return;
573 }
58a3c424 574
575 my $pos = $self->{'positions'}->node_position( $node );
576 my $old_state = $self->{'positions'}->state( $pos );
b49c4318 577 my @nodes_off;
58a3c424 578 if( $old_state && $old_state eq $node ) {
579 # Turn off the node. We turn on no others by default.
580 push( @nodes_off, $node );
581 } else {
b49c4318 582 # Turn on the node.
58a3c424 583 $self->{'positions'}->set_state( $pos, $node );
584 # Any other 'on' nodes in the same position should be off.
a25d4374 585 push( @nodes_off, $self->colocated_nodes( $node ) );
58a3c424 586 # Any node that is an identical transposed one should be off.
a25d4374 587 push( @nodes_off, $self->identical_nodes( $node ) )
588 if $self->identical_nodes( $node );
b49c4318 589 }
a25d4374 590 @nodes_off = unique_list( @nodes_off );
b49c4318 591
592 # Turn off the nodes that need to be turned off.
58a3c424 593 my @nodes_turned_off;
594 foreach my $n ( @nodes_off ) {
595 my $npos = $self->{'positions'}->node_position( $n );
596 my $state = $self->{'positions'}->state( $npos );
597 if( $state && $state eq $n ) {
598 # this node is still on
599 push( @nodes_turned_off, $n );
600 my $new_state = undef;
601 if( $n eq $node ) {
602 # This is the node that was clicked, so if there are no
603 # other nodes there, turn off the position. In all other
604 # cases, restore the ellipsis.
605 my @all_n = $self->{'positions'}->nodes_at_position( $pos );
606 $new_state = 0 if scalar( @all_n ) == 1;
607 }
608 $self->{'positions'}->set_state( $npos, $new_state );
609 } elsif( $old_state && $old_state eq $n ) {
610 # another node has already been turned on here
611 push( @nodes_turned_off, $n );
612 } # else some other node was on anyway, so pass.
613 }
614 return @nodes_turned_off;
b49c4318 615}
616
b49c4318 617sub colocated_nodes {
a25d4374 618 my $self = shift;
619 return $self->{'positions'}->colocated_nodes( @_ );
b49c4318 620}
621
a25d4374 622sub text_of_node {
623 my( $self, $node_id ) = @_;
624 # This is the label of the given node.
625 return $self->node( $node_id )->label();
b49c4318 626}
627
628sub text_for_witness {
629 my( $self, $wit ) = @_;
b49c4318 630
a25d4374 631 my @nodes = $self->{'positions'}->witness_path( $wit );
632 my @words = map { $self->node( $_ )->label() } @nodes;
b49c4318 633 return join( ' ', @words );
634}
635
a25d4374 636sub unique_list {
637 my( @list ) = @_;
638 my %h;
639 map { $h{$_} = 1 } @list;
640 return keys( %h );
b49c4318 641}
a25d4374 642
2ceca8c3 643=back
644
645=head1 LICENSE
646
647This package is free software and is provided "as is" without express
648or implied warranty. You can redistribute it and/or modify it under
649the same terms as Perl itself.
650
651=head1 AUTHOR
652
653Tara L Andrews, aurum@cpan.org
654
655=cut
656
b49c4318 6571;
a25d4374 658