made CSV parser standalone, lots of changes to Base, etc.
[scpubgit/stemmatology.git] / lib / Text / Tradition / Graph.pm
CommitLineData
e58153d6 1package Text::Tradition::Graph;
b49c4318 2
3use strict;
4use warnings;
5use Graph::Easy;
6use IPC::Run qw( run binary );
7use Module::Load;
e58153d6 8use Text::Tradition::Graph::Position;
a25d4374 9
10=head1 NAME
11
e58153d6 12Text::Tradition::Graph
a25d4374 13
14=head1 SYNOPSIS
15
2ceca8c3 16 use Text::Tradition::Graph;
a25d4374 17
2ceca8c3 18 my $text = Text::Tradition::Graph->new( 'GraphML' => '/my/graphml/file.xml' );
19 my $text = Text::Tradition::Graph->new( 'TEI' => '/my/tei/file.xml' );
20 my $text = Text::Tradition::Graph->new( 'CSV' => '/my/csv/file.csv',
21 'base' => '/my/basefile.txt' );
22 my $text = Text::Tradition::Graph->new( 'CTE' => '/my/cte/file.txt',
23 'base' => '/my/basefile.txt' );
a25d4374 24
2ceca8c3 25 my $svg_string = $text->as_svg();
a25d4374 26
2ceca8c3 27 my $lemma_nodes = $text->active_nodes();
28 $text->toggle_node( 'some_word' );
a25d4374 29
30=head1 DESCRIPTION
31
32A text tradition is the representation of our knowledge of a text that
33has been passed down via manuscript copies from a time before printing
34presses. Each text has a number of witnesses, that is, manuscripts
35that bear a version of the text. The tradition is the aggregation of
36these witnesses, which is to say, the collation of the text.
37
38This module takes a text collation and represents it as a horizontal
39directed graph, suitable for SVG rendering and for analysis of various
40forms. Since this module was written by a medievalist, it also
41provides a facility for making a critical text reconstruction by
42choosing certain variants to be 'lemma' text - that is, text which
43should be considered the 'standard' reading.
44
45Although the graph is a very good way to render text collation, and is
46visually very easy for a human to interpret, it doesn't have any
47inherent information about which nodes 'go together' - that is, which
48text readings appear in the same place as other readings. This module
49therefore calculates 'positions' on the graph, thus holding some
50information about which readings can and can't be substituted for
51others.
52
53=head1 METHODS
54
55=over 4
56
57=item B<new>
58
59Constructor. Takes a source collation file from which to construct
60the initial graph. This file can be TEI (parallel segmentation) XML,
2ceca8c3 61CSV in a format yet to be documented, GraphML as documented by the
62CollateX tool (L<http://gregor.middell.net/collatex/>), or a Classical
63Text Editor apparatus. For CSV and Classical Text Editor files, the
64user must also supply a base text to which the line numbering in the
65collation file refers.
66
6720/04/2011 Currently only CSV and GraphML are really supported.
a25d4374 68
69=cut
b49c4318 70
71sub new {
72 my $proto = shift;
73 my $class = ref( $proto ) || $proto;
74 my %opts = ( 'on_color' => 'yellow',
75 'off_color' => 'white',
76 @_ );
77 my $self = {};
78
79 # opts can be: GraphML, base+CSV, base+CTE, TEI. We need
80 # something to parse.
81 my @formats = grep { /^(GraphML|CSV|CTE|TEI)$/ } keys( %opts );
82 my $format = shift( @formats );
83 unless( $format ) {
52ce987f 84 warn "No data given to create a graph; will initialize an empty one";
b49c4318 85 }
86 if( $format =~ /^(CSV|CTE)$/ && !exists $opts{'base'} ) {
87 warn "Cannot make a graph from $format without a base text";
88 return;
89 }
90
91 # Make a graph object.
92 my $collation_graph = Graph::Easy->new();
93 $collation_graph->set_attribute( 'node', 'shape', 'ellipse' );
94 # Starting point for all texts
95 my $last_node = $collation_graph->add_node( '#START#' );
96
97 $self->{'graph'} = $collation_graph;
98 bless( $self, $class );
99
100 # Now do the parsing.
52ce987f 101 if( $format ) {
102 my @args;
103 if( $format =~ /^(CSV|CTE)$/ ) {
104 @args = ( 'base' => $opts{'base'},
105 'data' => $opts{$format},
106 'format' => $format );
107 $format = 'BaseText';
108 } else {
109 @args = ( $opts{ $format } );
110 }
111 my $mod = "Text::Tradition::Parser::$format";
112 load( $mod );
113 $mod->can('parse')->( $self, @args );
b49c4318 114 }
b49c4318 115 return $self;
116}
117
2ceca8c3 118=item B<make_positions>
119
120$graph->make_positions( $common_nodes, $paths )
121
122Create an associated Graph::Positions object that records the position
123of each node in the graph. This method call is probably in the wrong
124place and will move.
125
126=cut
127
128sub make_positions {
129 my( $self, $common_nodes, $paths ) = @_;
130 my $positions = Text::Tradition::Graph::Position->new( $common_nodes, $paths );
131 $self->{'positions'} = $positions;
132}
133
134=back
135
136=head2 Graph::Easy object accessor methods
137
138See the Graph::Easy documentation for descriptions of these functions.
139
140=over
141
142=item B<node>
143
144=cut
b49c4318 145
b49c4318 146sub node {
147 my $self = shift;
148 return $self->{'graph'}->node( @_ );
149}
150
2ceca8c3 151=item B<edge>
152
153=cut
154
b49c4318 155sub edge {
156 my $self = shift;
157 return $self->{'graph'}->edge( @_ );
158}
159
2ceca8c3 160=item B<add_node>
161
162=cut
163
c2d16875 164# Not only adds the node, but also initializes internal data
165# about the node.
2ceca8c3 166
b49c4318 167sub add_node {
168 my $self = shift;
c2d16875 169 my $node = $self->{'graph'}->add_node( @_ );
170 $self->{'identical_nodes'}->{ $node->name() } = [ $node->name() ];
171 return $node;
b49c4318 172}
173
2ceca8c3 174=item B<add_edge>
175
176=cut
177
b49c4318 178sub add_edge {
179 my $self = shift;
180 return $self->{'graph'}->add_edge( @_ );
181}
182
2ceca8c3 183=item B<del_node>
184
185=cut
186
b49c4318 187sub del_node {
188 my $self = shift;
c2d16875 189 my $node = $_[0];
190
191 # Delete this node out of any relevant transposition pool.
192 if( ref $node eq 'Graph::Easy::Node' ) {
193 $node = $node->name();
194 }
195 my @ident = $self->identical_nodes( $node );
196 if( @ident ) {
197 # Get the pool.
198 my $pool = $self->{'identical_nodes'}->{ $ident[0] };
199 foreach my $i ( 0 .. scalar(@$pool)-1 ) {
200 if( $pool->[$i] eq $node ) {
201 splice( @$pool, $i, 1 );
202 last;
203 }
204 }
205 }
206 delete $self->{'identical_nodes'}->{ $node };
207
208 # Now delete the node.
b49c4318 209 return $self->{'graph'}->del_node( @_ );
210}
211
2ceca8c3 212=item B<del_edge>
213
214=cut
215
b49c4318 216sub del_edge {
217 my $self = shift;
218 return $self->{'graph'}->del_edge( @_ );
219}
220
2ceca8c3 221=item B<nodes>
222
223=cut
224
b49c4318 225sub nodes {
226 my $self = shift;
227 return $self->{'graph'}->nodes( @_ );
228}
229
2ceca8c3 230=item B<edges>
231
232=cut
233
b49c4318 234sub edges {
235 my $self = shift;
236 return $self->{'graph'}->edges( @_ );
237}
238
2ceca8c3 239=item B<merge_nodes>
240
241=cut
242
b49c4318 243sub merge_nodes {
244 my $self = shift;
245 return $self->{'graph'}->merge_nodes( @_ );
246}
247
248### Helper methods for navigating the tree
249
2ceca8c3 250=back
251
252=head2 Graph navigation methods
253
254=over
255
256=item B<start>
257
258my $node = $graph->start();
259
260Returns the beginning node of the graph.
261
262=cut
263
b49c4318 264sub start {
265 # Return the beginning node of the graph.
266 my $self = shift;
267 my( $new_start ) = @_;
268 if( $new_start ) {
c2d16875 269 # Fix the node transposition data
270 delete $self->{'identical_nodes'}->{ $new_start->name() };
271 $self->{'identical_nodes'}->{'#START#'} = [ '#START#' ];
b49c4318 272 $self->{'graph'}->rename_node( $new_start, '#START#' );
273 }
274 return $self->{'graph'}->node('#START#');
275}
276
2ceca8c3 277=item B<next_word>
c2d16875 278
2ceca8c3 279my $next_node = $graph->next_word( $node, $path );
c2d16875 280
2ceca8c3 281Returns the node that follows the given node along the given witness
282path. TODO These are badly named.
283
284=cut
b49c4318 285
286sub next_word {
287 # Return the successor via the corresponding edge.
e49731d7 288 my $self = shift;
289 return $self->_find_linked_word( 'next', @_ );
b49c4318 290}
291
2ceca8c3 292=item B<prior_word>
293
294my $prior_node = $graph->prior_word( $node, $path );
295
296Returns the node that precedes the given node along the given witness
297path. TODO These are badly named.
298
299=cut
300
b49c4318 301sub prior_word {
302 # Return the predecessor via the corresponding edge.
e49731d7 303 my $self = shift;
304 return $self->_find_linked_word( 'prior', @_ );
305}
306
307sub _find_linked_word {
308 my( $self, $direction, $node, $edge ) = @_;
309 $edge = 'base text' unless $edge;
310 my @linked_edges = $direction eq 'next'
311 ? $node->outgoing() : $node->incoming();
312 return undef unless scalar( @linked_edges );
b49c4318 313
e49731d7 314 # We have to find the linked edge that contains all of the
315 # witnesses supplied in $edge.
316 my @edge_wits = split( /, /, $edge );
317 foreach my $le ( @linked_edges ) {
318 my @le_wits = split( /, /, $le->name() );
319 if( _is_within( \@edge_wits, \@le_wits ) ) {
320 # This is the right edge.
321 return $direction eq 'next' ? $le->to() : $le->from();
322 }
b49c4318 323 }
e49731d7 324 warn "Could not find $direction node from " . $node->label
325 . " along edge $edge";
b49c4318 326 return undef;
327}
328
e49731d7 329# Some set logic.
330sub _is_within {
331 my( $set1, $set2 ) = @_;
332 my $ret = 1;
333 foreach my $el ( @$set1 ) {
334 $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
335 }
336 return $ret;
337}
338
2ceca8c3 339=item B<node_sequence>
340
341my @nodes = $graph->node_sequence( $first, $last, $path );
342
343Returns the ordered list of nodes, starting with $first and ending
344with $last, along the given witness path.
345
346=cut
347
b49c4318 348sub node_sequence {
52ce987f 349 my( $self, $start, $end, $witness, $backup ) = @_;
b49c4318 350 unless( ref( $start ) eq 'Graph::Easy::Node'
351 && ref( $end ) eq 'Graph::Easy::Node' ) {
352 warn "Called node_sequence without two nodes!";
353 return ();
354 }
52ce987f 355 $witness = 'base text' unless $witness;
b49c4318 356 my @nodes = ( $start );
357 my %seen;
358 my $n = $start;
359 while( $n ne $end ) {
360 if( exists( $seen{$n->name()} ) ) {
361 warn "Detected loop at " . $n->name();
362 last;
363 }
364 $seen{$n->name()} = 1;
365
366 my @edges = $n->outgoing();
52ce987f 367 my @relevant_edges = grep { my @w = split( /, /, $_->label );
368 grep { /^\Q$witness\E$/ } @w } @edges;
369 unless( @relevant_edges ) {
370 @relevant_edges = grep { my @w = split( /, /, $_->label );
371 grep { /^\Q$backup\E$/ } @w } @edges
372 if $backup;
373 }
374 unless( @relevant_edges ) {
375 @relevant_edges = grep { $_->label() eq 'base text' } @edges;
376 }
377
378 warn "Did not find an edge for $witness from node " . $n->label
b49c4318 379 unless scalar @relevant_edges;
b49c4318 380 my $next = $relevant_edges[0]->to();
381 push( @nodes, $next );
382 $n = $next;
383 }
384 # Check that the last node is our end node.
385 my $last = $nodes[$#nodes];
386 warn "Last node found from " . $start->label() .
52ce987f 387 " for witness $witness is not the end!"
b49c4318 388 unless $last eq $end;
389
390 return @nodes;
391}
392
2ceca8c3 393=item B<string_lemma>
394
395my $text = $graph->string_lemma( $first, $last, $path );
396
397Returns the whitespace-separated text, starting with $first and ending
398with $last, represented in the graph along the given path.
399
400=cut
401
b49c4318 402sub string_lemma {
403 my( $self, $start, $end, $label ) = @_;
404
405 my @nodes = $self->node_sequence( $start, $end, $label );
406 my @words = map { $_->label() } @nodes;
407 return join( ' ', @words );
408}
409
2ceca8c3 410=back
411
412=head2 Transposition handling methods
413
414These should really move to their own module. For use when the graph
415has split transposed nodes in order to avoid edges that travel
416backward.
417
418=over
419
420=item B<set_identical_node>
421
422$graph->set_identical_node( $node, $other_node )
423
424Tell the graph that these two nodes contain the same (transposed) reading.
425
426=cut
427
428sub set_identical_node {
429 my( $self, $node, $same_node ) = @_;
430 my $pool = $self->{'identical_nodes'}->{ $node };
431 my $same_pool = $self->{'identical_nodes'}->{ $same_node };
432 my %poolhash;
433 foreach ( @$pool ) {
434 $poolhash{$_} = 1;
435 }
436 foreach( @$same_pool ) {
437 push( @$pool, $_ ) unless $poolhash{$_};
438 }
439
440 $self->{'identical_nodes'}->{ $same_node } = $pool;
441}
442
443=item B<set_identical_node>
444
445my @nodes = $graph->identical_nodes( $node )
446
447Get a list of nodes that contain the same (transposed) reading as the
448given node.
449
450=cut
451
452sub identical_nodes {
453 my( $self, $node ) = @_;
454 my @others = grep { $_ !~ /^$node$/ }
455 @{$self->{'identical_nodes'}->{ $node }};
456 return @others;
457}
458
459=back
460
461=head2 Output method(s)
462
463=over
464
465=item B<as_svg>
466
467print $graph->as_svg( $recalculate );
468
469Returns an SVG string that represents the graph. Uses GraphViz to do
470this, because Graph::Easy doesn't cope well with long graphs. Unless
471$recalculate is passed (and is a true value), the method will return a
472cached copy of the SVG after the first call to the method.
473
474=cut
b49c4318 475
476sub as_svg {
477 my( $self, $recalc ) = @_;
478 return $self->{'svg'} if( exists $self->{'svg'} && !$recalc );
479
480 $self->{'graphviz'} = $self->{'graph'}->as_graphviz()
481 unless( exists $self->{'graphviz'} && !$recalc );
482
483 my @cmd = qw/dot -Tsvg/;
484 my( $svg, $err );
485 my $in = $self->{'graphviz'};
486 run( \@cmd, \$in, ">", binary(), \$svg );
487 $self->{'svg'} = $svg;
488 return $svg;
489}
490
2ceca8c3 491=back
492
493=head2 Lemmatization methods
494
495=over
496
497=item B<init_lemmatizer>
498
499=cut
b49c4318 500
a25d4374 501sub init_lemmatizer {
502 my $self = shift;
503 # Initialize the 'lemma' hash, going through all the nodes and seeing
58a3c424 504 # which ones are common nodes. This should only be run once.
a25d4374 505
58a3c424 506 return if( $self->{'lemmatizer_initialized'} );
507 my @active_names = map { $_->name } grep { $self->is_common( $_ ) }
508 $self->nodes();
509 $self->{'positions'}->init_lemmatizer( @active_names );
510 $self->{'lemmatizer_initialized'} = 1;
a25d4374 511
a25d4374 512}
513
e49731d7 514=item B<toggle_node>
515
516my @nodes_turned_off = $graph->toggle_node( $node );
517
518Takes a node name, and either lemmatizes or de-lemmatizes it. Returns
519a list of all nodes that are de-lemmatized as a result of the toggle.
520
521=cut
522
523sub toggle_node {
524 my( $self, $node ) = @_;
525
526 # In case this is being called for the first time.
527 $self->init_lemmatizer();
528
52ce987f 529 if( !$node || $self->is_common( $node ) ) {
e49731d7 530 # Do nothing, it's a common node.
531 return;
532 }
533
534 my $pos = $self->{'positions'}->node_position( $node );
535 my $old_state = $self->{'positions'}->state( $pos );
536 my @nodes_off;
537 if( $old_state && $old_state eq $node ) {
538 # Turn off the node. We turn on no others by default.
539 push( @nodes_off, $node );
540 } else {
541 # Turn on the node.
542 $self->{'positions'}->set_state( $pos, $node );
543 # Any other 'on' nodes in the same position should be off.
544 push( @nodes_off, $self->colocated_nodes( $node ) );
545 # Any node that is an identical transposed one should be off.
546 push( @nodes_off, $self->identical_nodes( $node ) )
547 if $self->identical_nodes( $node );
548 }
549 @nodes_off = unique_list( @nodes_off );
550
551 # Turn off the nodes that need to be turned off.
552 my @nodes_turned_off;
553 foreach my $n ( @nodes_off ) {
554 my $npos = $self->{'positions'}->node_position( $n );
555 my $state = $self->{'positions'}->state( $npos );
556 if( $state && $state eq $n ) {
557 # this node is still on
558 push( @nodes_turned_off, $n );
559 my $new_state = undef;
560 if( $n eq $node ) {
561 # This is the node that was clicked, so if there are no
562 # other nodes there, turn off the position. In all other
563 # cases, restore the ellipsis.
564 my @all_n = $self->{'positions'}->nodes_at_position( $pos );
565 $new_state = 0 if scalar( @all_n ) == 1;
566 }
567 $self->{'positions'}->set_state( $npos, $new_state );
568 } elsif( $old_state && $old_state eq $n ) {
569 # another node has already been turned on here
570 push( @nodes_turned_off, $n );
571 } # else some other node was on anyway, so pass.
572 }
573 return @nodes_turned_off;
574}
575
576=item B<active_nodes>
577
578my @state = $graph->active_nodes( @nodes_turned_off );
579
580Takes a list of nodes that have just been turned off, and returns a
581set of tuples of the form ['node', 'state'] that indicates what
582changes need to be made to the graph.
583
584=over
585
586=item *
587
588A state of 1 means 'turn on this node'
589
590=item *
591
592A state of 0 means 'turn off this node'
593
594=item *
595
596A state of undef means 'an ellipsis belongs in the text here because
597no decision has been made'
598
599=back
600
601=cut
602
b49c4318 603sub active_nodes {
604 my( $self, @toggled_off_nodes ) = @_;
a25d4374 605
606 # In case this is the first run
607 $self->init_lemmatizer();
608 # First get the positions of those nodes which have been
609 # toggled off.
b49c4318 610 my $positions_off = {};
a25d4374 611 map { $positions_off->{ $self->{'positions'}->node_position( $_ ) } = $_ }
612 @toggled_off_nodes;
613
b49c4318 614
615 # Now for each position, we have to see if a node is on, and we
616 # have to see if a node has been turned off.
617 my @answer;
a25d4374 618 foreach my $pos ( $self->{'positions'}->all() ) {
58a3c424 619 # Find the state of this position. If there is an active node,
620 # its name will be the state; otherwise the state will be 0
621 # (nothing at this position) or undef (ellipsis at this position)
622 my $active = $self->{'positions'}->state( $pos );
a25d4374 623
b49c4318 624 # Is there a formerly active node that was toggled off?
625 if( exists( $positions_off->{$pos} ) ) {
a25d4374 626 my $off_node = $positions_off->{$pos};
58a3c424 627 if( $active && $active ne $off_node) {
b49c4318 628 push( @answer, [ $off_node, 0 ], [ $active, 1 ] );
b49c4318 629 } else {
58a3c424 630 push( @answer, [ $off_node, $active ] );
b49c4318 631 }
58a3c424 632
b49c4318 633 # No formerly active node, so we just see if there is a currently
634 # active one.
635 } elsif( $active ) {
636 # Push the active node, whatever it is.
637 push( @answer, [ $active, 1 ] );
638 } else {
58a3c424 639 # Push the state that is there. Arbitrarily use the first node
640 # at that position.
641 my @pos_nodes = $self->{'positions'}->nodes_at_position( $pos );
642 push( @answer,
643 [ $pos_nodes[0], $self->{'positions'}->state( $pos ) ] );
b49c4318 644 }
645 }
58a3c424 646
b49c4318 647 return @answer;
648}
649
52ce987f 650# A couple of helpers.
b49c4318 651
a25d4374 652sub is_common {
653 my( $self, $node ) = @_;
654 $node = $self->_nodeobj( $node );
655 return $node->get_attribute('class') eq 'common';
b49c4318 656}
657
a25d4374 658sub _nodeobj {
659 my( $self, $node ) = @_;
660 unless( ref $node eq 'Graph::Easy::Node' ) {
661 $node = $self->node( $node );
b49c4318 662 }
a25d4374 663 return $node;
b49c4318 664}
665
b49c4318 666sub colocated_nodes {
a25d4374 667 my $self = shift;
668 return $self->{'positions'}->colocated_nodes( @_ );
b49c4318 669}
670
a25d4374 671sub text_of_node {
672 my( $self, $node_id ) = @_;
673 # This is the label of the given node.
674 return $self->node( $node_id )->label();
b49c4318 675}
676
677sub text_for_witness {
678 my( $self, $wit ) = @_;
b49c4318 679
a25d4374 680 my @nodes = $self->{'positions'}->witness_path( $wit );
681 my @words = map { $self->node( $_ )->label() } @nodes;
b49c4318 682 return join( ' ', @words );
683}
684
a25d4374 685sub unique_list {
686 my( @list ) = @_;
687 my %h;
688 map { $h{$_} = 1 } @list;
689 return keys( %h );
b49c4318 690}
a25d4374 691
2ceca8c3 692=back
693
694=head1 LICENSE
695
696This package is free software and is provided "as is" without express
697or implied warranty. You can redistribute it and/or modify it under
698the same terms as Perl itself.
699
700=head1 AUTHOR
701
702Tara L Andrews, aurum@cpan.org
703
704=cut
705
b49c4318 7061;
a25d4374 707