lib/Text/Tradition/Collation.pm

   1 package Text::Tradition::Collation;
   2
   3 use Graph::Easy;
   4 use IPC::Run qw( run binary );
   5 use Text::Tradition::Collation::Path;
   6 use Text::Tradition::Collation::Reading;
   7 use Text::Tradition::Collation::Relationship;
   8 use Text::Tradition::Collation::Segment;
   9 use XML::LibXML;
  10 use Moose;
  11
  12 has 'graph' => (
  13     is => 'ro',
  14     isa => 'Graph::Easy',
  15     handles => {
  16         add_reading => 'add_node',
  17         del_reading => 'del_node',
  18         add_path => 'add_edge',
  19         del_path => 'del_edge',
  20         reading => 'node',
  21         path => 'edge',
  22         readings => 'nodes',
  23         segments => 'nodes',
  24         paths => 'edges',
  25         relationships => 'edges',
  26     },
  27     default => sub { Graph::Easy->new( undirected => 0 ) },
  28     );
  29
  30
  31 has 'tradition' => (
  32     is => 'rw',
  33     isa => 'Text::Tradition',
  34     );
  35
  36 has 'svg' => (
  37     is => 'ro',
  38     isa => 'Str',
  39     writer => '_save_svg',
  40     predicate => 'has_svg',
  41     );
  42
  43 has 'graphml' => (
  44     is => 'ro',
  45     isa => 'Str',
  46     writer => '_save_graphml',
  47     predicate => 'has_graphml',
  48     );
  49
  50 # Keeps track of the lemmas within the collation.  At most one lemma
  51 # per position in the graph.
  52 has 'lemmata' => (
  53     is => 'ro',
  54     isa => 'HashRef[Maybe[Str]]',
  55     default => sub { {} },
  56     );
  57
  58 has 'wit_list_separator' => (
  59     is => 'rw',
  60     isa => 'Str',
  61     default => ', ',
  62     );
  63
  64 has 'baselabel' => (
  65     is => 'rw',
  66     isa => 'Str',
  67     default => 'base text',
  68     );
  69
  70 has 'collapsed' => (
  71     is => 'rw',
  72     isa => 'Bool',
  73     );
  74
  75 has 'linear' => (
  76     is => 'rw',
  77     isa => 'Bool',
  78     default => 1,
  79     );
  80
  81 has 'ac_label' => (
  82     is => 'rw',
  83     isa => 'Str',
  84     default => ' (a.c.)',
  85     );
  86
  87
  88 # The collation can be created two ways:
  89 # 1. Collate a set of witnesses (with CollateX I guess) and process
  90 #    the results as in 2.
  91 # 2. Read a pre-prepared collation in one of a variety of formats,
  92 #    and make the graph from that.
  93
  94 # The graph itself will (for now) be immutable, and the positions
  95 # within the graph will also be immutable.  We need to calculate those
  96 # positions upon graph construction.  The equivalences between graph
  97 # nodes will be mutable, entirely determined by the user (or possibly
  98 # by some semantic pre-processing provided by the user.)  So the
  99 # constructor should just make an empty equivalences object.  The
 100 # constructor will also need to make the witness objects, if we didn't
 101 # come through option 1.
 102
 103 sub BUILD {
 104     my( $self, $args ) = @_;
 105     $self->graph->use_class('node', 'Text::Tradition::Collation::Reading');
 106     $self->graph->use_class('edge', 'Text::Tradition::Collation::Path');
 107
 108     # Pass through any graph-specific options.
 109     my $shape = exists( $args->{'shape'} ) ? $args->{'shape'} : 'ellipse';
 110     $self->graph->set_attribute( 'node', 'shape', $shape );
 111 }
 112
 113 # Wrapper around add_path
 114
 115 around add_path => sub {
 116     my $orig = shift;
 117     my $self = shift;
 118
 119     # Make sure there are three arguments
 120     unless( @_ == 3 ) {
 121         warn "Call add_path with args source, target, witness";
 122         return;
 123     }
 124     # Make sure the proposed path does not yet exist
 125     # NOTE 'reading' will currently return readings and segments
 126     my( $source, $target, $wit ) = @_;
 127     $source = $self->reading( $source )
 128         unless ref( $source ) eq 'Text::Tradition::Collation::Reading';
 129     $target = $self->reading( $target )
 130         unless ref( $target ) eq 'Text::Tradition::Collation::Reading';
 131     foreach my $path ( $source->edges_to( $target ) ) {
 132         if( $path->label eq $wit && $path->class eq 'edge.path' ) {
 133             return;
 134         }
 135     }
 136     # Do the deed
 137     $self->$orig( @_ );
 138 };
 139
 140 # Wrapper around paths
 141 around paths => sub {
 142     my $orig = shift;
 143     my $self = shift;
 144
 145     my @result = grep { $_->sub_class eq 'path' } $self->$orig( @_ );
 146     return @result;
 147 };
 148
 149 around relationships => sub {
 150     my $orig = shift;
 151     my $self = shift;
 152     my @result = grep { $_->sub_class eq 'relationship' } $self->$orig( @_ );
 153     return @result;
 154 };
 155
 156 around readings => sub {
 157     my $orig = shift;
 158     my $self = shift;
 159     my @result = grep { $_->sub_class ne 'segment' } $self->$orig( @_ );
 160     return @result;
 161 };
 162
 163 around segments => sub {
 164     my $orig = shift;
 165     my $self = shift;
 166     my @result = grep { $_->sub_class eq 'segment' } $self->$orig( @_ );
 167     return @result;
 168 };
 169
 170 # Wrapper around merge_nodes
 171
 172 sub merge_readings {
 173     my $self = shift;
 174     my $first_node = shift;
 175     my $second_node = shift;
 176     $first_node->merge_from( $second_node );
 177     unshift( @_, $first_node, $second_node );
 178     return $self->graph->merge_nodes( @_ );
 179 }
 180
 181 # Extra graph-alike utility
 182 sub has_path {
 183     my( $self, $source, $target, $label ) = @_;
 184     my @paths = $source->edges_to( $target );
 185     my @relevant = grep { $_->label eq $label } @paths;
 186     return scalar @relevant;
 187 }
 188
 189 ## Dealing with groups of readings, i.e. segments.
 190
 191 sub add_segment {
 192     my( $self, @items ) = @_;
 193     my $segment = Text::Tradition::Collation::Segment->new( 'members' => \@items );
 194     return $segment;
 195 }
 196
 197 ## Dealing with relationships between readings.  This is a different
 198 ## sort of graph edge.
 199
 200 sub add_relationship {
 201     my( $self, $source, $target, $options ) = @_;
 202
 203     # Make sure there is not another relationship between these two
 204     # readings or segments already
 205     $source = $self->reading( $source )
 206         unless ref( $source ) && $source->isa( 'Graph::Easy::Node' );
 207     $target = $self->reading( $target )
 208         unless ref( $target ) && $target->isa( 'Graph::Easy::Node' );
 209     foreach my $rel ( $source->edges_to( $target ) ) {
 210         if( $rel->label eq $options->{'type'} && $rel->class eq 'edge.relationship' ) {
 211             return;
 212         }
 213     }
 214     $options->{'orig_relation'} = [ $source, $target ];
 215
 216     my $rel = Text::Tradition::Collation::Relationship->new( %$options );
 217     $self->graph->add_edge( $source, $target, $rel );
 218     if( $options->{'global'} ) {
 219         # Look for all readings with the source label, and if there are
 220         # colocated readings with the target label, join them too.
 221         foreach my $r ( $self->readings() ) {
 222             next unless $r->label eq $source->label;
 223             my @colocated = grep { $_->label eq $target->label }
 224                 $self->same_position_as( $r );
 225             if( @colocated ) {
 226                 warn "Multiple readings with same label at same position!"
 227                     if @colocated > 1;
 228                 my $dup_rel = Text::Tradition::Collation::Relationship->new( %$options );
 229                 $self->graph->add_edge( $r, $colocated[0], $dup_rel );
 230             }
 231         }
 232     }
 233 }
 234
 235 =head2 Output method(s)
 236
 237 =over
 238
 239 =item B<as_svg>
 240
 241 print $graph->as_svg( $recalculate );
 242
 243 Returns an SVG string that represents the graph.  Uses GraphViz to do
 244 this, because Graph::Easy doesn\'t cope well with long graphs. Unless
 245 $recalculate is passed (and is a true value), the method will return a
 246 cached copy of the SVG after the first call to the method.
 247
 248 =cut
 249
 250 sub as_svg {
 251     my( $self, $recalc ) = @_;
 252     return $self->svg if $self->has_svg;
 253
 254     $self->collapse_graph_paths();
 255
 256     my @cmd = qw/dot -Tsvg/;
 257     my( $svg, $err );
 258     my $in = $self->as_dot();
 259     run( \@cmd, \$in, ">", binary(), \$svg );
 260     $self->_save_svg( $svg );
 261     $self->expand_graph_paths();
 262     return $svg;
 263 }
 264
 265 =item B<as_dot>
 266
 267 print $graph->as_dot( $view, $recalculate );
 268
 269 Returns a string that is the collation graph expressed in dot
 270 (i.e. GraphViz) format.  The 'view' argument determines what kind of
 271 graph is produced.
 272     * 'path': a graph of witness paths through the collation (DEFAULT)
 273     * 'relationship': a graph of how collation readings relate to
 274       each other
 275
 276 =cut
 277
 278 sub as_dot {
 279     my( $self, $view ) = @_;
 280     $view = 'path' unless $view;
 281     # TODO consider making some of these things configurable
 282     my $dot = sprintf( "digraph %s {\n", $self->tradition->name );
 283     $dot .= "\tedge [ arrowhead=open ];\n";
 284     $dot .= "\tgraph [ rankdir=LR ];\n";
 285     $dot .= sprintf( "\tnode [ fontsize=%d, fillcolor=%s, style=%s, shape=%s ];\n",
 286                      11, "white", "filled", $self->graph->get_attribute( 'node', 'shape' ) );
 287
 288     foreach my $reading ( $self->readings ) {
 289         # Need not output nodes without separate labels
 290         next if $reading->name eq $reading->label;
 291         # TODO output readings or segments, but not both
 292         next if $reading->class eq 'node.segment';
 293         $dot .= sprintf( "\t\"%s\" [ label=\"%s\" ]\n", $reading->name, $reading->label );
 294     }
 295
 296     my @edges = $view eq 'relationship' ? $self->relationships : $self->paths;
 297     foreach my $edge ( @edges ) {
 298         $dot .= sprintf( "\t\"%s\" -> \"%s\" [ color=\"%s\", fontcolor=\"%s\", label=\"%s\" ]\n",
 299                          $edge->from->name, $edge->to->name, '#000000', '#000000', $edge->label );
 300     }
 301
 302     $dot .= "}\n";
 303     return $dot;
 304 }
 305
 306 =item B<as_graphml>
 307
 308 print $graph->as_graphml( $recalculate )
 309
 310 Returns a GraphML representation of the collation graph, with
 311 transposition information and position information. Unless
 312 $recalculate is passed (and is a true value), the method will return a
 313 cached copy of the SVG after the first call to the method.
 314
 315 =cut
 316
 317 sub as_graphml {
 318     my( $self, $recalc ) = @_;
 319     return $self->graphml if $self->has_graphml;
 320
 321     # Some namespaces
 322     my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns';
 323     my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance';
 324     my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' .
 325         'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
 326
 327     # Create the document and root node
 328     my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
 329     my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
 330     $graphml->setDocumentElement( $root );
 331     $root->setNamespace( $xsi_ns, 'xsi', 0 );
 332     $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
 333
 334     # Add the data keys for nodes
 335     my %node_data_keys;
 336     my $ndi = 0;
 337     foreach my $datum ( qw/ name reading identical position class / ) {
 338         $node_data_keys{$datum} = 'dn'.$ndi++;
 339         my $key = $root->addNewChild( $graphml_ns, 'key' );
 340         $key->setAttribute( 'attr.name', $datum );
 341         $key->setAttribute( 'attr.type', 'string' );
 342         $key->setAttribute( 'for', 'node' );
 343         $key->setAttribute( 'id', $node_data_keys{$datum} );
 344     }
 345
 346     # Add the data keys for edges, i.e. witnesses
 347     my $edi = 0;
 348     my %edge_data_keys;
 349     foreach my $edge_key( qw/ witness_main witness_ante_corr relationship class / ) {
 350         $edge_data_keys{$edge_key} = 'de'.$edi++;
 351         my $key = $root->addNewChild( $graphml_ns, 'key' );
 352         $key->setAttribute( 'attr.name', $edge_key );
 353         $key->setAttribute( 'attr.type', 'string' );
 354         $key->setAttribute( 'for', 'edge' );
 355         $key->setAttribute( 'id', $edge_data_keys{$edge_key} );
 356     }
 357
 358     # Add the graph, its nodes, and its edges
 359     my $graph = $root->addNewChild( $graphml_ns, 'graph' );
 360     $graph->setAttribute( 'edgedefault', 'directed' );
 361     $graph->setAttribute( 'id', 'g0' ); # TODO make this meaningful
 362     $graph->setAttribute( 'parse.edgeids', 'canonical' );
 363     $graph->setAttribute( 'parse.edges', scalar($self->paths) );
 364     $graph->setAttribute( 'parse.nodeids', 'canonical' );
 365     $graph->setAttribute( 'parse.nodes', scalar($self->readings) );
 366     $graph->setAttribute( 'parse.order', 'nodesfirst' );
 367
 368     my $node_ctr = 0;
 369     my %node_hash;
 370     # Add our readings to the graph
 371     foreach my $n ( sort { $a->name cmp $b->name } $self->readings ) {
 372         my $node_el = $graph->addNewChild( $graphml_ns, 'node' );
 373         my $node_xmlid = 'n' . $node_ctr++;
 374         $node_hash{ $n->name } = $node_xmlid;
 375         $node_el->setAttribute( 'id', $node_xmlid );
 376         _add_graphml_data( $node_el, $node_data_keys{'name'}, $n->name );
 377         _add_graphml_data( $node_el, $node_data_keys{'reading'}, $n->label );
 378         _add_graphml_data( $node_el, $node_data_keys{'position'}, $n->position );
 379         _add_graphml_data( $node_el, $node_data_keys{'class'}, $n->sub_class );
 380         _add_graphml_data( $node_el, $node_data_keys{'identical'}, $n->primary->name )
 381             if $n->has_primary;
 382     }
 383
 384     # Add any segments we have
 385     foreach my $n ( sort { $a->name cmp $b->name } $self->segments ) {
 386         my $node_el = $graph->addNewChild( $graphml_ns, 'node' );
 387         my $node_xmlid = 'n' . $node_ctr++;
 388         $node_hash{ $n->name } = $node_xmlid;
 389         $node_el->setAttribute( 'id', $node_xmlid );
 390         _add_graphml_data( $node_el, $node_data_keys{'class'}, $n->sub_class );
 391         _add_graphml_data( $node_el, $node_data_keys{'name'}, $n->name );
 392     }
 393
 394     # Add the path, relationship, and segment edges
 395     my $edge_ctr = 0;
 396     foreach my $e ( sort { $a->from->name cmp $b->from->name } $self->graph->edges() ) {
 397         my( $name, $from, $to ) = ( 'e'.$edge_ctr++,
 398                                     $node_hash{ $e->from->name() },
 399                                     $node_hash{ $e->to->name() } );
 400         my $edge_el = $graph->addNewChild( $graphml_ns, 'edge' );
 401         $edge_el->setAttribute( 'source', $from );
 402         $edge_el->setAttribute( 'target', $to );
 403         $edge_el->setAttribute( 'id', $name );
 404         # Add the edge class
 405         _add_graphml_data( $edge_el, $edge_data_keys{'class'}, $e->sub_class );
 406         if( $e->sub_class eq 'path' ) {
 407             # It's a witness path, so add the witness
 408             my $base = $e->label;
 409             my $key = $edge_data_keys{'witness_main'};
 410             # TODO kind of hacky
 411             if( $e->label =~ /^(.*?)\s+(\(a\.c\.\))$/ ) {
 412                 $base = $1;
 413                 $key = $edge_data_keys{'witness_ante_corr'};
 414             }
 415             _add_graphml_data( $edge_el, $key, $base );
 416         } elsif( $e->sub_class eq 'relationship' ) {
 417             # It's a relationship
 418             _add_graphml_data( $edge_el, $edge_data_keys{'relationship'}, $e->label );
 419         } # else a segment, nothing to record but source, target, class
 420     }
 421
 422     # Return the thing
 423     $self->_save_graphml( $graphml->toString(1) );
 424     return $graphml->toString(1);
 425 }
 426
 427 sub _add_graphml_data {
 428     my( $el, $key, $value ) = @_;
 429     my $data_el = $el->addNewChild( $el->namespaceURI, 'data' );
 430     return unless defined $value;
 431     $data_el->setAttribute( 'key', $key );
 432     $data_el->appendText( $value );
 433 }
 434
 435 sub collapse_graph_paths {
 436     my $self = shift;
 437     # Our collation graph has an path per witness.  This is great for
 438     # calculation purposes, but terrible for display.  Thus we want to
 439     # display only one path between any two nodes.
 440
 441     return if $self->collapsed;
 442
 443     print STDERR "Collapsing witness paths in graph...\n";
 444
 445     # Don't list out every witness if we have more than half to list.
 446     my $majority = int( scalar( @{$self->tradition->witnesses} ) / 2 ) + 1;
 447     # But don't compress if there are only a few witnesses.
 448     $majority = 4 if $majority < 4;
 449     foreach my $node ( $self->readings ) {
 450         my $newlabels = {};
 451         # We will visit each node, so we only look ahead.
 452         foreach my $edge ( $node->outgoing() ) {
 453             next unless $edge->class eq 'edge.path';
 454             add_hash_entry( $newlabels, $edge->to->name, $edge->name );
 455             $self->del_path( $edge );
 456         }
 457
 458         foreach my $newdest ( keys %$newlabels ) {
 459             my $label;
 460             my @compressed_wits = ();
 461             if( @{$newlabels->{$newdest}} < $majority ) {
 462                 $label = join( ', ', sort( @{$newlabels->{$newdest}} ) );
 463             } else {
 464                 ## TODO FIX THIS HACK
 465                 my @aclabels;
 466                 foreach my $wit ( @{$newlabels->{$newdest}} ) {
 467                     if( $wit =~ /^(.*?)(\s*\(?a\.\s*c\.\)?)$/ ) {
 468                         push( @aclabels, $wit );
 469                     } else {
 470                         push( @compressed_wits, $wit );
 471                     }
 472                 }
 473                 $label = join( ', ', 'majority', sort( @aclabels ) );
 474             }
 475
 476             my $newpath =
 477                 $self->add_path( $node, $self->reading( $newdest ), $label );
 478             if( @compressed_wits ) {
 479                 $newpath->hidden_witnesses( \@compressed_wits );
 480             }
 481         }
 482     }
 483
 484     $self->collapsed( 1 );
 485 }
 486
 487 sub expand_graph_paths {
 488     my $self = shift;
 489     # Our collation graph has only one path between any two nodes.
 490     # This is great for display, but not so great for analysis.
 491     # Expand this so that each witness has its own path between any
 492     # two reading nodes.
 493     return unless $self->collapsed;
 494
 495     print STDERR "Expanding witness paths in graph...\n";
 496     foreach my $path( $self->paths ) {
 497         my $from = $path->from;
 498         my $to = $path->to;
 499         my @wits = split( /, /, $path->label );
 500         if( $path->has_hidden_witnesses ) {
 501             push( @wits, @{$path->hidden_witnesses} );
 502         }
 503         $self->del_path( $path );
 504         foreach ( @wits ) {
 505             $self->add_path( $from, $to, $_ );
 506         }
 507     }
 508     $self->collapsed( 0 );
 509 }
 510
 511 =back
 512
 513 =head2 Navigation methods
 514
 515 =over
 516
 517 =item B<start>
 518
 519 my $beginning = $collation->start();
 520
 521 Returns the beginning of the collation, a meta-reading with label '#START#'.
 522
 523 =cut
 524
 525 sub start {
 526     # Return the beginning reading of the graph.
 527     my $self = shift;
 528     my( $new_start ) = @_;
 529     if( $new_start ) {
 530         $self->del_reading( '#START#' );
 531         $self->graph->rename_node( $new_start, '#START#' );
 532     }
 533     return $self->reading('#START#');
 534 }
 535
 536 =item B<reading_sequence>
 537
 538 my @readings = $graph->reading_sequence( $first, $last, $path[, $alt_path] );
 539
 540 Returns the ordered list of readings, starting with $first and ending
 541 with $last, along the given witness path.  If no path is specified,
 542 assume that the path is that of the base text (if any.)
 543
 544 =cut
 545
 546 sub reading_sequence {
 547     my( $self, $start, $end, $witness, $backup ) = @_;
 548
 549     $witness = $self->baselabel unless $witness;
 550     my @readings = ( $start );
 551     my %seen;
 552     my $n = $start;
 553     while( $n && $n ne $end ) {
 554         if( exists( $seen{$n->name()} ) ) {
 555             warn "Detected loop at " . $n->name();
 556             last;
 557         }
 558         $seen{$n->name()} = 1;
 559
 560         my $next = $self->next_reading( $n, $witness, $backup );
 561         warn "Did not find any path for $witness from reading " . $n->name
 562             unless $next;
 563         push( @readings, $next );
 564         $n = $next;
 565     }
 566     # Check that the last reading is our end reading.
 567     my $last = $readings[$#readings];
 568     warn "Last reading found from " . $start->label() .
 569         " for witness $witness is not the end!"
 570         unless $last eq $end;
 571
 572     return @readings;
 573 }
 574
 575 =item B<next_reading>
 576
 577 my $next_reading = $graph->next_reading( $reading, $witpath );
 578
 579 Returns the reading that follows the given reading along the given witness
 580 path.
 581
 582 =cut
 583
 584 sub next_reading {
 585     # Return the successor via the corresponding path.
 586     my $self = shift;
 587     return $self->_find_linked_reading( 'next', @_ );
 588 }
 589
 590 =item B<prior_reading>
 591
 592 my $prior_reading = $graph->prior_reading( $reading, $witpath );
 593
 594 Returns the reading that precedes the given reading along the given witness
 595 path.
 596
 597 =cut
 598
 599 sub prior_reading {
 600     # Return the predecessor via the corresponding path.
 601     my $self = shift;
 602     return $self->_find_linked_reading( 'prior', @_ );
 603 }
 604
 605 sub _find_linked_reading {
 606     my( $self, $direction, $node, $path, $alt_path ) = @_;
 607     my @linked_paths = $direction eq 'next'
 608         ? $node->outgoing() : $node->incoming();
 609     return undef unless scalar( @linked_paths );
 610
 611     # We have to find the linked path that contains all of the
 612     # witnesses supplied in $path.
 613     my( @path_wits, @alt_path_wits );
 614     @path_wits = $self->witnesses_of_label( $path ) if $path;
 615     @alt_path_wits = $self->witnesses_of_label( $alt_path ) if $alt_path;
 616     my $base_le;
 617     my $alt_le;
 618     foreach my $le ( @linked_paths ) {
 619         if( $le->name eq $self->baselabel ) {
 620             $base_le = $le;
 621         } else {
 622             my @le_wits = $self->witnesses_of_label( $le->name );
 623             if( _is_within( \@path_wits, \@le_wits ) ) {
 624                 # This is the right path.
 625                 return $direction eq 'next' ? $le->to() : $le->from();
 626             } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) {
 627                 $alt_le = $le;
 628             }
 629         }
 630     }
 631     # Got this far? Return the alternate path if it exists.
 632     return $direction eq 'next' ? $alt_le->to() : $alt_le->from()
 633         if $alt_le;
 634
 635     # Got this far? Return the base path if it exists.
 636     return $direction eq 'next' ? $base_le->to() : $base_le->from()
 637         if $base_le;
 638
 639     # Got this far? We have no appropriate path.
 640     warn "Could not find $direction node from " . $node->label
 641         . " along path $path";
 642     return undef;
 643 }
 644
 645 # Some set logic.
 646 sub _is_within {
 647     my( $set1, $set2 ) = @_;
 648     my $ret = @$set1; # will be 0, i.e. false, if set1 is empty
 649     foreach my $el ( @$set1 ) {
 650         $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
 651     }
 652     return $ret;
 653 }
 654
 655
 656 ## INITIALIZATION METHODS - for use by parsers
 657 # Walk the paths for each witness in the graph, and return the nodes
 658 # that the graph has in common.  If $using_base is true, some
 659 # different logic is needed.
 660
 661 sub walk_witness_paths {
 662     my( $self, $end ) = @_;
 663     # For each witness, walk the path through the graph.
 664     # Then we need to find the common nodes.
 665     # TODO This method is going to fall down if we have a very gappy
 666     # text in the collation.
 667     my $paths = {};
 668     my @common_readings;
 669     foreach my $wit ( @{$self->tradition->witnesses} ) {
 670         my $curr_reading = $self->start;
 671         my @wit_path = $self->reading_sequence( $self->start, $end,
 672                                                 $wit->sigil );
 673         $wit->path( \@wit_path );
 674
 675         # Detect the common readings.
 676         @common_readings = _find_common( \@common_readings, \@wit_path );
 677     }
 678
 679     # Mark all the nodes as either common or not.
 680     foreach my $cn ( @common_readings ) {
 681         print STDERR "Setting " . $cn->name . " / " . $cn->label
 682             . " as common node\n";
 683         $cn->make_common;
 684     }
 685     foreach my $n ( $self->readings() ) {
 686         $n->make_variant unless $n->is_common;
 687     }
 688     # Return an array of the common nodes in order.
 689     return @common_readings;
 690 }
 691
 692 sub _find_common {
 693     my( $common_readings, $new_path ) = @_;
 694     my @cr;
 695     if( @$common_readings ) {
 696         foreach my $n ( @$new_path ) {
 697             push( @cr, $n ) if grep { $_ eq $n } @$common_readings;
 698         }
 699     } else {
 700         push( @cr, @$new_path );
 701     }
 702     return @cr;
 703 }
 704
 705 sub _remove_common {
 706     my( $common_readings, $divergence ) = @_;
 707     my @cr;
 708     my %diverged;
 709     map { $diverged{$_->name} = 1 } @$divergence;
 710     foreach( @$common_readings ) {
 711         push( @cr, $_ ) unless $diverged{$_->name};
 712     }
 713     return @cr;
 714 }
 715
 716
 717 # An alternative to walk_witness_paths, for use when a collation is
 718 # constructed from a base text and an apparatus.  We have the
 719 # sequences of readings and just need to add path edges.
 720
 721 sub make_witness_paths {
 722     my( $self ) = @_;
 723
 724     my @common_readings;
 725     foreach my $wit ( @{$self->tradition->witnesses} ) {
 726         print STDERR "Making path for " . $wit->sigil . "\n";
 727         $self->make_witness_path( $wit );
 728         @common_readings = _find_common( \@common_readings, $wit->path );
 729         @common_readings = _find_common( \@common_readings, $wit->uncorrected_path );
 730     }
 731     map { $_->make_common } @common_readings;
 732     return @common_readings;
 733 }
 734
 735 sub make_witness_path {
 736     my( $self, $wit ) = @_;
 737     my @chain = @{$wit->path};
 738     my $sig = $wit->sigil;
 739     foreach my $idx ( 0 .. $#chain-1 ) {
 740         $self->add_path( $chain[$idx], $chain[$idx+1], $sig );
 741     }
 742     @chain = @{$wit->uncorrected_path};
 743     foreach my $idx( 0 .. $#chain-1 ) {
 744         my $source = $chain[$idx];
 745         my $target = $chain[$idx+1];
 746         $self->add_path( $source, $target, $sig.$self->ac_label )
 747             unless $self->has_path( $source, $target, $sig );
 748     }
 749 }
 750
 751 sub common_readings {
 752     my $self = shift;
 753     my @common = grep { $_->is_common } $self->readings();
 754     return sort { _cmp_position( $a->position, $b->position ) } @common;
 755 }
 756
 757 # Calculate the relative positions of nodes in the graph, if they
 758 # were not given to us.
 759 sub calculate_positions {
 760     my( $self, @ordered_common ) = @_;
 761
 762     # We have to calculate the position identifiers for each word,
 763     # keyed on the common nodes.  This will be 'fun'.  The end result
 764     # is a hash per witness, whose key is the word node and whose
 765     # value is its position in the text.  Common nodes are always N,1
 766     # so have identical positions in each text.
 767
 768     my $node_pos = {};
 769     foreach my $wit ( @{$self->tradition->witnesses} ) {
 770         print STDERR "Calculating positions in " . $wit->sigil . "\n";
 771         _update_positions_from_path( $wit->path, @ordered_common );
 772         _update_positions_from_path( $wit->uncorrected_path, @ordered_common )
 773             if $wit->has_ante_corr;
 774     }
 775
 776     # DEBUG
 777     foreach my $r ( $self->readings() ) {
 778         print STDERR "Reading " . $r->name . "/" . $r->label . " has no position\n"
 779             unless( $r->has_position );
 780     }
 781
 782     $self->init_lemmata();
 783 }
 784
 785 sub _update_positions_from_path {
 786     my( $path, @ordered_common ) = @_;
 787
 788     # First we walk the given path, making a matrix for the witness
 789     # that corresponds to its eventual position identifier.  Common
 790     # nodes always start a new row, and are thus always in the first
 791     # column.
 792     my $wit_matrix = [];
 793     my $cn = 0;  # We should hit the common readings in order.
 794     my $row = [];
 795     foreach my $wn ( @{$path} ) {
 796         if( $wn eq $ordered_common[$cn] ) {
 797             # Set up to look for the next common node, and
 798             # start a new row of words.
 799             $cn++;
 800             push( @$wit_matrix, $row ) if scalar( @$row );
 801             $row = [];
 802         }
 803         push( @$row, $wn );
 804     }
 805     push( @$wit_matrix, $row );  # Push the last row onto the matrix
 806
 807     # Now we have a matrix per witness, so that each row in the
 808     # matrix begins with a common node, and continues with all the
 809     # variant words that appear in the witness.  We turn this into
 810     # real positions in row,cell format.  But we need some
 811     # trickery in order to make sure that each node gets assigned
 812     # to only one position.
 813
 814     foreach my $li ( 1..scalar(@$wit_matrix) ) {
 815         foreach my $di ( 1..scalar(@{$wit_matrix->[$li-1]}) ) {
 816             my $reading = $wit_matrix->[$li-1]->[$di-1];
 817             my $position = "$li,$di";
 818
 819             # If we have seen this node before, we need to compare
 820             # its position with what went before.
 821             unless( $reading->has_position &&
 822                     _cmp_position( $position, $reading->position ) < 1 ) {
 823                 # The new position ID replaces the old one.
 824                 $reading->position( $position );
 825             } # otherwise, the old position needs to stay.
 826         }
 827     }
 828 }
 829
 830 sub _cmp_position {
 831     my( $a, $b ) = @_;
 832     if ( $a && $b ) {
 833         my @pos_a = split(/,/, $a );
 834         my @pos_b = split(/,/, $b );
 835
 836         my $big_cmp = $pos_a[0] <=> $pos_b[0];
 837         return $big_cmp if $big_cmp;
 838         # else
 839         return $pos_a[1] <=> $pos_b[1];
 840     } elsif ( $b ) { # a is undefined
 841         return -1;
 842     } elsif ( $a ) { # b is undefined
 843         return 1;
 844     }
 845     return 0; # they are both undefined
 846 }
 847
 848 sub all_positions {
 849     my $self = shift;
 850     my %positions = ();
 851     map { $positions{$_->position} = 1 } $self->readings;
 852     my @answer = sort { _cmp_position( $a, $b ) } keys( %positions );
 853     return @answer;
 854 }
 855
 856 sub readings_at_position {
 857     my( $self, $pos ) = @_;
 858     my @answer = grep { $_->position eq $pos } $self->readings;
 859     return @answer;
 860 }
 861
 862 ## Lemmatizer functions
 863
 864 sub init_lemmata {
 865     my $self = shift;
 866
 867     foreach my $position ( $self->all_positions ) {
 868         $self->lemmata->{$position} = undef;
 869     }
 870
 871     foreach my $cr ( $self->common_readings ) {
 872         $self->lemmata->{$cr->position} = $cr->name;
 873     }
 874 }
 875
 876 =item B<lemma_readings>
 877
 878 my @state = $graph->lemma_readings( @readings_delemmatized );
 879
 880 Takes a list of readings that have just been delemmatized, and returns
 881 a set of tuples of the form ['reading', 'state'] that indicates what
 882 changes need to be made to the graph.
 883
 884 =over
 885
 886 =item *
 887
 888 A state of 1 means 'lemmatize this reading'
 889
 890 =item *
 891
 892 A state of 0 means 'delemmatize this reading'
 893
 894 =item *
 895
 896 A state of undef means 'an ellipsis belongs in the text here because
 897 no decision has been made / an earlier decision was backed out'
 898
 899 =back
 900
 901 =cut
 902
 903 sub lemma_readings {
 904     my( $self, @toggled_off_nodes ) = @_;
 905
 906     # First get the positions of those nodes which have been
 907     # toggled off.
 908     my $positions_off = {};
 909     map { $positions_off->{ $_->position } = $_->name } @toggled_off_nodes;
 910
 911     # Now for each position, we have to see if a node is on, and we
 912     # have to see if a node has been turned off.
 913     my @answer;
 914     foreach my $pos ( $self->all_positions() ) {
 915         # Find the state of this position.  If there is an active node,
 916         # its name will be the state; otherwise the state will be 0
 917         # (nothing at this position) or undef (ellipsis at this position)
 918         my $active = $self->lemmata->{$pos};
 919
 920         # Is there a formerly active node that was toggled off?
 921         if( exists( $positions_off->{$pos} ) ) {
 922             my $off_node = $positions_off->{$pos};
 923             if( $active && $active ne $off_node) {
 924                 push( @answer, [ $off_node, 0 ], [ $active, 1 ] );
 925             } else {
 926                 push( @answer, [ $off_node, $active ] );
 927             }
 928
 929         # No formerly active node, so we just see if there is a currently
 930         # active one.
 931         } elsif( $active ) {
 932             # Push the active node, whatever it is.
 933             push( @answer, [ $active, 1 ] );
 934         } else {
 935             # Push the state that is there. Arbitrarily use the first node
 936             # at that position.
 937             my @pos_nodes = $self->readings_at_position( $pos );
 938             push( @answer, [ $pos_nodes[0]->name, $self->lemmata->{$pos} ] );
 939         }
 940     }
 941
 942     return @answer;
 943 }
 944
 945 =item B<toggle_reading>
 946
 947 my @readings_delemmatized = $graph->toggle_reading( $reading_name );
 948
 949 Takes a reading node name, and either lemmatizes or de-lemmatizes
 950 it. Returns a list of all readings that are de-lemmatized as a result
 951 of the toggle.
 952
 953 =cut
 954
 955 sub toggle_reading {
 956     my( $self, $rname ) = @_;
 957
 958     return unless $rname;
 959     my $reading = $self->reading( $rname );
 960     if( !$reading || $reading->is_common() ) {
 961         # Do nothing, it's a common node.
 962         return;
 963     }
 964
 965     my $pos = $reading->position;
 966     my $old_state = $self->lemmata->{$pos};
 967     my @readings_off;
 968     if( $old_state && $old_state eq $rname ) {
 969         # Turn off the node. We turn on no others by default.
 970         push( @readings_off, $reading );
 971     } else {
 972         # Turn on the node.
 973         $self->lemmata->{$pos} = $rname;
 974         # Any other 'on' readings in the same position should be off.
 975         push( @readings_off, $self->same_position_as( $reading ) );
 976         # Any node that is an identical transposed one should be off.
 977         push( @readings_off, $reading->identical_readings );
 978     }
 979     @readings_off = unique_list( @readings_off );
 980
 981     # Turn off the readings that need to be turned off.
 982     my @readings_delemmatized;
 983     foreach my $n ( @readings_off ) {
 984         my $state = $self->lemmata->{$n->position};
 985         if( $state && $state eq $n->name ) {
 986             # this reading is still on, so turn it off
 987             push( @readings_delemmatized, $n );
 988             my $new_state = undef;
 989             if( $n eq $reading ) {
 990                 # This is the reading that was clicked, so if there are no
 991                 # other readings there, turn off the position.  In all other
 992                 # cases, restore the ellipsis.
 993                 my @other_n = $self->same_position_as( $n );
 994                 $new_state = 0 unless @other_n;
 995             }
 996             $self->lemmata->{$n->position} = $new_state;
 997         } elsif( $old_state && $old_state eq $n->name ) {
 998             # another reading has already been turned on here
 999             push( @readings_delemmatized, $n );
1000         } # else some other reading was on anyway, so pass.
1001     }
1002     return @readings_delemmatized;
1003 }
1004
1005 sub same_position_as {
1006     my( $self, $reading ) = @_;
1007     my $pos = $reading->position;
1008     my @same = grep { $_ ne $reading } $self->readings_at_position( $reading->position );
1009     return @same;
1010 }
1011
1012 # Return the string that joins together a list of witnesses for
1013 # display on a single path.
1014 sub path_label {
1015     my $self = shift;
1016     return join( $self->wit_list_separator, @_ );
1017 }
1018
1019 sub witnesses_of_label {
1020     my( $self, $label ) = @_;
1021     my $regex = $self->wit_list_separator;
1022     my @answer = split( /\Q$regex\E/, $label );
1023     return @answer;
1024 }
1025
1026 sub unique_list {
1027     my( @list ) = @_;
1028     my %h;
1029     map { $h{$_->name} = $_ } @list;
1030     return values( %h );
1031 }
1032
1033 sub add_hash_entry {
1034     my( $hash, $key, $entry ) = @_;
1035     if( exists $hash->{$key} ) {
1036         push( @{$hash->{$key}}, $entry );
1037     } else {
1038         $hash->{$key} = [ $entry ];
1039     }
1040 }
1041
1042 no Moose;
1043 __PACKAGE__->meta->make_immutable;