lib/Text/Tradition/Collation.pm

   1 package Text::Tradition::Collation;
   2
   3 use Graph::Easy;
   4 use IPC::Run qw( run binary );
   5 use Text::Tradition::Collation::Reading;
   6 use Moose;
   7
   8 has 'graph' => (
   9     is => 'ro',
  10     isa => 'Graph::Easy',
  11     handles => {
  12         add_reading => 'add_node',
  13         del_reading => 'del_node',
  14         add_path => 'add_edge',
  15         del_path => 'del_edge',
  16         reading => 'node',
  17         path => 'edge',
  18         readings => 'nodes',
  19         paths => 'edges',
  20     },
  21     default => sub { Graph::Easy->new( undirected => 0 ) },
  22     );
  23
  24
  25 has 'tradition' => (
  26     is => 'rw',
  27     isa => 'Text::Tradition',
  28     );
  29
  30 has 'svg' => (
  31     is => 'ro',
  32     isa => 'Str',
  33     writer => '_save_svg',
  34     predicate => 'has_svg',
  35     );
  36
  37 has 'graphviz' => (
  38     is => 'ro',
  39     isa => 'Str',
  40     writer => '_save_graphviz',
  41     predicate => 'has_graphviz',
  42     );
  43
  44 has 'graphml' => (
  45     is => 'ro',
  46     isa => 'XML::LibXML::Document',
  47     writer => '_save_graphml',
  48     predicate => 'has_graphml',
  49     );
  50
  51 # Keeps track of the lemmas within the collation.  At most one lemma
  52 # per position in the graph.
  53 has 'lemmata' => (
  54     is => 'ro',
  55     isa => 'HashRef[Maybe[Str]]',
  56     default => sub { {} },
  57     );
  58
  59 has 'wit_list_separator' => (
  60                              is => 'rw',
  61                              isa => 'Str',
  62                              default => ', ',
  63                              );
  64
  65 # The collation can be created two ways:
  66 # 1. Collate a set of witnesses (with CollateX I guess) and process
  67 #    the results as in 2.
  68 # 2. Read a pre-prepared collation in one of a variety of formats,
  69 #    and make the graph from that.
  70
  71 # The graph itself will (for now) be immutable, and the positions
  72 # within the graph will also be immutable.  We need to calculate those
  73 # positions upon graph construction.  The equivalences between graph
  74 # nodes will be mutable, entirely determined by the user (or possibly
  75 # by some semantic pre-processing provided by the user.)  So the
  76 # constructor should just make an empty equivalences object.  The
  77 # constructor will also need to make the witness objects, if we didn't
  78 # come through option 1.
  79
  80 sub BUILD {
  81     my( $self, $args ) = @_;
  82     $self->graph->use_class('node', 'Text::Tradition::Collation::Reading');
  83
  84     # Pass through any graph-specific options.
  85     my $shape = exists( $args->{'shape'} ) ? $args->{'shape'} : 'ellipse';
  86     $self->graph->set_attribute( 'node', 'shape', $shape );
  87 }
  88
  89 # Wrappes around merge_nodes
  90
  91 sub merge_readings {
  92     my $self = shift;
  93     my $first_node = shift;
  94     my $second_node = shift;
  95     $first_node->merge_from( $second_node );
  96     unshift( @_, $first_node, $second_node );
  97     return $self->graph->merge_nodes( @_ );
  98 }
  99
 100 =head2 Output method(s)
 101
 102 =over
 103
 104 =item B<as_svg>
 105
 106 print $graph->as_svg( $recalculate );
 107
 108 Returns an SVG string that represents the graph.  Uses GraphViz to do
 109 this, because Graph::Easy doesn\'t cope well with long graphs. Unless
 110 $recalculate is passed (and is a true value), the method will return a
 111 cached copy of the SVG after the first call to the method.
 112
 113 =cut
 114
 115 sub as_svg {
 116     my( $self, $recalc ) = @_;
 117     return $self->svg if $self->has_svg;
 118
 119     $self->_save_graphviz( $self->graph->as_graphviz() )
 120         unless( $self->has_graphviz && !$recalc );
 121
 122     my @cmd = qw/dot -Tsvg/;
 123     my( $svg, $err );
 124     my $in = $self->graphviz;
 125     run( \@cmd, \$in, ">", binary(), \$svg );
 126     $self->{'svg'} = $svg;
 127     return $svg;
 128 }
 129
 130 =item B<as_graphml>
 131
 132 print $graph->as_graphml( $recalculate )
 133
 134 Returns a GraphML representation of the collation graph, with
 135 transposition information and position information. Unless
 136 $recalculate is passed (and is a true value), the method will return a
 137 cached copy of the SVG after the first call to the method.
 138
 139 =cut
 140
 141 sub as_graphml {
 142     my( $self, $recalc ) = @_;
 143     return $self->graphml if $self->has_graphml;
 144
 145     # Some namespaces
 146     my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns';
 147     my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance';
 148     my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' .
 149         'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
 150
 151     # Create the document and root node
 152     my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
 153     my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
 154     $graphml->setDocumentElement( $root );
 155     $root->setNamespace( $xsi_ns, 'xsi', 0 );
 156     $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
 157
 158     # Add the data keys for nodes
 159     my @node_data = ( 'name', 'token', 'identical', 'position' );
 160     foreach my $ndi ( 0 .. $#node_data ) {
 161         my $key = $root->addNewChild( $graphml_ns, 'key' );
 162         $key->setAttribute( 'attr.name', $node_data[$ndi] );
 163         $key->setAttribute( 'attr.type', 'string' );
 164         $key->setAttribute( 'for', 'node' );
 165         $key->setAttribute( 'id', 'd'.$ndi );
 166     }
 167
 168     # Add the data keys for edges
 169     my %wit_hash;
 170     my $wit_ctr = 0;
 171     foreach my $wit ( $self->getWitnessList ) {
 172         my $wit_key = 'w' . $wit_ctr++;
 173         $wit_hash{$wit} = $wit_key;
 174         my $key = $root->addNewChild( $graphml_ns, 'key' );
 175         $key->setAttribute( 'attr.name', $wit );
 176         $key->setAttribute( 'attr.type', 'string' );
 177         $key->setAttribute( 'for', 'edge' );
 178         $key->setAttribute( 'id', $wit_key );
 179     }
 180
 181     # Add the graph, its nodes, and its edges
 182     my $graph = $root->addNewChild( $graphml_ns, 'graph' );
 183     $graph->setAttribute( 'edgedefault', 'directed' );
 184     $graph->setAttribute( 'id', 'g0' ); # TODO make this meaningful
 185     $graph->setAttribute( 'parse.edgeids', 'canonical' );
 186     $graph->setAttribute( 'parse.edges', $self->edges() );
 187     $graph->setAttribute( 'parse.nodeids', 'canonical' );
 188     $graph->setAttribute( 'parse.nodes', $self->nodes() );
 189     $graph->setAttribute( 'parse.order', 'nodesfirst' );
 190
 191     my $node_ctr = 0;
 192     my %node_hash;
 193     foreach my $n ( $self->readings ) {
 194         my %this_node_data = ();
 195         foreach my $ndi ( 0 .. $#node_data ) {
 196             my $value;
 197             $this_node_data{'d'.$ndi} = $n->name if $node_data[$ndi] eq 'name';
 198             $this_node_data{'d'.$ndi} = $n->label
 199                 if $node_data[$ndi] eq 'token';
 200             $this_node_data{'d'.$ndi} = $n->primary->name if $n->has_primary;
 201             $this_node_data{'d'.$ndi} =
 202                 $self->{'positions'}->node_position( $n )
 203                 if $node_data[$ndi] eq 'position';
 204         }
 205         my $node_el = $graph->addNewChild( $graphml_ns, 'node' );
 206         my $node_xmlid = 'n' . $node_ctr++;
 207         $node_hash{ $n->name } = $node_xmlid;
 208         $node_el->setAttribute( 'id', $node_xmlid );
 209
 210         foreach my $dk ( keys %this_node_data ) {
 211             my $d_el = $node_el->addNewChild( $graphml_ns, 'data' );
 212             $d_el->setAttribute( 'key', $dk );
 213             $d_el->appendTextChild( $this_node_data{$dk} );
 214         }
 215     }
 216
 217     foreach my $e ( $self->edges() ) {
 218         my( $name, $from, $to ) = ( $e->name,
 219                                     $node_hash{ $e->from()->name() },
 220                                     $node_hash{ $e->to()->name() } );
 221         my $edge_el = $graph->addNewChild( $graphml_ns, 'edge' );
 222         $edge_el->setAttribute( 'source', $from );
 223         $edge_el->setAttribute( 'target', $to );
 224         $edge_el->setAttribute( 'id', $name );
 225         # TODO Got to add the witnesses
 226     }
 227
 228     # Return the thing
 229     $self->_save_graphml( $graphml );
 230     return $graphml;
 231 }
 232
 233 =back
 234
 235 =head2 Navigation methods
 236
 237 =over
 238
 239 =item B<start>
 240
 241 my $beginning = $collation->start();
 242
 243 Returns the beginning of the collation, a meta-reading with label '#START#'.
 244
 245 =cut
 246
 247 sub start {
 248     # Return the beginning reading of the graph.
 249     my $self = shift;
 250     my( $new_start ) = @_;
 251     if( $new_start ) {
 252         $self->del_reading( '#START#' );
 253         $self->graph->rename_node( $new_start, '#START#' );
 254     }
 255     return $self->reading('#START#');
 256 }
 257
 258 =item B<next_reading>
 259
 260 my $next_reading = $graph->next_reading( $reading, $witpath );
 261
 262 Returns the reading that follows the given reading along the given witness
 263 path.  TODO These are badly named.
 264
 265 =cut
 266
 267 sub next_reading {
 268     # Return the successor via the corresponding edge.
 269     my $self = shift;
 270     return $self->_find_linked_reading( 'next', @_ );
 271 }
 272
 273 =item B<prior_reading>
 274
 275 my $prior_reading = $graph->prior_reading( $reading, $witpath );
 276
 277 Returns the reading that precedes the given reading along the given witness
 278 path.  TODO These are badly named.
 279
 280 =cut
 281
 282 sub prior_reading {
 283     # Return the predecessor via the corresponding edge.
 284     my $self = shift;
 285     return $self->_find_linked_reading( 'prior', @_ );
 286 }
 287
 288 sub _find_linked_reading {
 289     my( $self, $direction, $node, $edge ) = @_;
 290     $edge = 'base text' unless $edge;
 291     my @linked_edges = $direction eq 'next'
 292         ? $node->outgoing() : $node->incoming();
 293     return undef unless scalar( @linked_edges );
 294
 295     # We have to find the linked edge that contains all of the
 296     # witnesses supplied in $edge.
 297     my @edge_wits = $self->witnesses_of_label( $edge );
 298     foreach my $le ( @linked_edges ) {
 299         my @le_wits = $self->witnesses_of_label( $le->name );
 300         if( _is_within( \@edge_wits, \@le_wits ) ) {
 301             # This is the right edge.
 302             return $direction eq 'next' ? $le->to() : $le->from();
 303         }
 304     }
 305     warn "Could not find $direction node from " . $node->label
 306         . " along edge $edge";
 307     return undef;
 308 }
 309
 310 # Some set logic.
 311 sub _is_within {
 312     my( $set1, $set2 ) = @_;
 313     my $ret = 1;
 314     foreach my $el ( @$set1 ) {
 315         $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
 316     }
 317     return $ret;
 318 }
 319
 320
 321 ## INITIALIZATION METHODS - for use by parsers
 322 # Walk the paths for each witness in the graph, and return the nodes
 323 # that the graph has in common.
 324
 325 sub walk_witness_paths {
 326     my( $self, $end ) = @_;
 327     # For each witness, walk the path through the graph.
 328     # Then we need to find the common nodes.
 329     # TODO This method is going to fall down if we have a very gappy
 330     # text in the collation.
 331     my $paths = {};
 332     my @common_readings;
 333     foreach my $wit ( @{$self->tradition->witnesses} ) {
 334         my $curr_reading = $self->start;
 335         my @wit_path = ( $curr_reading );
 336         my %seen_readings;
 337         # TODO Detect loops at some point
 338         while( $curr_reading->name ne $end->name ) {
 339             if( $seen_readings{$curr_reading->name} ) {
 340                 warn "Detected loop walking path for witness " . $wit->sigil
 341                     . " at reading " . $curr_reading->name;
 342                 last;
 343             }
 344             my $next_reading = $self->next_reading( $curr_reading,
 345                                                     $wit->sigil );
 346             push( @wit_path, $next_reading );
 347             $seen_readings{$curr_reading->name} = 1;
 348             $curr_reading = $next_reading;
 349         }
 350         $wit->path( \@wit_path );
 351         if( @common_readings ) {
 352             my @cn;
 353             foreach my $n ( @wit_path ) {
 354                 push( @cn, $n ) if grep { $_ eq $n } @common_readings;
 355             }
 356             @common_readings = ();
 357             push( @common_readings, @cn );
 358         } else {
 359             push( @common_readings, @wit_path );
 360         }
 361     }
 362
 363     # Mark all the nodes as either common or not.
 364     foreach my $cn ( @common_readings ) {
 365         print STDERR "Setting " . $cn->name . " / " . $cn->label . " as common node\n";
 366         $cn->make_common;
 367     }
 368     foreach my $n ( $self->readings() ) {
 369         $n->make_variant unless $n->is_common;
 370     }
 371     # Return an array of the common nodes in order.
 372     return @common_readings;
 373 }
 374
 375 sub common_readings {
 376     my $self = shift;
 377     my @common = grep { $_->is_common } $self->readings();
 378     return sort { _cmp_position( $a->position, $b->position ) } @common;
 379 }
 380
 381 # Calculate the relative positions of nodes in the graph, if they
 382 # were not given to us.
 383 sub calculate_positions {
 384     my( $self, @ordered_common ) = @_;
 385
 386     # We have to calculate the position identifiers for each word,
 387     # keyed on the common nodes.  This will be 'fun'.  The end result
 388     # is a hash per witness, whose key is the word node and whose
 389     # value is its position in the text.  Common nodes are always N,1
 390     # so have identical positions in each text.
 391
 392     my $node_pos = {};
 393     foreach my $wit ( @{$self->tradition->witnesses} ) {
 394         # First we walk each path, making a matrix for each witness that
 395         # corresponds to its eventual position identifier.  Common nodes
 396         # always start a new row, and are thus always in the first column.
 397
 398         my $wit_matrix = [];
 399         my $cn = 0;  # We should hit the common readings in order.
 400         my $row = [];
 401         foreach my $wn ( @{$wit->path} ) {
 402             if( $wn eq $ordered_common[$cn] ) {
 403                 # Set up to look for the next common node, and
 404                 # start a new row of words.
 405                 $cn++;
 406                 push( @$wit_matrix, $row ) if scalar( @$row );
 407                 $row = [];
 408             }
 409             push( @$row, $wn );
 410         }
 411         push( @$wit_matrix, $row );  # Push the last row onto the matrix
 412
 413         # Now we have a matrix per witness, so that each row in the
 414         # matrix begins with a common node, and continues with all the
 415         # variant words that appear in the witness.  We turn this into
 416         # real positions in row,cell format.  But we need some
 417         # trickery in order to make sure that each node gets assigned
 418         # to only one position.
 419
 420         foreach my $li ( 1..scalar(@$wit_matrix) ) {
 421             foreach my $di ( 1..scalar(@{$wit_matrix->[$li-1]}) ) {
 422                 my $reading = $wit_matrix->[$li-1]->[$di-1];
 423                 my $position = "$li,$di";
 424                 # If we have seen this node before, we need to compare
 425                 # its position with what went before.
 426                 unless( $reading->has_position &&
 427                         _cmp_position( $position, $reading->position ) < 1 ) {
 428                     # The new position ID replaces the old one.
 429                     $reading->position( $position );
 430                 } # otherwise, the old position needs to stay.
 431             }
 432         }
 433     }
 434
 435     $self->init_lemmata();
 436 }
 437
 438 sub _cmp_position {
 439     my( $a, $b ) = @_;
 440     if ( $a && $b ) {
 441         my @pos_a = split(/,/, $a );
 442         my @pos_b = split(/,/, $b );
 443
 444         my $big_cmp = $pos_a[0] <=> $pos_b[0];
 445         return $big_cmp if $big_cmp;
 446         # else
 447         return $pos_a[1] <=> $pos_b[1];
 448     } elsif ( $b ) { # a is undefined
 449         return -1;
 450     } elsif ( $a ) { # b is undefined
 451         return 1;
 452     }
 453     return 0; # they are both undefined
 454 }
 455
 456 sub all_positions {
 457     my $self = shift;
 458     my %positions = ();
 459     map { $positions{$_->position} = 1 } $self->readings;
 460     my @answer = sort { _cmp_position( $a, $b ) } keys( %positions );
 461     return @answer;
 462 }
 463
 464 sub readings_at_position {
 465     my( $self, $pos ) = @_;
 466     my @answer = grep { $_->position eq $pos } $self->readings;
 467     return @answer;
 468 }
 469
 470 ## Lemmatizer functions
 471
 472 sub init_lemmata {
 473     my $self = shift;
 474
 475     foreach my $position ( $self->all_positions ) {
 476         $self->lemmata->{$position} = undef;
 477     }
 478
 479     foreach my $cr ( $self->common_readings ) {
 480         $self->lemmata->{$cr->position} = $cr->name;
 481     }
 482 }
 483
 484 =item B<lemma_readings>
 485
 486 my @state = $graph->lemma_readings( @readings_delemmatized );
 487
 488 Takes a list of readings that have just been delemmatized, and returns
 489 a set of tuples of the form ['reading', 'state'] that indicates what
 490 changes need to be made to the graph.
 491
 492 =over
 493
 494 =item *
 495
 496 A state of 1 means 'lemmatize this reading'
 497
 498 =item *
 499
 500 A state of 0 means 'delemmatize this reading'
 501
 502 =item *
 503
 504 A state of undef means 'an ellipsis belongs in the text here because
 505 no decision has been made / an earlier decision was backed out'
 506
 507 =back
 508
 509 =cut
 510
 511 sub lemma_readings {
 512     my( $self, @toggled_off_nodes ) = @_;
 513
 514     # First get the positions of those nodes which have been
 515     # toggled off.
 516     my $positions_off = {};
 517     map { $positions_off->{ $_->position } = $_->name } @toggled_off_nodes;
 518
 519     # Now for each position, we have to see if a node is on, and we
 520     # have to see if a node has been turned off.
 521     my @answer;
 522     foreach my $pos ( $self->all_positions() ) {
 523         # Find the state of this position.  If there is an active node,
 524         # its name will be the state; otherwise the state will be 0
 525         # (nothing at this position) or undef (ellipsis at this position)
 526         my $active = $self->lemmata->{$pos};
 527
 528         # Is there a formerly active node that was toggled off?
 529         if( exists( $positions_off->{$pos} ) ) {
 530             my $off_node = $positions_off->{$pos};
 531             if( $active && $active ne $off_node) {
 532                 push( @answer, [ $off_node, 0 ], [ $active, 1 ] );
 533             } else {
 534                 push( @answer, [ $off_node, $active ] );
 535             }
 536
 537         # No formerly active node, so we just see if there is a currently
 538         # active one.
 539         } elsif( $active ) {
 540             # Push the active node, whatever it is.
 541             push( @answer, [ $active, 1 ] );
 542         } else {
 543             # Push the state that is there. Arbitrarily use the first node
 544             # at that position.
 545             my @pos_nodes = $self->readings_at_position( $pos );
 546             push( @answer, [ $pos_nodes[0]->name, $self->lemmata->{$pos} ] );
 547         }
 548     }
 549
 550     return @answer;
 551 }
 552
 553 =item B<toggle_reading>
 554
 555 my @readings_delemmatized = $graph->toggle_reading( $reading_name );
 556
 557 Takes a reading node name, and either lemmatizes or de-lemmatizes
 558 it. Returns a list of all readings that are de-lemmatized as a result
 559 of the toggle.
 560
 561 =cut
 562
 563 sub toggle_reading {
 564     my( $self, $rname ) = @_;
 565
 566     return unless $rname;
 567     my $reading = $self->reading( $rname );
 568     if( !$reading || $reading->is_common() ) {
 569         # Do nothing, it's a common node.
 570         return;
 571     }
 572
 573     my $pos = $reading->position;
 574     my $old_state = $self->lemmata->{$pos};
 575     my @readings_off;
 576     if( $old_state && $old_state eq $rname ) {
 577         # Turn off the node. We turn on no others by default.
 578         push( @readings_off, $reading );
 579     } else {
 580         # Turn on the node.
 581         $self->lemmata->{$pos} = $rname;
 582         # Any other 'on' readings in the same position should be off.
 583         push( @readings_off, $self->same_position_as( $reading ) );
 584         # Any node that is an identical transposed one should be off.
 585         push( @readings_off, $reading->identical_readings );
 586     }
 587     @readings_off = unique_list( @readings_off );
 588
 589     # Turn off the readings that need to be turned off.
 590     my @readings_delemmatized;
 591     foreach my $n ( @readings_off ) {
 592         my $state = $self->lemmata->{$n->position};
 593         if( $state && $state eq $n->name ) {
 594             # this reading is still on, so turn it off
 595             push( @readings_delemmatized, $n );
 596             my $new_state = undef;
 597             if( $n eq $reading ) {
 598                 # This is the reading that was clicked, so if there are no
 599                 # other readings there, turn off the position.  In all other
 600                 # cases, restore the ellipsis.
 601                 my @other_n = $self->same_position_as( $n );
 602                 $new_state = 0 unless @other_n;
 603             }
 604             $self->lemmata->{$n->position} = $new_state;
 605         } elsif( $old_state && $old_state eq $n->name ) {
 606             # another reading has already been turned on here
 607             push( @readings_delemmatized, $n );
 608         } # else some other reading was on anyway, so pass.
 609     }
 610     return @readings_delemmatized;
 611 }
 612
 613 sub same_position_as {
 614     my( $self, $reading ) = @_;
 615     my $pos = $reading->position;
 616     my @same = grep { $_ ne $reading } $self->readings_at_position( $reading->position );
 617     return @same;
 618 }
 619
 620 # Return the string that joins together a list of witnesses for
 621 # display on a single path.
 622 sub path_label {
 623     my $self = shift;
 624     return join( $self->wit_list_separator, @_ );
 625 }
 626
 627 sub witnesses_of_label {
 628     my( $self, $label ) = @_;
 629     my $regex = $self->wit_list_separator;
 630     my @answer = split( /\Q$regex\E/, $label );
 631     return @answer;
 632 }
 633
 634 sub unique_list {
 635     my( @list ) = @_;
 636     my %h;
 637     map { $h{$_->name} = $_ } @list;
 638     return values( %h );
 639 }
 640
 641 no Moose;
 642 __PACKAGE__->meta->make_immutable;