base/lib/Text/Tradition/Collation.pm

   1 package Text::Tradition::Collation;
   2
   3 use feature 'say';
   4 use Encode qw( decode_utf8 );
   5 use File::Temp;
   6 use File::Which;
   7 use Graph;
   8 use IPC::Run qw( run binary );
   9 use Text::CSV;
  10 use Text::Tradition::Collation::Data;
  11 use Text::Tradition::Collation::Reading;
  12 use Text::Tradition::Collation::RelationshipStore;
  13 use Text::Tradition::Error;
  14 use XML::Easy::Syntax qw( $xml10_namestartchar_rx $xml10_namechar_rx );
  15 use XML::LibXML;
  16 use XML::LibXML::XPathContext;
  17 use Moose;
  18
  19 has _data => (
  20         isa      => 'Text::Tradition::Collation::Data',
  21         is       => 'ro',
  22         required => 1,
  23         handles  => [ qw(
  24                 sequence
  25                 paths
  26                 _set_relations
  27                 relations
  28                 _set_start
  29                 _set_end
  30                 ac_label
  31                 has_cached_table
  32                 relationships
  33                 related_readings
  34                 get_relationship
  35                 del_relationship
  36                 equivalence
  37                 equivalence_graph
  38                 readings
  39                 reading
  40                 _add_reading
  41                 del_reading
  42                 has_reading
  43                 wit_list_separator
  44                 baselabel
  45                 linear
  46                 wordsep
  47                 start
  48                 end
  49                 cached_table
  50                 _graphcalc_done
  51                 has_cached_svg
  52                 wipe_table
  53         )]
  54 );
  55
  56 has 'tradition' => (
  57     is => 'ro',
  58     isa => 'Text::Tradition',
  59     writer => '_set_tradition',
  60     weak_ref => 1,
  61     );
  62
  63 =head1 NAME
  64
  65 Text::Tradition::Collation - a software model for a text collation
  66
  67 =head1 SYNOPSIS
  68
  69   use Text::Tradition;
  70   my $t = Text::Tradition->new(
  71     'name' => 'this is a text',
  72     'input' => 'TEI',
  73     'file' => '/path/to/tei_parallel_seg_file.xml' );
  74
  75   my $c = $t->collation;
  76   my @readings = $c->readings;
  77   my @paths = $c->paths;
  78   my @relationships = $c->relationships;
  79
  80   my $svg_variant_graph = $t->collation->as_svg();
  81
  82 =head1 DESCRIPTION
  83
  84 Text::Tradition is a library for representation and analysis of collated
  85 texts, particularly medieval ones.  The Collation is the central feature of
  86 a Tradition, where the text, its sequence of readings, and its relationships
  87 between readings are actually kept.
  88
  89 =head1 CONSTRUCTOR
  90
  91 =head2 new
  92
  93 The constructor.  Takes a hash or hashref of the following arguments:
  94
  95 =over
  96
  97 =item * tradition - The Text::Tradition object to which the collation
  98 belongs. Required.
  99
 100 =item * linear - Whether the collation should be linear; that is, whether
 101 transposed readings should be treated as two linked readings rather than one,
 102 and therefore whether the collation graph is acyclic.  Defaults to true.
 103
 104 =item * baselabel - The default label for the path taken by a base text
 105 (if any). Defaults to 'base text'.
 106
 107 =item * wit_list_separator - The string to join a list of witnesses for
 108 purposes of making labels in display graphs.  Defaults to ', '.
 109
 110 =item * ac_label - The extra label to tack onto a witness sigil when
 111 representing another layer of path for the given witness - that is, when
 112 a text has more than one possible reading due to scribal corrections or
 113 the like.  Defaults to ' (a.c.)'.
 114
 115 =item * wordsep - The string used to separate words in the original text.
 116 Defaults to ' '.
 117
 118 =back
 119
 120 =head1 ACCESSORS
 121
 122 =head2 tradition
 123
 124 =head2 linear
 125
 126 =head2 wit_list_separator
 127
 128 =head2 baselabel
 129
 130 =head2 ac_label
 131
 132 =head2 wordsep
 133
 134 Simple accessors for collation attributes.
 135
 136 =head2 start
 137
 138 The meta-reading at the start of every witness path.
 139
 140 =head2 end
 141
 142 The meta-reading at the end of every witness path.
 143
 144 =head2 readings
 145
 146 Returns all Reading objects in the graph.
 147
 148 =head2 reading( $id )
 149
 150 Returns the Reading object corresponding to the given ID.
 151
 152 =head2 add_reading( $reading_args )
 153
 154 Adds a new reading object to the collation.
 155 See L<Text::Tradition::Collation::Reading> for the available arguments.
 156
 157 =head2 del_reading( $object_or_id )
 158
 159 Removes the given reading from the collation, implicitly removing its
 160 paths and relationships.
 161
 162 =head2 has_reading( $id )
 163
 164 Predicate to see whether a given reading ID is in the graph.
 165
 166 =head2 reading_witnesses( $object_or_id )
 167
 168 Returns a list of sigils whose witnesses contain the reading.
 169
 170 =head2 paths
 171
 172 Returns all reading paths within the document - that is, all edges in the
 173 collation graph.  Each path is an arrayref of [ $source, $target ] reading IDs.
 174
 175 =head2 add_path( $source, $target, $sigil )
 176
 177 Links the given readings in the collation in sequence, under the given witness
 178 sigil.  The readings may be specified by object or ID.
 179
 180 =head2 del_path( $source, $target, $sigil )
 181
 182 Links the given readings in the collation in sequence, under the given witness
 183 sigil.  The readings may be specified by object or ID.
 184
 185 =head2 has_path( $source, $target );
 186
 187 Returns true if the two readings are linked in sequence in any witness.
 188 The readings may be specified by object or ID.
 189
 190 =head2 relationships
 191
 192 Returns all Relationship objects in the collation.
 193
 194 =head2 add_relationship( $reading, $other_reading, $options )
 195
 196 Adds a new relationship of the type given in $options between the two readings,
 197 which may be specified by object or ID.  Returns a value of ( $status, @vectors)
 198 where $status is true on success, and @vectors is a list of relationship edges
 199 that were ultimately added.
 200 See L<Text::Tradition::Collation::Relationship> for the available options.
 201
 202 =cut
 203
 204 sub BUILDARGS {
 205         my ( $class, @args ) = @_;
 206         my %args = @args == 1 ? %{ $args[0] } : @args;
 207         # TODO determine these from the Moose::Meta object
 208         my @delegate_attrs = qw(sequence relations readings wit_list_separator baselabel
 209                 linear wordsep start end cached_table _graphcalc_done);
 210         my %data_args;
 211         for my $attr (@delegate_attrs) {
 212                 $data_args{$attr} = delete $args{$attr} if exists $args{$attr};
 213         }
 214         $args{_data} = Text::Tradition::Collation::Data->new(%data_args);
 215         return \%args;
 216 }
 217
 218 sub BUILD {
 219     my $self = shift;
 220     $self->_set_relations( Text::Tradition::Collation::RelationshipStore->new( 'collation' => $self ) );
 221     $self->_set_start( $self->add_reading(
 222         { 'collation' => $self, 'is_start' => 1, 'init' => 1 } ) );
 223     $self->_set_end( $self->add_reading(
 224         { 'collation' => $self, 'is_end' => 1, 'init' => 1 } ) );
 225 }
 226
 227 sub register_relationship_type {
 228         my $self = shift;
 229         my %args = @_ == 1 ? %{$_[0]} : @_;
 230         if( $self->relations->has_type( $args{name} ) ) {
 231                 throw( 'Relationship type ' . $args{name} . ' already registered' );
 232         }
 233         $self->relations->add_type( %args );
 234 }
 235
 236 ### Reading construct/destruct functions
 237
 238 sub add_reading {
 239         my( $self, $reading ) = @_;
 240         unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
 241                 my %args = %$reading;
 242                 if( $args{'init'} ) {
 243                         # If we are initializing an empty collation, don't assume that we
 244                         # have set a tradition.
 245                         delete $args{'init'};
 246                 } elsif( $self->tradition->can('language') && $self->tradition->has_language
 247                         && !exists $args{'language'} ) {
 248                         $args{'language'} = $self->tradition->language;
 249                 }
 250                 $reading = Text::Tradition::Collation::Reading->new(
 251                         'collation' => $self,
 252                         %args );
 253         }
 254         # First check to see if a reading with this ID exists.
 255         if( $self->reading( $reading->id ) ) {
 256                 throw( "Collation already has a reading with id " . $reading->id );
 257         }
 258         $self->_graphcalc_done(0);
 259         $self->_add_reading( $reading->id => $reading );
 260         # Once the reading has been added, put it in both graphs.
 261         $self->sequence->add_vertex( $reading->id );
 262         $self->relations->add_reading( $reading->id );
 263         return $reading;
 264 };
 265
 266 around del_reading => sub {
 267         my $orig = shift;
 268         my $self = shift;
 269         my $arg = shift;
 270
 271         if( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
 272                 $arg = $arg->id;
 273         }
 274         # Remove the reading from the graphs.
 275         $self->_graphcalc_done(0);
 276         $self->_clear_cache; # Explicitly clear caches to GC the reading
 277         $self->sequence->delete_vertex( $arg );
 278         $self->relations->delete_reading( $arg );
 279
 280         # Carry on.
 281         $self->$orig( $arg );
 282 };
 283
 284 =head2 merge_readings( $main, $second, $concatenate, $with_str )
 285
 286 Merges the $second reading into the $main one. If $concatenate is true, then
 287 the merged node will carry the text of both readings, concatenated with either
 288 $with_str (if specified) or a sensible default (the empty string if the
 289 appropriate 'join_*' flag is set on either reading, or else $self->wordsep.)
 290
 291 The first two arguments may be either readings or reading IDs.
 292
 293 =begin testing
 294
 295 use Text::Tradition;
 296
 297 my $cxfile = 't/data/Collatex-16.xml';
 298 my $t = Text::Tradition->new(
 299     'name'  => 'inline',
 300     'input' => 'CollateX',
 301     'file'  => $cxfile,
 302     );
 303 my $c = $t->collation;
 304
 305 my $rno = scalar $c->readings;
 306 # Split n21 ('unto') for testing purposes
 307 my $new_r = $c->add_reading( { 'id' => 'n21p0', 'text' => 'un', 'join_next' => 1 } );
 308 my $old_r = $c->reading( 'n21' );
 309 $old_r->alter_text( 'to' );
 310 $c->del_path( 'n20', 'n21', 'A' );
 311 $c->add_path( 'n20', 'n21p0', 'A' );
 312 $c->add_path( 'n21p0', 'n21', 'A' );
 313 $c->add_relationship( 'n21', 'n22', { type => 'collated', scope => 'local' } );
 314 $c->flatten_ranks();
 315 ok( $c->reading( 'n21p0' ), "New reading exists" );
 316 is( scalar $c->readings, $rno, "Reading add offset by flatten_ranks" );
 317
 318 # Combine n3 and n4 ( with his )
 319 $c->merge_readings( 'n3', 'n4', 1 );
 320 ok( !$c->reading('n4'), "Reading n4 is gone" );
 321 is( $c->reading('n3')->text, 'with his', "Reading n3 has both words" );
 322
 323 # Collapse n9 and n10 ( rood / root )
 324 $c->merge_readings( 'n9', 'n10' );
 325 ok( !$c->reading('n10'), "Reading n10 is gone" );
 326 is( $c->reading('n9')->text, 'rood', "Reading n9 has an unchanged word" );
 327
 328 # Combine n21 and n21p0
 329 my $remaining = $c->reading('n21');
 330 $remaining ||= $c->reading('n22');  # one of these should still exist
 331 $c->merge_readings( 'n21p0', $remaining, 1 );
 332 ok( !$c->reading('n21'), "Reading $remaining is gone" );
 333 is( $c->reading('n21p0')->text, 'unto', "Reading n21p0 merged correctly" );
 334
 335 =end testing
 336
 337 =cut
 338
 339 sub merge_readings {
 340         my $self = shift;
 341
 342         # Sanity check
 343         my( $kept_obj, $del_obj, $combine, $combine_char ) = $self->_objectify_args( @_ );
 344         my $mergemeta = $kept_obj->is_meta;
 345         throw( "Cannot merge meta and non-meta reading" )
 346                 unless ( $mergemeta && $del_obj->is_meta )
 347                         || ( !$mergemeta && !$del_obj->is_meta );
 348         if( $mergemeta ) {
 349                 throw( "Cannot merge with start or end node" )
 350                         if( $kept_obj eq $self->start || $kept_obj eq $self->end
 351                                 || $del_obj eq $self->start || $del_obj eq $self->end );
 352                 throw( "Cannot combine text of meta readings" ) if $combine;
 353         }
 354         # We only need the IDs for adding paths to the graph, not the reading
 355         # objects themselves.
 356         my $kept = $kept_obj->id;
 357         my $deleted = $del_obj->id;
 358         $self->_graphcalc_done(0);
 359
 360     # The kept reading should inherit the paths and the relationships
 361     # of the deleted reading.
 362         foreach my $path ( $self->sequence->edges_at( $deleted ) ) {
 363                 my @vector = ( $kept );
 364                 push( @vector, $path->[1] ) if $path->[0] eq $deleted;
 365                 unshift( @vector, $path->[0] ) if $path->[1] eq $deleted;
 366                 next if $vector[0] eq $vector[1]; # Don't add a self loop
 367                 my %wits = %{$self->sequence->get_edge_attributes( @$path )};
 368                 $self->sequence->add_edge( @vector );
 369                 my $fwits = $self->sequence->get_edge_attributes( @vector );
 370                 @wits{keys %$fwits} = values %$fwits;
 371                 $self->sequence->set_edge_attributes( @vector, \%wits );
 372         }
 373         $self->relations->merge_readings( $kept, $deleted, $combine );
 374
 375         # Do the deletion deed.
 376         if( $combine ) {
 377                 # Combine the text of the readings
 378                 my $joinstr = $combine_char;
 379                 unless( defined $joinstr ) {
 380                         $joinstr = '' if $kept_obj->join_next || $del_obj->join_prior;
 381                         $joinstr = $self->wordsep unless defined $joinstr;
 382                 }
 383                 $kept_obj->_combine( $del_obj, $joinstr );
 384         }
 385         $self->del_reading( $deleted );
 386 }
 387
 388 =head2 compress_readings
 389
 390 Where possible in the graph, compresses plain sequences of readings into a
 391 single reading. The sequences must consist of readings with no
 392 relationships to other readings, with only a single witness path between
 393 them and no other witness paths from either that would skip the other. The
 394 readings must also not be marked as nonsense or bad grammar.
 395
 396 WARNING: This operation cannot be undone.
 397
 398 =cut
 399
 400 sub compress_readings {
 401         my $self = shift;
 402         # Anywhere in the graph that there is a reading that joins only to a single
 403         # successor, and neither of these have any relationships, just join the two
 404         # readings.
 405         foreach my $rdg ( sort { $a->rank <=> $b->rank } $self->readings ) {
 406                 # Now look for readings that can be joined to their successors.
 407                 next unless $rdg->is_combinable;
 408                 my %seen;
 409                 while( $self->sequence->successors( $rdg ) == 1 ) {
 410                         my( $next ) = $self->reading( $self->sequence->successors( $rdg ) );
 411                         throw( "Infinite loop" ) if $seen{$next->id};
 412                         $seen{$next->id} = 1;
 413                         last if $self->sequence->predecessors( $next ) > 1;
 414                         last unless $next->is_combinable;
 415                         say "Joining readings $rdg and $next";
 416                         $self->merge_readings( $rdg, $next, 1 );
 417                 }
 418         }
 419         # Make sure we haven't screwed anything up
 420         foreach my $wit ( $self->tradition->witnesses ) {
 421                 my $pathtext = $self->path_text( $wit->sigil );
 422                 my $origtext = join( ' ', @{$wit->text} );
 423                 throw( "Text differs for witness " . $wit->sigil )
 424                         unless $pathtext eq $origtext;
 425                 if( $wit->is_layered ) {
 426                         $pathtext = $self->path_text( $wit->sigil.$self->ac_label );
 427                         $origtext = join( ' ', @{$wit->layertext} );
 428                         throw( "Ante-corr text differs for witness " . $wit->sigil )
 429                                 unless $pathtext eq $origtext;
 430                 }
 431         }
 432
 433         $self->relations->rebuild_equivalence();
 434         $self->calculate_ranks();
 435 }
 436
 437 # Helper function for manipulating the graph.
 438 sub _stringify_args {
 439         my( $self, $first, $second, @args ) = @_;
 440     $first = $first->id
 441         if ref( $first ) eq 'Text::Tradition::Collation::Reading';
 442     $second = $second->id
 443         if ref( $second ) eq 'Text::Tradition::Collation::Reading';
 444     return( $first, $second, @args );
 445 }
 446
 447 # Helper function for manipulating the graph.
 448 sub _objectify_args {
 449         my( $self, $first, $second, $arg ) = @_;
 450     $first = $self->reading( $first )
 451         unless ref( $first ) eq 'Text::Tradition::Collation::Reading';
 452     $second = $self->reading( $second )
 453         unless ref( $second ) eq 'Text::Tradition::Collation::Reading';
 454     return( $first, $second, $arg );
 455 }
 456
 457 =head2 duplicate_reading( $reading, @witlist )
 458
 459 Split the given reading into two, so that the new reading is in the path for
 460 the witnesses given in @witlist. If the result is that certain non-colocated
 461 relationships (e.g. transpositions) are no longer valid, these will be removed.
 462 Returns the newly-created reading.
 463
 464 =begin testing
 465
 466 use Text::Tradition;
 467
 468 my $st = Text::Tradition->new( 'input' => 'Self', 'file' => 't/data/collatecorr.xml' );
 469 is( ref( $st ), 'Text::Tradition', "Got a tradition from test file" );
 470 ok( $st->has_witness('Ba96'), "Tradition has the affected witness" );
 471
 472 my $sc = $st->collation;
 473 my $numr = 17;
 474 ok( $sc->reading('n131'), "Tradition has the affected reading" );
 475 is( scalar( $sc->readings ), $numr, "There are $numr readings in the graph" );
 476 is( $sc->end->rank, 14, "There are fourteen ranks in the graph" );
 477
 478 # Detach the erroneously collated reading
 479 my $newr = $sc->duplicate_reading( 'n131', 'Ba96' );
 480 ok( $newr, "New reading was created" );
 481 ok( $sc->reading('n131_0'), "Detached the bad collation with a new reading" );
 482 is( scalar( $sc->readings ), $numr + 1, "A reading was added to the graph" );
 483 is( $sc->end->rank, 10, "There are now only ten ranks in the graph" );
 484 my $csucc = $sc->common_successor( 'n131', 'n131_0' );
 485 is( $csucc->id, 'n136', "Found correct common successor to duped reading" );
 486
 487 # Check that the bad transposition is gone
 488 is( $sc->get_relationship( 'n130', 'n135' ), undef, "Bad transposition relationship is gone" );
 489
 490 # The collation should not be fixed
 491 my @pairs = $sc->identical_readings();
 492 is( scalar @pairs, 0, "Not re-collated yet" );
 493 # Fix the collation
 494 ok( $sc->merge_readings( 'n124', 'n131_0' ), "Collated the readings correctly" );
 495 @pairs = $sc->identical_readings( start => 'n124', end => $csucc->id );
 496 is( scalar @pairs, 3, "Found three more identical readings" );
 497 is( $sc->end->rank, 11, "The ranks shifted appropriately" );
 498 $sc->flatten_ranks();
 499 is( scalar( $sc->readings ), $numr - 3, "Now we are collated correctly" );
 500
 501 =end testing
 502
 503 =cut
 504
 505 sub duplicate_reading {
 506         my( $self, $r, @wits ) = @_;
 507         # Add the new reading, duplicating $r.
 508         unless( ref( $r ) eq 'Text::Tradition::Collation::Reading' ) {
 509                 $r = $self->reading( $r );
 510         }
 511         throw( "Cannot duplicate a meta-reading" )
 512                 if $r->is_meta;
 513
 514         # Get all the reading attributes and duplicate them.
 515         my $rmeta = Text::Tradition::Collation::Reading->meta;
 516         my %args;
 517     foreach my $attr( $rmeta->get_all_attributes ) {
 518                 next if $attr->name =~ /^_/;
 519                 my $acc = $attr->get_read_method;
 520                 if( !$acc && $attr->has_applied_traits ) {
 521                         my $tr = $attr->applied_traits;
 522                         if( $tr->[0] =~ /::(Array|Hash)$/ ) {
 523                                 my $which = $1;
 524                                 my %methods = reverse %{$attr->handles};
 525                                 $acc = $methods{elements};
 526                                 $args{$attr->name} = $which eq 'Array'
 527                                         ? [ $r->$acc ] : { $r->$acc };
 528                         }
 529                 } else {
 530                         $args{$attr->name} = $r->$acc if $acc;
 531                 }
 532         }
 533         # By definition the new reading will no longer be common.
 534         $args{is_common} = 0;
 535         # The new reading also needs its own ID.
 536         $args{id} = $self->_generate_dup_id( $r->id );
 537
 538         # Try to make the new reading.
 539         my $newr = $self->add_reading( \%args );
 540         # The old reading is also no longer common.
 541         $r->is_common( 0 );
 542
 543         # For each of the witnesses, dissociate from the old reading and
 544         # associate with the new.
 545         foreach my $wit ( @wits ) {
 546                 my $prior = $self->prior_reading( $r, $wit );
 547                 my $next = $self->next_reading( $r, $wit );
 548                 $self->del_path( $prior, $r, $wit );
 549                 $self->add_path( $prior, $newr, $wit );
 550                 $self->del_path( $r, $next, $wit );
 551                 $self->add_path( $newr, $next, $wit );
 552         }
 553
 554         # If the graph is ranked, we need to look for relationships that are now
 555         # invalid (i.e. 'non-colocation' types that might now be colocated) and
 556         # remove them. If not, we can skip it.
 557         my $succ;
 558         my %rrk;
 559         if( $self->end->has_rank ) {
 560                 # Find the point where we can stop checking
 561                 $succ = $self->common_successor( $r, $newr );
 562
 563                 # Hash the existing ranks
 564                 foreach my $rdg ( $self->readings ) {
 565                         $rrk{$rdg->id} = $rdg->rank;
 566                 }
 567                 # Calculate the new ranks
 568                 $self->calculate_ranks();
 569
 570                 # Check for invalid non-colocated relationships among changed-rank readings
 571                 # from where the ranks start changing up to $succ
 572                 my $lastrank = $succ->rank;
 573                 foreach my $rdg ( $self->readings ) {
 574                         next if $rdg->rank > $lastrank;
 575                         next if $rdg->rank == $rrk{$rdg->id};
 576                         my @noncolo = $rdg->related_readings( sub { !$_[0]->colocated } );
 577                         next unless @noncolo;
 578                         foreach my $nc ( @noncolo ) {
 579                                 $self->relations->verify_or_delete( $rdg, $nc );
 580                         }
 581                 }
 582         }
 583         return $newr;
 584 }
 585
 586 sub _generate_dup_id {
 587         my( $self, $rid ) = @_;
 588         my $newid;
 589         my $i = 0;
 590         while( !$newid ) {
 591                 $newid = $rid."_$i";
 592                 if( $self->has_reading( $newid ) ) {
 593                         $newid = '';
 594                         $i++;
 595                 }
 596         }
 597         return $newid;
 598 }
 599
 600 ### Path logic
 601
 602 sub add_path {
 603         my $self = shift;
 604
 605         # We only need the IDs for adding paths to the graph, not the reading
 606         # objects themselves.
 607     my( $source, $target, $wit ) = $self->_stringify_args( @_ );
 608
 609         $self->_graphcalc_done(0);
 610         # Connect the readings
 611         unless( $self->sequence->has_edge( $source, $target ) ) {
 612             $self->sequence->add_edge( $source, $target );
 613             $self->relations->add_equivalence_edge( $source, $target );
 614         }
 615     # Note the witness in question
 616     $self->sequence->set_edge_attribute( $source, $target, $wit, 1 );
 617 }
 618
 619 sub del_path {
 620         my $self = shift;
 621         my @args;
 622         if( ref( $_[0] ) eq 'ARRAY' ) {
 623                 my $e = shift @_;
 624                 @args = ( @$e, @_ );
 625         } else {
 626                 @args = @_;
 627         }
 628
 629         # We only need the IDs for removing paths from the graph, not the reading
 630         # objects themselves.
 631     my( $source, $target, $wit ) = $self->_stringify_args( @args );
 632
 633         $self->_graphcalc_done(0);
 634         if( $self->sequence->has_edge_attribute( $source, $target, $wit ) ) {
 635                 $self->sequence->delete_edge_attribute( $source, $target, $wit );
 636         }
 637         unless( $self->sequence->has_edge_attributes( $source, $target ) ) {
 638                 $self->sequence->delete_edge( $source, $target );
 639                 $self->relations->delete_equivalence_edge( $source, $target );
 640         }
 641 }
 642
 643
 644 # Extra graph-alike utility
 645 sub has_path {
 646         my $self = shift;
 647     my( $source, $target, $wit ) = $self->_stringify_args( @_ );
 648         return undef unless $self->sequence->has_edge( $source, $target );
 649         return $self->sequence->has_edge_attribute( $source, $target, $wit );
 650 }
 651
 652 =head2 clear_witness( @sigil_list )
 653
 654 Clear the given witnesses out of the collation entirely, removing references
 655 to them in paths, and removing readings that belong only to them.  Should only
 656 be called via $tradition->del_witness.
 657
 658 =cut
 659
 660 sub clear_witness {
 661         my( $self, @sigils ) = @_;
 662
 663         $self->_graphcalc_done(0);
 664         # Clear the witness(es) out of the paths
 665         foreach my $e ( $self->paths ) {
 666                 foreach my $sig ( @sigils ) {
 667                         $self->del_path( $e, $sig );
 668                 }
 669         }
 670
 671         # Clear out the newly unused readings
 672         foreach my $r ( $self->readings ) {
 673                 unless( $self->reading_witnesses( $r ) ) {
 674                         $self->del_reading( $r );
 675                 }
 676         }
 677 }
 678
 679 sub add_relationship {
 680         my $self = shift;
 681     my( $source, $target, $opts ) = $self->_stringify_args( @_ );
 682     my( @vectors ) = $self->relations->add_relationship( $source, $target, $opts );
 683     foreach my $v ( @vectors ) {
 684         next unless $self->get_relationship( $v )->colocated;
 685         if( $self->reading( $v->[0] )->has_rank && $self->reading( $v->[1] )->has_rank
 686                 && $self->reading( $v->[0] )->rank ne $self->reading( $v->[1] )->rank ) {
 687                         $self->_graphcalc_done(0);
 688                         $self->_clear_cache;
 689                         last;
 690         }
 691     }
 692     return @vectors;
 693 }
 694
 695 around qw/ get_relationship del_relationship / => sub {
 696         my $orig = shift;
 697         my $self = shift;
 698         my @args = @_;
 699         if( @args == 1 && ref( $args[0] ) eq 'ARRAY' ) {
 700                 @args = @{$_[0]};
 701         }
 702         my @stringargs = $self->_stringify_args( @args );
 703         $self->$orig( @stringargs );
 704 };
 705
 706 =head2 reading_witnesses( $reading )
 707
 708 Return a list of sigils corresponding to the witnesses in which the reading appears.
 709
 710 =cut
 711
 712 sub reading_witnesses {
 713         my( $self, $reading ) = @_;
 714         # We need only check either the incoming or the outgoing edges; I have
 715         # arbitrarily chosen "incoming".  Thus, special-case the start node.
 716         if( $reading eq $self->start ) {
 717                 return map { $_->sigil } grep { $_->is_collated } $self->tradition->witnesses;
 718         }
 719         my %all_witnesses;
 720         foreach my $e ( $self->sequence->edges_to( $reading ) ) {
 721                 my $wits = $self->sequence->get_edge_attributes( @$e );
 722                 @all_witnesses{ keys %$wits } = 1;
 723         }
 724         my $acstr = $self->ac_label;
 725         foreach my $acwit ( grep { $_ =~ s/^(.*)\Q$acstr\E$/$1/ } keys %all_witnesses ) {
 726                 delete $all_witnesses{$acwit.$acstr} if exists $all_witnesses{$acwit};
 727         }
 728         return keys %all_witnesses;
 729 }
 730
 731 =head1 OUTPUT METHODS
 732
 733 =head2 as_svg( \%options )
 734
 735 Returns an SVG string that represents the graph, via as_dot and graphviz.
 736 See as_dot for a list of options.  Must have GraphViz (dot) installed to run.
 737
 738 =cut
 739
 740 sub as_svg {
 741     my( $self, $opts ) = @_;
 742     throw( "Need GraphViz installed to output SVG" )
 743         unless File::Which::which( 'dot' );
 744     my $want_subgraph = exists $opts->{'from'} || exists $opts->{'to'};
 745     $self->calculate_ranks()
 746         unless( $self->_graphcalc_done || $opts->{'nocalc'} || !$self->linear );
 747         my @cmd = qw/dot -Tsvg/;
 748         my( $svg, $err );
 749         my $dotfile = File::Temp->new();
 750         ## USE FOR DEBUGGING
 751         # $dotfile->unlink_on_destroy(0);
 752         binmode $dotfile, ':utf8';
 753         print $dotfile $self->as_dot( $opts );
 754         push( @cmd, $dotfile->filename );
 755         run( \@cmd, ">", binary(), \$svg );
 756         $svg = decode_utf8( $svg );
 757         return $svg;
 758 }
 759
 760
 761 =head2 as_dot( \%options )
 762
 763 Returns a string that is the collation graph expressed in dot
 764 (i.e. GraphViz) format.  Options include:
 765
 766 =over 4
 767
 768 =item * from
 769
 770 =item * to
 771
 772 =item * color_common
 773
 774 =back
 775
 776 =cut
 777
 778 sub as_dot {
 779     my( $self, $opts ) = @_;
 780     my $startrank = $opts->{'from'} if $opts;
 781     my $endrank = $opts->{'to'} if $opts;
 782     my $color_common = $opts->{'color_common'} if $opts;
 783     my $STRAIGHTENHACK = !$startrank && !$endrank && $self->end->rank
 784        && $self->end->rank > 100;
 785     $STRAIGHTENHACK = 1 if $opts->{'straight'}; # even for subgraphs or small graphs
 786
 787     # Check the arguments
 788     if( $startrank ) {
 789         return if $endrank && $startrank > $endrank;
 790         return if $startrank > $self->end->rank;
 791         }
 792         if( defined $endrank ) {
 793                 return if $endrank < 0;
 794                 $endrank = undef if $endrank == $self->end->rank;
 795         }
 796
 797     my $graph_name = $self->tradition->name;
 798     $graph_name =~ s/[^\w\s]//g;
 799     $graph_name = join( '_', split( /\s+/, $graph_name ) );
 800
 801     my %graph_attrs = (
 802         'rankdir' => 'LR',
 803         'bgcolor' => 'none',
 804         );
 805     my %node_attrs = (
 806         'fontsize' => 14,
 807         'fillcolor' => 'white',
 808         'style' => 'filled',
 809         'shape' => 'ellipse'
 810         );
 811     my %edge_attrs = (
 812         'arrowhead' => 'open',
 813         'color' => '#000000',
 814         'fontcolor' => '#000000',
 815         );
 816
 817     my $dot = sprintf( "digraph %s {\n", $graph_name );
 818     $dot .= "\tgraph " . _dot_attr_string( \%graph_attrs ) . ";\n";
 819     $dot .= "\tnode " . _dot_attr_string( \%node_attrs ) . ";\n";
 820
 821         # Output substitute start/end readings if necessary
 822         if( $startrank ) {
 823                 $dot .= "\t\"__SUBSTART__\" [ label=\"...\",id=\"__START__\" ];\n";
 824         }
 825         if( $endrank ) {
 826                 $dot .= "\t\"__SUBEND__\" [ label=\"...\",id=\"__END__\" ];\n";
 827         }
 828         if( $STRAIGHTENHACK ) {
 829                 ## HACK part 1
 830                 my $startlabel = $startrank ? '__SUBSTART__' : '__START__';
 831                 $dot .= "\tsubgraph { rank=same \"$startlabel\" \"#SILENT#\" }\n";
 832                 $dot .= "\t\"#SILENT#\" [ shape=diamond,color=white,penwidth=0,label=\"\" ];"
 833         }
 834         my %used;  # Keep track of the readings that actually appear in the graph
 835         # Sort the readings by rank if we have ranks; this speeds layout.
 836         my @all_readings = $self->end->has_rank
 837                 ? sort { $a->rank <=> $b->rank } $self->readings
 838                 : $self->readings;
 839         # TODO Refrain from outputting lacuna nodes - just grey out the edges.
 840     foreach my $reading ( @all_readings ) {
 841         # Only output readings within our rank range.
 842         next if $startrank && $reading->rank < $startrank;
 843         next if $endrank && $reading->rank > $endrank;
 844         $used{$reading->id} = 1;
 845         # Need not output nodes without separate labels
 846         next if $reading->id eq $reading->text;
 847         my $rattrs;
 848         my $label = $reading->text;
 849         $label .= '-' if $reading->join_next;
 850         $label = "-$label" if $reading->join_prior;
 851         $label =~ s/\"/\\\"/g;
 852                 $rattrs->{'label'} = $label;
 853                 $rattrs->{'id'} = $reading->id;
 854                 $rattrs->{'fillcolor'} = '#b3f36d' if $reading->is_common && $color_common;
 855         $dot .= sprintf( "\t\"%s\" %s;\n", $reading->id, _dot_attr_string( $rattrs ) );
 856     }
 857
 858         # Add the real edges. Need to weight one edge per rank jump, in a
 859         # continuous line.
 860         # my $weighted = $self->_add_edge_weights;
 861     my @edges = $self->paths;
 862         my( %substart, %subend );
 863     foreach my $edge ( @edges ) {
 864         # Do we need to output this edge?
 865         if( $used{$edge->[0]} && $used{$edge->[1]} ) {
 866                 my $label = $self->_path_display_label( $self->path_witnesses( $edge ) );
 867                         my $variables = { %edge_attrs, 'label' => $label };
 868
 869                         # Account for the rank gap if necessary
 870                         my $rank0 = $self->reading( $edge->[0] )->rank
 871                                 if $self->reading( $edge->[0] )->has_rank;
 872                         my $rank1 = $self->reading( $edge->[1] )->rank
 873                                 if $self->reading( $edge->[1] )->has_rank;
 874                         if( defined $rank0 && defined $rank1 && $rank1 - $rank0 > 1 ) {
 875                                 $variables->{'minlen'} = $rank1 - $rank0;
 876                         }
 877
 878                         # Add the calculated edge weights
 879                         # if( exists $weighted->{$edge->[0]}
 880                         #       && $weighted->{$edge->[0]} eq $edge->[1] ) {
 881                         #       # $variables->{'color'} = 'red';
 882                         #       $variables->{'weight'} = 3.0;
 883                         # }
 884
 885                         # EXPERIMENTAL: make edge width reflect no. of witnesses
 886                         my $extrawidth = scalar( $self->path_witnesses( $edge ) ) * 0.2;
 887                         $variables->{'penwidth'} = $extrawidth + 0.8; # gives 1 for a single wit
 888
 889                         my $varopts = _dot_attr_string( $variables );
 890                         $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
 891                                 $edge->[0], $edge->[1], $varopts );
 892         } elsif( $used{$edge->[0]} ) {
 893                 $subend{$edge->[0]} = $edge->[1];
 894         } elsif( $used{$edge->[1]} ) {
 895                 $substart{$edge->[1]} = $edge->[0];
 896         }
 897     }
 898
 899     # If we are asked to, add relationship links
 900     if( exists $opts->{show_relations} ) {
 901         my $filter = $opts->{show_relations}; # can be 'transposition' or 'all'
 902         if( $filter eq 'transposition' ) {
 903                 $filter =~ qr/^transposition$/;
 904         }
 905         foreach my $redge ( $self->relationships ) {
 906                 if( $used{$redge->[0]} && $used{$redge->[1]} ) {
 907                         if( $filter ne 'all' ) {
 908                                 my $rel = $self->get_relationship( $redge );
 909                                 next unless $rel->type =~ /$filter/;
 910                                         my $variables = {
 911                                                 arrowhead => 'none',
 912                                                 color => '#FFA14F',
 913                                                 constraint => 'false',
 914                                                 label => uc( substr( $rel->type, 0, 4 ) ),
 915                                                 penwidth => '3',
 916                                         };
 917                                         $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
 918                                                 $redge->[0], $redge->[1], _dot_attr_string( $variables ) );
 919                                 }
 920                 }
 921         }
 922     }
 923
 924     # Add substitute start and end edges if necessary
 925     foreach my $node ( keys %substart ) {
 926         my $witstr = $self->_path_display_label ( $self->path_witnesses( $substart{$node}, $node ) );
 927         my $variables = { %edge_attrs, 'label' => $witstr };
 928         my $nrdg = $self->reading( $node );
 929         if( $nrdg->has_rank && $nrdg->rank > $startrank ) {
 930                 # Substart is actually one lower than $startrank
 931                 $variables->{'minlen'} = $nrdg->rank - ( $startrank - 1 );
 932         }
 933         my $varopts = _dot_attr_string( $variables );
 934         $dot .= "\t\"__SUBSTART__\" -> \"$node\" $varopts;\n";
 935         }
 936     foreach my $node ( keys %subend ) {
 937         my $witstr = $self->_path_display_label ( $self->path_witnesses( $node, $subend{$node} ) );
 938         my $variables = { %edge_attrs, 'label' => $witstr };
 939         my $varopts = _dot_attr_string( $variables );
 940         $dot .= "\t\"$node\" -> \"__SUBEND__\" $varopts;\n";
 941         }
 942         # HACK part 2
 943         if( $STRAIGHTENHACK ) {
 944                 my $endlabel = $endrank ? '__SUBEND__' : '__END__';
 945                 $dot .= "\t\"$endlabel\" -> \"#SILENT#\" [ color=white,penwidth=0 ];\n";
 946         }
 947
 948     $dot .= "}\n";
 949     return $dot;
 950 }
 951
 952 sub _dot_attr_string {
 953         my( $hash ) = @_;
 954         my @attrs;
 955         foreach my $k ( sort keys %$hash ) {
 956                 my $v = $hash->{$k};
 957                 push( @attrs, $k.'="'.$v.'"' );
 958         }
 959         return( '[ ' . join( ', ', @attrs ) . ' ]' );
 960 }
 961
 962 sub _add_edge_weights {
 963         my $self = shift;
 964         # Walk the graph from START to END, choosing the successor node with
 965         # the largest number of witness paths each time.
 966         my $weighted = {};
 967         my $curr = $self->start->id;
 968         my $ranked = $self->end->has_rank;
 969         while( $curr ne $self->end->id ) {
 970                 my $rank = $ranked ? $self->reading( $curr )->rank : 0;
 971                 my @succ = sort { $self->path_witnesses( $curr, $a )
 972                                                         <=> $self->path_witnesses( $curr, $b ) }
 973                         $self->sequence->successors( $curr );
 974                 my $next = pop @succ;
 975                 my $nextrank = $ranked ? $self->reading( $next )->rank : 0;
 976                 # Try to avoid lacunae in the weighted path.
 977                 while( @succ &&
 978                            ( $self->reading( $next )->is_lacuna ||
 979                                  $nextrank - $rank > 1 ) ){
 980                         $next = pop @succ;
 981                 }
 982                 $weighted->{$curr} = $next;
 983                 $curr = $next;
 984         }
 985         return $weighted;
 986 }
 987
 988 =head2 path_witnesses( $edge )
 989
 990 Returns the list of sigils whose witnesses are associated with the given edge.
 991 The edge can be passed as either an array or an arrayref of ( $source, $target ).
 992
 993 =cut
 994
 995 sub path_witnesses {
 996         my( $self, @edge ) = @_;
 997         # If edge is an arrayref, cope.
 998         if( @edge == 1 && ref( $edge[0] ) eq 'ARRAY' ) {
 999                 my $e = shift @edge;
1000                 @edge = @$e;
1001         }
1002         my @wits = keys %{$self->sequence->get_edge_attributes( @edge )};
1003         return @wits;
1004 }
1005
1006 # Helper function. Make a display label for the given witnesses, showing a.c.
1007 # witnesses only where the main witness is not also in the list.
1008 sub _path_display_label {
1009         my $self = shift;
1010         my %wits;
1011         map { $wits{$_} = 1 } @_;
1012
1013         # If an a.c. wit is listed, remove it if the main wit is also listed.
1014         # Otherwise keep it for explicit listing.
1015         my $aclabel = $self->ac_label;
1016         my @disp_ac;
1017         foreach my $w ( sort keys %wits ) {
1018                 if( $w =~ /^(.*)\Q$aclabel\E$/ ) {
1019                         if( exists $wits{$1} ) {
1020                                 delete $wits{$w};
1021                         } else {
1022                                 push( @disp_ac, $w );
1023                         }
1024                 }
1025         }
1026
1027         # See if we are in a majority situation.
1028         my $maj = scalar( $self->tradition->witnesses ) * 0.6;
1029         $maj = $maj > 5 ? $maj : 5;
1030         if( scalar keys %wits > $maj ) {
1031                 unshift( @disp_ac, 'majority' );
1032                 return join( ', ', @disp_ac );
1033         } else {
1034                 return join( ', ', sort keys %wits );
1035         }
1036 }
1037
1038 =head2 readings_at_rank( $rank )
1039
1040 Returns a list of readings at a given rank, taken from the alignment table.
1041
1042 =cut
1043
1044 sub readings_at_rank {
1045         my( $self, $rank ) = @_;
1046         my $table = $self->alignment_table;
1047         # Table rank is real rank - 1.
1048         my @elements = map { $_->{'tokens'}->[$rank-1] } @{$table->{'alignment'}};
1049         my %readings;
1050         foreach my $e ( @elements ) {
1051                 next unless ref( $e ) eq 'HASH';
1052                 next unless exists $e->{'t'};
1053                 $readings{$e->{'t'}->id} = $e->{'t'};
1054         }
1055         return values %readings;
1056 }
1057
1058 =head2 as_graphml
1059
1060 Returns a GraphML representation of the collation.  The GraphML will contain
1061 two graphs. The first expresses the attributes of the readings and the witness
1062 paths that link them; the second expresses the relationships that link the
1063 readings.  This is the native transfer format for a tradition.
1064
1065 =begin testing
1066
1067 use Text::Tradition;
1068 use TryCatch;
1069
1070 my $READINGS = 311;
1071 my $PATHS = 361;
1072
1073 my $datafile = 't/data/florilegium_tei_ps.xml';
1074 my $tradition = Text::Tradition->new( 'input' => 'TEI',
1075                                       'name' => 'test0',
1076                                       'file' => $datafile,
1077                                       'linear' => 1 );
1078
1079 ok( $tradition, "Got a tradition object" );
1080 is( scalar $tradition->witnesses, 13, "Found all witnesses" );
1081 ok( $tradition->collation, "Tradition has a collation" );
1082
1083 my $c = $tradition->collation;
1084 is( scalar $c->readings, $READINGS, "Collation has all readings" );
1085 is( scalar $c->paths, $PATHS, "Collation has all paths" );
1086 is( scalar $c->relationships, 0, "Collation has all relationships" );
1087
1088 # Add a few relationships
1089 $c->add_relationship( 'w123', 'w125', { 'type' => 'collated' } );
1090 $c->add_relationship( 'w193', 'w196', { 'type' => 'collated' } );
1091 $c->add_relationship( 'w257', 'w262', { 'type' => 'transposition' } );
1092
1093 # Now write it to GraphML and parse it again.
1094
1095 my $graphml = $c->as_graphml;
1096 my $st = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml );
1097 is( scalar $st->collation->readings, $READINGS, "Reparsed collation has all readings" );
1098 is( scalar $st->collation->paths, $PATHS, "Reparsed collation has all paths" );
1099 is( scalar $st->collation->relationships, 3, "Reparsed collation has new relationships" );
1100
1101 # Now add a stemma, write to GraphML, and look at the output.
1102 SKIP: {
1103         skip "Analysis module not present", 3 unless $tradition->can( 'add_stemma' );
1104         my $stemma = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
1105         is( ref( $stemma ), 'Text::Tradition::Stemma', "Parsed dotfile into stemma" );
1106         is( $tradition->stemmata, 1, "Tradition now has the stemma" );
1107         $graphml = $c->as_graphml;
1108         like( $graphml, qr/digraph/, "Digraph declaration exists in GraphML" );
1109 }
1110
1111 =end testing
1112
1113 =cut
1114
1115 ## TODO MOVE this to Tradition.pm and modularize it better
1116 sub as_graphml {
1117     my( $self, $options ) = @_;
1118         $self->calculate_ranks unless $self->_graphcalc_done;
1119
1120         my $start = $options->{'from'}
1121                 ? $self->reading( $options->{'from'} ) : $self->start;
1122         my $end = $options->{'to'}
1123                 ? $self->reading( $options->{'to'} ) : $self->end;
1124         if( $start->has_rank && $end->has_rank && $end->rank < $start->rank ) {
1125                 throw( 'Start node must be before end node' );
1126         }
1127         # The readings need to be ranked for this to work.
1128         $start = $self->start unless $start->has_rank;
1129         $end = $self->end unless $end->has_rank;
1130         my $rankoffset = 0;
1131         unless( $start eq $self->start ) {
1132                 $rankoffset = $start->rank - 1;
1133         }
1134         my %use_readings;
1135
1136     # Some namespaces
1137     my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns';
1138     my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance';
1139     my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' .
1140         'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
1141
1142     # Create the document and root node
1143     require XML::LibXML;
1144     my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
1145     my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
1146     $graphml->setDocumentElement( $root );
1147     $root->setNamespace( $xsi_ns, 'xsi', 0 );
1148     $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
1149
1150     # List of attribute types to save on our objects and their corresponding
1151     # GraphML types
1152     my %save_types = (
1153         'Str' => 'string',
1154         'Int' => 'int',
1155         'Bool' => 'boolean',
1156         'ReadingID' => 'string',
1157         'RelationshipType' => 'string',
1158         'RelationshipScope' => 'string',
1159     );
1160
1161     # Add the data keys for the graph. Include an extra key 'version' for the
1162     # GraphML output version.
1163     my %graph_data_keys;
1164     my $gdi = 0;
1165     my %graph_attributes = ( 'version' => 'string' );
1166         # Graph attributes include those of Tradition and those of Collation.
1167         my %gattr_from;
1168         # TODO Use meta introspection method from duplicate_reading to do this
1169         # instead of naming custom keys.
1170         my $tmeta = $self->tradition->meta;
1171         my $cmeta = $self->meta;
1172         map { $gattr_from{$_->name} = 'Tradition' } $tmeta->get_all_attributes;
1173         map { $gattr_from{$_->name} = 'Collation' } $cmeta->get_all_attributes;
1174         foreach my $attr ( ( $tmeta->get_all_attributes, $cmeta->get_all_attributes ) ) {
1175                 next if $attr->name =~ /^_/;
1176                 next unless $save_types{$attr->type_constraint->name};
1177                 $graph_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1178         }
1179     # Extra custom keys for complex objects that should be saved in some form.
1180     # The subroutine should return a string, or undef/empty.
1181     if( $tmeta->has_method('stemmata') ) {
1182                 $graph_attributes{'stemmata'} = sub {
1183                         my @stemstrs;
1184                         map { push( @stemstrs, $_->editable( {linesep => ''} ) ) }
1185                                 $self->tradition->stemmata;
1186                         join( "\n", @stemstrs );
1187                 };
1188         }
1189
1190         if( $tmeta->has_method('user') ) {
1191                 $graph_attributes{'user'} = sub {
1192                         $self->tradition->user ? $self->tradition->user->id : undef
1193                 };
1194         }
1195
1196     foreach my $datum ( sort keys %graph_attributes ) {
1197         $graph_data_keys{$datum} = 'dg'.$gdi++;
1198         my $key = $root->addNewChild( $graphml_ns, 'key' );
1199         my $dtype = ref( $graph_attributes{$datum} ) ? 'string'
1200                 : $graph_attributes{$datum};
1201         $key->setAttribute( 'attr.name', $datum );
1202         $key->setAttribute( 'attr.type', $dtype );
1203         $key->setAttribute( 'for', 'graph' );
1204         $key->setAttribute( 'id', $graph_data_keys{$datum} );
1205     }
1206
1207     # Add the data keys for reading nodes
1208     my %reading_attributes;
1209     my $rmeta = Text::Tradition::Collation::Reading->meta;
1210     foreach my $attr( $rmeta->get_all_attributes ) {
1211                 next if $attr->name =~ /^_/;
1212                 next unless $save_types{$attr->type_constraint->name};
1213                 $reading_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1214         }
1215         if( $self->start->does('Text::Tradition::Morphology' ) ) {
1216                 # Extra custom key for the reading morphology
1217                 $reading_attributes{'lexemes'} = 'string';
1218         }
1219
1220     my %node_data_keys;
1221     my $ndi = 0;
1222     foreach my $datum ( sort keys %reading_attributes ) {
1223         $node_data_keys{$datum} = 'dn'.$ndi++;
1224         my $key = $root->addNewChild( $graphml_ns, 'key' );
1225         $key->setAttribute( 'attr.name', $datum );
1226         $key->setAttribute( 'attr.type', $reading_attributes{$datum} );
1227         $key->setAttribute( 'for', 'node' );
1228         $key->setAttribute( 'id', $node_data_keys{$datum} );
1229     }
1230
1231     # Add the data keys for edges, that is, paths and relationships. Path
1232     # data does not come from a Moose class so is here manually.
1233     my $edi = 0;
1234     my %edge_data_keys;
1235     my %edge_attributes = (
1236         witness => 'string',                    # ID/label for a path
1237         extra => 'boolean',                             # Path key
1238         );
1239     my @path_attributes = keys %edge_attributes; # track our manual additions
1240     my $pmeta = Text::Tradition::Collation::Relationship->meta;
1241     foreach my $attr( $pmeta->get_all_attributes ) {
1242                 next if $attr->name =~ /^_/;
1243                 next unless $save_types{$attr->type_constraint->name};
1244                 $edge_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1245         }
1246     foreach my $datum ( sort keys %edge_attributes ) {
1247         $edge_data_keys{$datum} = 'de'.$edi++;
1248         my $key = $root->addNewChild( $graphml_ns, 'key' );
1249         $key->setAttribute( 'attr.name', $datum );
1250         $key->setAttribute( 'attr.type', $edge_attributes{$datum} );
1251         $key->setAttribute( 'for', 'edge' );
1252         $key->setAttribute( 'id', $edge_data_keys{$datum} );
1253     }
1254
1255     # Add the collation graph itself. First, sanitize the name to a valid XML ID.
1256     my $xmlidname = $self->tradition->name;
1257     $xmlidname =~ s/(?!$xml10_namechar_rx)./_/g;
1258     if( $xmlidname !~ /^$xml10_namestartchar_rx/ ) {
1259         $xmlidname = '_'.$xmlidname;
1260     }
1261     my $sgraph = $root->addNewChild( $graphml_ns, 'graph' );
1262     $sgraph->setAttribute( 'edgedefault', 'directed' );
1263     $sgraph->setAttribute( 'id', $xmlidname );
1264     $sgraph->setAttribute( 'parse.edgeids', 'canonical' );
1265     $sgraph->setAttribute( 'parse.edges', 0 ); # fill in later
1266     $sgraph->setAttribute( 'parse.nodeids', 'canonical' );
1267     $sgraph->setAttribute( 'parse.nodes', 0 ); # fill in later
1268     $sgraph->setAttribute( 'parse.order', 'nodesfirst' );
1269
1270     # Tradition/collation attribute data
1271     foreach my $datum ( keys %graph_attributes ) {
1272         my $value;
1273         if( $datum eq 'version' ) {
1274                 $value = '3.2';
1275         } elsif( ref( $graph_attributes{$datum} ) ) {
1276                 my $sub = $graph_attributes{$datum};
1277                 $value = &$sub();
1278         } elsif( $gattr_from{$datum} eq 'Tradition' ) {
1279                 $value = $self->tradition->$datum;
1280         } else {
1281                 $value = $self->$datum;
1282         }
1283                 _add_graphml_data( $sgraph, $graph_data_keys{$datum}, $value );
1284         }
1285
1286     my $node_ctr = 0;
1287     my %node_hash;
1288     # Add our readings to the graph
1289     foreach my $n ( sort { $a->id cmp $b->id } $self->readings ) {
1290         next if $n->has_rank && $n ne $self->start && $n ne $self->end &&
1291                 ( $n->rank < $start->rank || $n->rank > $end->rank );
1292         $use_readings{$n->id} = 1;
1293         # Add to the main graph
1294         my $node_el = $sgraph->addNewChild( $graphml_ns, 'node' );
1295         my $node_xmlid = 'n' . $node_ctr++;
1296         $node_hash{ $n->id } = $node_xmlid;
1297         $node_el->setAttribute( 'id', $node_xmlid );
1298         foreach my $d ( keys %reading_attributes ) {
1299                 my $nval = $n->$d;
1300                 # Custom serialization
1301                 if( $d eq 'lexemes' ) {
1302                                 # If nval is a true value, we have lexemes so we need to
1303                                 # serialize them. Otherwise set nval to undef so that the
1304                                 # key is excluded from this reading.
1305                         $nval = $nval ? $n->_serialize_lexemes : undef;
1306                 } elsif( $d eq 'normal_form' && $n->normal_form eq $n->text ) {
1307                         $nval = undef;
1308                 }
1309                 if( $rankoffset && $d eq 'rank' && $n ne $self->start ) {
1310                         # Adjust the ranks within the subgraph.
1311                         $nval = $n eq $self->end ? $end->rank - $rankoffset + 1
1312                                 : $nval - $rankoffset;
1313                 }
1314                 _add_graphml_data( $node_el, $node_data_keys{$d}, $nval )
1315                         if defined $nval;
1316         }
1317     }
1318
1319     # Add the path edges to the sequence graph
1320     my $edge_ctr = 0;
1321     foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->sequence->edges() ) {
1322         # We add an edge in the graphml for every witness in $e.
1323         next unless( $use_readings{$e->[0]} || $use_readings{$e->[1]} );
1324         my @edge_wits = sort $self->path_witnesses( $e );
1325         $e->[0] = $self->start->id unless $use_readings{$e->[0]};
1326         $e->[1] = $self->end->id unless $use_readings{$e->[1]};
1327         # Skip any path from start to end; that witness is not in the subgraph.
1328         next if ( $e->[0] eq $self->start->id && $e->[1] eq $self->end->id );
1329         foreach my $wit ( @edge_wits ) {
1330                         my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
1331                                                                                 $node_hash{ $e->[0] },
1332                                                                                 $node_hash{ $e->[1] } );
1333                         my $edge_el = $sgraph->addNewChild( $graphml_ns, 'edge' );
1334                         $edge_el->setAttribute( 'source', $from );
1335                         $edge_el->setAttribute( 'target', $to );
1336                         $edge_el->setAttribute( 'id', $id );
1337
1338                         # It's a witness path, so add the witness
1339                         my $base = $wit;
1340                         my $key = $edge_data_keys{'witness'};
1341                         # Is this an ante-corr witness?
1342                         my $aclabel = $self->ac_label;
1343                         if( $wit =~ /^(.*)\Q$aclabel\E$/ ) {
1344                                 # Keep the base witness
1345                                 $base = $1;
1346                                 # ...and record that this is an 'extra' reading path
1347                                 _add_graphml_data( $edge_el, $edge_data_keys{'extra'}, $aclabel );
1348                         }
1349                         _add_graphml_data( $edge_el, $edge_data_keys{'witness'}, $base );
1350                 }
1351         }
1352
1353         # Report the actual number of nodes and edges that went in
1354         $sgraph->setAttribute( 'parse.edges', $edge_ctr );
1355         $sgraph->setAttribute( 'parse.nodes', $node_ctr );
1356
1357         # Add the relationship graph to the XML
1358         map { delete $edge_data_keys{$_} } @path_attributes;
1359         $self->relations->_as_graphml( $graphml_ns, $root, \%node_hash,
1360                 $node_data_keys{'id'}, \%edge_data_keys );
1361
1362     # Save and return the thing
1363     my $result = decode_utf8( $graphml->toString(1) );
1364     return $result;
1365 }
1366
1367 sub _add_graphml_data {
1368     my( $el, $key, $value ) = @_;
1369     return unless defined $value;
1370     my $data_el = $el->addNewChild( $el->namespaceURI, 'data' );
1371     $data_el->setAttribute( 'key', $key );
1372     $data_el->appendText( $value );
1373 }
1374
1375 =head2 as_csv
1376
1377 Returns a CSV alignment table representation of the collation graph, one
1378 row per witness (or witness uncorrected.)
1379
1380 =cut
1381
1382 sub as_csv {
1383     my( $self ) = @_;
1384     my $table = $self->alignment_table;
1385     my $csv = Text::CSV->new( { binary => 1, quote_null => 0 } );
1386     my @result;
1387     # Make the header row
1388     $csv->combine( map { $_->{'witness'} } @{$table->{'alignment'}} );
1389         push( @result, decode_utf8( $csv->string ) );
1390     # Make the rest of the rows
1391     foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
1392         my @rowobjs = map { $_->{'tokens'}->[$idx] } @{$table->{'alignment'}};
1393         my @row = map { $_ ? $_->{'t'}->text : $_ } @rowobjs;
1394         $csv->combine( @row );
1395         push( @result, decode_utf8( $csv->string ) );
1396     }
1397     return join( "\n", @result );
1398 }
1399
1400 =head2 alignment_table
1401
1402 Return a reference to an alignment table, in a slightly enhanced CollateX
1403 format which looks like this:
1404
1405  $table = { alignment => [ { witness => "SIGIL",
1406                              tokens => [ { t => "TEXT" }, ... ] },
1407                            { witness => "SIG2",
1408                              tokens => [ { t => "TEXT" }, ... ] },
1409                            ... ],
1410             length => TEXTLEN };
1411
1412 =cut
1413
1414 sub alignment_table {
1415     my( $self ) = @_;
1416     return $self->cached_table if $self->has_cached_table;
1417
1418     # Make sure we can do this
1419         throw( "Need a linear graph in order to make an alignment table" )
1420                 unless $self->linear;
1421     $self->calculate_ranks()
1422         unless $self->_graphcalc_done && $self->end->has_rank;
1423
1424     my $table = { 'alignment' => [], 'length' => $self->end->rank - 1 };
1425     my @all_pos = ( 1 .. $self->end->rank - 1 );
1426     foreach my $wit ( sort { $a->sigil cmp $b->sigil } $self->tradition->witnesses ) {
1427         # say STDERR "Making witness row(s) for " . $wit->sigil;
1428         my @wit_path = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
1429         my @row = _make_witness_row( \@wit_path, \@all_pos );
1430         my $witobj = { 'witness' => $wit->sigil, 'tokens' => \@row };
1431         $witobj->{'identifier'} = $wit->identifier if $wit->identifier;
1432         push( @{$table->{'alignment'}}, $witobj );
1433         if( $wit->is_layered ) {
1434                 my @wit_ac_path = $self->reading_sequence( $self->start, $self->end,
1435                         $wit->sigil.$self->ac_label );
1436             my @ac_row = _make_witness_row( \@wit_ac_path, \@all_pos );
1437             my $witacobj = { 'witness' => $wit->sigil.$self->ac_label,
1438                 'tokens' => \@ac_row };
1439             $witacobj->{'identifier'} = $wit->identifier if $wit->identifier;
1440                         push( @{$table->{'alignment'}}, $witacobj );
1441         }
1442     }
1443     $self->cached_table( $table );
1444     return $table;
1445 }
1446
1447 sub _make_witness_row {
1448     my( $path, $positions ) = @_;
1449     my %char_hash;
1450     map { $char_hash{$_} = undef } @$positions;
1451     my $debug = 0;
1452     foreach my $rdg ( @$path ) {
1453         say STDERR "rank " . $rdg->rank if $debug;
1454         # say STDERR "No rank for " . $rdg->id unless defined $rdg->rank;
1455         $char_hash{$rdg->rank} = { 't' => $rdg };
1456     }
1457     my @row = map { $char_hash{$_} } @$positions;
1458     # Fill in lacuna markers for undef spots in the row
1459     my $last_el = shift @row;
1460     my @filled_row = ( $last_el );
1461     foreach my $el ( @row ) {
1462         # If we are using node reference, make the lacuna node appear many times
1463         # in the table.  If not, use the lacuna tag.
1464         if( $last_el && $last_el->{'t'}->is_lacuna && !defined $el ) {
1465             $el = $last_el;
1466         }
1467         push( @filled_row, $el );
1468         $last_el = $el;
1469     }
1470     return @filled_row;
1471 }
1472
1473
1474 =head1 NAVIGATION METHODS
1475
1476 =head2 reading_sequence( $first, $last, $sigil, $backup )
1477
1478 Returns the ordered list of readings, starting with $first and ending
1479 with $last, for the witness given in $sigil. If a $backup sigil is
1480 specified (e.g. when walking a layered witness), it will be used wherever
1481 no $sigil path exists.  If there is a base text reading, that will be
1482 used wherever no path exists for $sigil or $backup.
1483
1484 =cut
1485
1486 # TODO Think about returning some lazy-eval iterator.
1487 # TODO Get rid of backup; we should know from what witness is whether we need it.
1488
1489 sub reading_sequence {
1490     my( $self, $start, $end, $witness ) = @_;
1491
1492     $witness = $self->baselabel unless $witness;
1493     my @readings = ( $start );
1494     my %seen;
1495     my $n = $start;
1496     while( $n && $n->id ne $end->id ) {
1497         if( exists( $seen{$n->id} ) ) {
1498             throw( "Detected loop for $witness at " . $n->id );
1499         }
1500         $seen{$n->id} = 1;
1501
1502         my $next = $self->next_reading( $n, $witness );
1503         unless( $next ) {
1504             throw( "Did not find any path for $witness from reading " . $n->id );
1505         }
1506         push( @readings, $next );
1507         $n = $next;
1508     }
1509     # Check that the last reading is our end reading.
1510     my $last = $readings[$#readings];
1511     throw( "Last reading found from " . $start->text .
1512         " for witness $witness is not the end!" ) # TODO do we get this far?
1513         unless $last->id eq $end->id;
1514
1515     return @readings;
1516 }
1517
1518 =head2 next_reading( $reading, $sigil );
1519
1520 Returns the reading that follows the given reading along the given witness
1521 path.
1522
1523 =cut
1524
1525 sub next_reading {
1526     # Return the successor via the corresponding path.
1527     my $self = shift;
1528     my $answer = $self->_find_linked_reading( 'next', @_ );
1529         return undef unless $answer;
1530     return $self->reading( $answer );
1531 }
1532
1533 =head2 prior_reading( $reading, $sigil )
1534
1535 Returns the reading that precedes the given reading along the given witness
1536 path.
1537
1538 =cut
1539
1540 sub prior_reading {
1541     # Return the predecessor via the corresponding path.
1542     my $self = shift;
1543     my $answer = $self->_find_linked_reading( 'prior', @_ );
1544     return $self->reading( $answer );
1545 }
1546
1547 sub _find_linked_reading {
1548     my( $self, $direction, $node, $path ) = @_;
1549
1550     # Get a backup if we are dealing with a layered witness
1551     my $alt_path;
1552     my $aclabel = $self->ac_label;
1553     if( $path && $path =~ /^(.*)\Q$aclabel\E$/ ) {
1554         $alt_path = $1;
1555     }
1556
1557     my @linked_paths = $direction eq 'next'
1558         ? $self->sequence->edges_from( $node )
1559         : $self->sequence->edges_to( $node );
1560     return undef unless scalar( @linked_paths );
1561
1562     # We have to find the linked path that contains all of the
1563     # witnesses supplied in $path.
1564     my( @path_wits, @alt_path_wits );
1565     @path_wits = sort( $self->_witnesses_of_label( $path ) ) if $path;
1566     @alt_path_wits = sort( $self->_witnesses_of_label( $alt_path ) ) if $alt_path;
1567     my $base_le;
1568     my $alt_le;
1569     foreach my $le ( @linked_paths ) {
1570         if( $self->sequence->has_edge_attribute( @$le, $self->baselabel ) ) {
1571             $base_le = $le;
1572         }
1573                 my @le_wits = sort $self->path_witnesses( $le );
1574                 if( _is_within( \@path_wits, \@le_wits ) ) {
1575                         # This is the right path.
1576                         return $direction eq 'next' ? $le->[1] : $le->[0];
1577                 } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) {
1578                         $alt_le = $le;
1579                 }
1580     }
1581     # Got this far? Return the alternate path if it exists.
1582     return $direction eq 'next' ? $alt_le->[1] : $alt_le->[0]
1583         if $alt_le;
1584
1585     # Got this far? Return the base path if it exists.
1586     return $direction eq 'next' ? $base_le->[1] : $base_le->[0]
1587         if $base_le;
1588
1589     # Got this far? We have no appropriate path.
1590     warn "Could not find $direction node from " . $node->id
1591         . " along path $path";
1592     return undef;
1593 }
1594
1595 # Some set logic.
1596 sub _is_within {
1597     my( $set1, $set2 ) = @_;
1598     my $ret = @$set1; # will be 0, i.e. false, if set1 is empty
1599     foreach my $el ( @$set1 ) {
1600         $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
1601     }
1602     return $ret;
1603 }
1604
1605 # Return the string that joins together a list of witnesses for
1606 # display on a single path.
1607 sub _witnesses_of_label {
1608     my( $self, $label ) = @_;
1609     my $regex = $self->wit_list_separator;
1610     my @answer = split( /\Q$regex\E/, $label );
1611     return @answer;
1612 }
1613
1614 =head2 common_readings
1615
1616 Returns the list of common readings in the graph (i.e. those readings that are
1617 shared by all non-lacunose witnesses.)
1618
1619 =cut
1620
1621 sub common_readings {
1622         my $self = shift;
1623         my @common = grep { $_->is_common } $self->readings;
1624         return @common;
1625 }
1626
1627 =head2 path_text( $sigil, [, $start, $end ] )
1628
1629 Returns the text of a witness (plus its backup, if we are using a layer)
1630 as stored in the collation.  The text is returned as a string, where the
1631 individual readings are joined with spaces and the meta-readings (e.g.
1632 lacunae) are omitted.  Optional specification of $start and $end allows
1633 the generation of a subset of the witness text.
1634
1635 =cut
1636
1637 sub path_text {
1638         my( $self, $wit, $start, $end ) = @_;
1639         $start = $self->start unless $start;
1640         $end = $self->end unless $end;
1641         my @path = grep { !$_->is_meta } $self->reading_sequence( $start, $end, $wit );
1642         my $pathtext = '';
1643         my $last;
1644         foreach my $r ( @path ) {
1645                 unless ( $r->join_prior || !$last || $last->join_next ) {
1646                         $pathtext .= ' ';
1647                 }
1648                 $pathtext .= $r->text;
1649                 $last = $r;
1650         }
1651         return $pathtext;
1652 }
1653
1654 =head1 INITIALIZATION METHODS
1655
1656 These are mostly for use by parsers.
1657
1658 =head2 make_witness_path( $witness )
1659
1660 Link the array of readings contained in $witness->path (and in
1661 $witness->uncorrected_path if it exists) into collation paths.
1662 Clear out the arrays when finished.
1663
1664 =head2 make_witness_paths
1665
1666 Call make_witness_path for all witnesses in the tradition.
1667
1668 =cut
1669
1670 # For use when a collation is constructed from a base text and an apparatus.
1671 # We have the sequences of readings and just need to add path edges.
1672 # When we are done, clear out the witness path attributes, as they are no
1673 # longer needed.
1674 # TODO Find a way to replace the witness path attributes with encapsulated functions?
1675
1676 sub make_witness_paths {
1677     my( $self ) = @_;
1678     foreach my $wit ( $self->tradition->witnesses ) {
1679         # say STDERR "Making path for " . $wit->sigil;
1680         $self->make_witness_path( $wit );
1681     }
1682 }
1683
1684 sub make_witness_path {
1685     my( $self, $wit ) = @_;
1686     my @chain = @{$wit->path};
1687     my $sig = $wit->sigil;
1688     # Add start and end if necessary
1689     unshift( @chain, $self->start ) unless $chain[0] eq $self->start;
1690     push( @chain, $self->end ) unless $chain[-1] eq $self->end;
1691     foreach my $idx ( 0 .. $#chain-1 ) {
1692         $self->add_path( $chain[$idx], $chain[$idx+1], $sig );
1693     }
1694     if( $wit->is_layered ) {
1695         @chain = @{$wit->uncorrected_path};
1696                 unshift( @chain, $self->start ) unless $chain[0] eq $self->start;
1697                 push( @chain, $self->end ) unless $chain[-1] eq $self->end;
1698         foreach my $idx( 0 .. $#chain-1 ) {
1699             my $source = $chain[$idx];
1700             my $target = $chain[$idx+1];
1701             $self->add_path( $source, $target, $sig.$self->ac_label )
1702                 unless $self->has_path( $source, $target, $sig );
1703         }
1704     }
1705     $wit->clear_path;
1706     $wit->clear_uncorrected_path;
1707 }
1708
1709 =head2 calculate_ranks
1710
1711 Calculate the reading ranks (that is, their aligned positions relative
1712 to each other) for the graph.  This can only be called on linear collations.
1713
1714 =begin testing
1715
1716 use Text::Tradition;
1717
1718 my $cxfile = 't/data/Collatex-16.xml';
1719 my $t = Text::Tradition->new(
1720     'name'  => 'inline',
1721     'input' => 'CollateX',
1722     'file'  => $cxfile,
1723     );
1724 my $c = $t->collation;
1725
1726 # Make an svg
1727 my $table = $c->alignment_table;
1728 ok( $c->has_cached_table, "Alignment table was cached" );
1729 is( $c->alignment_table, $table, "Cached table returned upon second call" );
1730 $c->calculate_ranks;
1731 is( $c->alignment_table, $table, "Cached table retained with no rank change" );
1732 $c->add_relationship( 'n13', 'n23', { type => 'repetition' } );
1733 is( $c->alignment_table, $table, "Alignment table unchanged after non-colo relationship add" );
1734 $c->add_relationship( 'n24', 'n23', { type => 'spelling' } );
1735 isnt( $c->alignment_table, $table, "Alignment table changed after colo relationship add" );
1736
1737 =end testing
1738
1739 =cut
1740
1741 sub calculate_ranks {
1742     my $self = shift;
1743     # Save the existing ranks, in case we need to invalidate the cached SVG.
1744     my %existing_ranks;
1745     map { $existing_ranks{$_} = $_->rank } $self->readings;
1746
1747     # Do the rankings based on the relationship equivalence graph, starting
1748     # with the start node.
1749     my ( $node_ranks, $rank_nodes ) = $self->relations->equivalence_ranks();
1750
1751     # Transfer our rankings from the topological graph to the real one.
1752     foreach my $r ( $self->readings ) {
1753         if( defined $node_ranks->{$self->equivalence( $r->id )} ) {
1754             $r->rank( $node_ranks->{$self->equivalence( $r->id )} );
1755         } else {
1756                 # Die. Find the last rank we calculated.
1757                 my @all_defined = sort { ( $node_ranks->{$self->equivalence( $a->id )}||-1 )
1758                                  <=> ( $node_ranks->{$self->equivalence( $b->id )}||-1 ) }
1759                         $self->readings;
1760                 my $last = pop @all_defined;
1761             throw( "Ranks not calculated after $last - do you have a cycle in the graph?" );
1762         }
1763     }
1764     # Do we need to invalidate the cached data?
1765     if( $self->has_cached_table ) {
1766         foreach my $r ( $self->readings ) {
1767                 next if defined( $existing_ranks{$r} )
1768                         && $existing_ranks{$r} == $r->rank;
1769                 # Something has changed, so clear the cache
1770                 $self->_clear_cache;
1771                         # ...and recalculate the common readings.
1772                         $self->calculate_common_readings();
1773                 last;
1774         }
1775     }
1776         # The graph calculation information is now up to date.
1777         $self->_graphcalc_done(1);
1778 }
1779
1780 sub _clear_cache {
1781         my $self = shift;
1782         $self->wipe_table if $self->has_cached_table;
1783 }
1784
1785
1786 =head2 flatten_ranks
1787
1788 A convenience method for parsing collation data.  Searches the graph for readings
1789 with the same text at the same rank, and merges any that are found.
1790
1791 =cut
1792
1793 sub flatten_ranks {
1794     my ( $self, %args ) = shift;
1795     my %unique_rank_rdg;
1796     my $changed;
1797     foreach my $p ( $self->identical_readings( %args ) ) {
1798                 # say STDERR "Combining readings at same rank: @$p";
1799                 $changed = 1;
1800                 $self->merge_readings( @$p );
1801                 # TODO see if this now makes a common point.
1802     }
1803     # If we merged readings, the ranks are still fine but the alignment
1804     # table is wrong. Wipe it.
1805     $self->wipe_table() if $changed;
1806 }
1807
1808 =head2 identical_readings
1809 =head2 identical_readings( start => $startnode, end => $endnode )
1810 =head2 identical_readings( startrank => $startrank, endrank => $endrank )
1811
1812 Goes through the graph identifying all pairs of readings that appear to be
1813 identical, and therefore able to be merged into a single reading. Returns the
1814 relevant identical pairs. Can be restricted to run over only a part of the
1815 graph, specified either by node or by rank.
1816
1817 =cut
1818
1819 sub identical_readings {
1820         my ( $self, %args ) = @_;
1821     # Find where we should start and end.
1822     my $startrank = $args{startrank} || 0;
1823     if( $args{start} ) {
1824         throw( "Starting reading has no rank" ) unless $self->reading( $args{start} )
1825                 && $self->reading( $args{start} )->has_rank;
1826         $startrank = $self->reading( $args{start} )->rank;
1827     }
1828     my $endrank = $args{endrank} || $self->end->rank;
1829     if( $args{end} ) {
1830         throw( "Ending reading has no rank" ) unless $self->reading( $args{end} )
1831                 && $self->reading( $args{end} )->has_rank;
1832         $endrank = $self->reading( $args{end} )->rank;
1833     }
1834
1835     # Make sure the ranks are correct.
1836     unless( $self->_graphcalc_done ) {
1837         $self->calculate_ranks;
1838     }
1839     # Go through the readings looking for duplicates.
1840     my %unique_rank_rdg;
1841     my @pairs;
1842     foreach my $rdg ( $self->readings ) {
1843         next unless $rdg->has_rank;
1844         my $rk = $rdg->rank;
1845         next if $rk > $endrank || $rk < $startrank;
1846         my $key = $rk . "||" . $rdg->text;
1847         if( exists $unique_rank_rdg{$key} ) {
1848                 # Make sure they don't have different grammatical forms
1849                         my $ur = $unique_rank_rdg{$key};
1850                 if( $rdg->is_identical( $ur ) ) {
1851                                 push( @pairs, [ $ur, $rdg ] );
1852                         }
1853         } else {
1854             $unique_rank_rdg{$key} = $rdg;
1855         }
1856     }
1857
1858     return @pairs;
1859 }
1860
1861
1862 =head2 calculate_common_readings
1863
1864 Goes through the graph identifying the readings that appear in every witness
1865 (apart from those with lacunae at that spot.) Marks them as common and returns
1866 the list.
1867
1868 =begin testing
1869
1870 use Text::Tradition;
1871
1872 my $cxfile = 't/data/Collatex-16.xml';
1873 my $t = Text::Tradition->new(
1874     'name'  => 'inline',
1875     'input' => 'CollateX',
1876     'file'  => $cxfile,
1877     );
1878 my $c = $t->collation;
1879
1880 my @common = $c->calculate_common_readings();
1881 is( scalar @common, 8, "Found correct number of common readings" );
1882 my @marked = sort $c->common_readings();
1883 is( scalar @common, 8, "All common readings got marked as such" );
1884 my @expected = qw/ n1 n11 n16 n19 n20 n5 n6 n7 /;
1885 is_deeply( \@marked, \@expected, "Found correct list of common readings" );
1886
1887 =end testing
1888
1889 =cut
1890
1891 sub calculate_common_readings {
1892         my $self = shift;
1893         my @common;
1894         map { $_->is_common( 0 ) } $self->readings;
1895         # Implicitly calls calculate_ranks
1896         my $table = $self->alignment_table;
1897         foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
1898                 my @row = map { $_->{'tokens'}->[$idx]
1899                                                         ? $_->{'tokens'}->[$idx]->{'t'} : '' }
1900                                         @{$table->{'alignment'}};
1901                 my %hash;
1902                 foreach my $r ( @row ) {
1903                         if( $r ) {
1904                                 $hash{$r->id} = $r unless $r->is_meta;
1905                         } else {
1906                                 $hash{'UNDEF'} = $r;
1907                         }
1908                 }
1909                 if( keys %hash == 1 && !exists $hash{'UNDEF'} ) {
1910                         my( $r ) = values %hash;
1911                         $r->is_common( 1 );
1912                         push( @common, $r );
1913                 }
1914         }
1915         return @common;
1916 }
1917
1918 =head2 text_from_paths
1919
1920 Calculate the text array for all witnesses from the path, for later consistency
1921 checking.  Only to be used if there is no non-graph-based way to know the
1922 original texts.
1923
1924 =cut
1925
1926 sub text_from_paths {
1927         my $self = shift;
1928     foreach my $wit ( $self->tradition->witnesses ) {
1929         my @readings = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
1930         my @text;
1931         foreach my $r ( @readings ) {
1932                 next if $r->is_meta;
1933                 push( @text, $r->text );
1934         }
1935         $wit->text( \@text );
1936         if( $wit->is_layered ) {
1937                         my @ucrdgs = $self->reading_sequence( $self->start, $self->end,
1938                                                                                                   $wit->sigil.$self->ac_label );
1939                         my @uctext;
1940                         foreach my $r ( @ucrdgs ) {
1941                                 next if $r->is_meta;
1942                                 push( @uctext, $r->text );
1943                         }
1944                         $wit->layertext( \@uctext );
1945         }
1946     }
1947 }
1948
1949 =head1 UTILITY FUNCTIONS
1950
1951 =head2 common_predecessor( $reading_a, $reading_b )
1952
1953 Find the last reading that occurs in sequence before both the given readings.
1954 At the very least this should be $self->start.
1955
1956 =head2 common_successor( $reading_a, $reading_b )
1957
1958 Find the first reading that occurs in sequence after both the given readings.
1959 At the very least this should be $self->end.
1960
1961 =begin testing
1962
1963 use Text::Tradition;
1964
1965 my $cxfile = 't/data/Collatex-16.xml';
1966 my $t = Text::Tradition->new(
1967     'name'  => 'inline',
1968     'input' => 'CollateX',
1969     'file'  => $cxfile,
1970     );
1971 my $c = $t->collation;
1972
1973 is( $c->common_predecessor( 'n24', 'n23' )->id,
1974     'n20', "Found correct common predecessor" );
1975 is( $c->common_successor( 'n24', 'n23' )->id,
1976     '__END__', "Found correct common successor" );
1977
1978 is( $c->common_predecessor( 'n19', 'n17' )->id,
1979     'n16', "Found correct common predecessor for readings on same path" );
1980 is( $c->common_successor( 'n21', 'n10' )->id,
1981     '__END__', "Found correct common successor for readings on same path" );
1982
1983 =end testing
1984
1985 =cut
1986
1987 ## Return the closest reading that is a predecessor of both the given readings.
1988 sub common_predecessor {
1989         my $self = shift;
1990         my( $r1, $r2 ) = $self->_objectify_args( @_ );
1991         return $self->_common_in_path( $r1, $r2, 'predecessors' );
1992 }
1993
1994 sub common_successor {
1995         my $self = shift;
1996         my( $r1, $r2 ) = $self->_objectify_args( @_ );
1997         return $self->_common_in_path( $r1, $r2, 'successors' );
1998 }
1999
2000
2001 # TODO think about how to do this without ranks...
2002 sub _common_in_path {
2003         my( $self, $r1, $r2, $dir ) = @_;
2004         my $iter = $self->end->rank;
2005         my @candidates;
2006         my @last_r1 = ( $r1 );
2007         my @last_r2 = ( $r2 );
2008         # my %all_seen = ( $r1 => 'r1', $r2 => 'r2' );
2009         my %all_seen;
2010         # say STDERR "Finding common $dir for $r1, $r2";
2011         while( !@candidates ) {
2012                 last unless $iter--;  # Avoid looping infinitely
2013                 # Iterate separately down the graph from r1 and r2
2014                 my( @new_lc1, @new_lc2 );
2015                 foreach my $lc ( @last_r1 ) {
2016                         foreach my $p ( $lc->$dir ) {
2017                                 if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r1' ) {
2018                                         # say STDERR "Path candidate $p from $lc";
2019                                         push( @candidates, $p );
2020                                 } elsif( !$all_seen{$p->id} ) {
2021                                         $all_seen{$p->id} = 'r1';
2022                                         push( @new_lc1, $p );
2023                                 }
2024                         }
2025                 }
2026                 foreach my $lc ( @last_r2 ) {
2027                         foreach my $p ( $lc->$dir ) {
2028                                 if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r2' ) {
2029                                         # say STDERR "Path candidate $p from $lc";
2030                                         push( @candidates, $p );
2031                                 } elsif( !$all_seen{$p->id} ) {
2032                                         $all_seen{$p->id} = 'r2';
2033                                         push( @new_lc2, $p );
2034                                 }
2035                         }
2036                 }
2037                 @last_r1 = @new_lc1;
2038                 @last_r2 = @new_lc2;
2039         }
2040         my @answer = sort { $a->rank <=> $b->rank } @candidates;
2041         return $dir eq 'predecessors' ? pop( @answer ) : shift ( @answer );
2042 }
2043
2044 sub throw {
2045         Text::Tradition::Error->throw(
2046                 'ident' => 'Collation error',
2047                 'message' => $_[0],
2048                 );
2049 }
2050
2051 no Moose;
2052 __PACKAGE__->meta->make_immutable;
2053
2054 =head1 BUGS/TODO
2055
2056 =over
2057
2058 =item * Rework XML serialization in a more modular way
2059
2060 =back
2061
2062 =head1 LICENSE
2063
2064 This package is free software and is provided "as is" without express
2065 or implied warranty.  You can redistribute it and/or modify it under
2066 the same terms as Perl itself.
2067
2068 =head1 AUTHOR
2069
2070 Tara L Andrews E<lt>aurum@cpan.orgE<gt>