base/lib/Text/Tradition/Collation.pm

   1 package Text::Tradition::Collation;
   2
   3 use feature 'say';
   4 use Encode qw( decode_utf8 );
   5 use File::Temp;
   6 use File::Which;
   7 use Graph;
   8 use IPC::Run qw( run binary );
   9 use Text::CSV;
  10 use Text::Tradition::Collation::Data;
  11 use Text::Tradition::Collation::Reading;
  12 use Text::Tradition::Collation::RelationshipStore;
  13 use Text::Tradition::Error;
  14 use XML::Easy::Syntax qw( $xml10_namestartchar_rx $xml10_namechar_rx );
  15 use XML::LibXML;
  16 use XML::LibXML::XPathContext;
  17 use Moose;
  18
  19 has _data => (
  20         isa      => 'Text::Tradition::Collation::Data',
  21         is       => 'ro',
  22         required => 1,
  23         handles  => [ qw(
  24                 sequence
  25                 paths
  26                 _set_relations
  27                 relations
  28                 _set_start
  29                 _set_end
  30                 ac_label
  31                 has_cached_table
  32                 relationships
  33                 related_readings
  34                 get_relationship
  35                 del_relationship
  36                 equivalence
  37                 equivalence_graph
  38                 readings
  39                 reading
  40                 _add_reading
  41                 del_reading
  42                 has_reading
  43                 wit_list_separator
  44                 baselabel
  45                 linear
  46                 wordsep
  47                 start
  48                 end
  49                 cached_table
  50                 _graphcalc_done
  51                 has_cached_svg
  52                 wipe_table
  53         )]
  54 );
  55
  56 has 'tradition' => (
  57     is => 'ro',
  58     isa => 'Text::Tradition',
  59     writer => '_set_tradition',
  60     weak_ref => 1,
  61     );
  62
  63 =head1 NAME
  64
  65 Text::Tradition::Collation - a software model for a text collation
  66
  67 =head1 SYNOPSIS
  68
  69   use Text::Tradition;
  70   my $t = Text::Tradition->new(
  71     'name' => 'this is a text',
  72     'input' => 'TEI',
  73     'file' => '/path/to/tei_parallel_seg_file.xml' );
  74
  75   my $c = $t->collation;
  76   my @readings = $c->readings;
  77   my @paths = $c->paths;
  78   my @relationships = $c->relationships;
  79
  80   my $svg_variant_graph = $t->collation->as_svg();
  81
  82 =head1 DESCRIPTION
  83
  84 Text::Tradition is a library for representation and analysis of collated
  85 texts, particularly medieval ones.  The Collation is the central feature of
  86 a Tradition, where the text, its sequence of readings, and its relationships
  87 between readings are actually kept.
  88
  89 =head1 CONSTRUCTOR
  90
  91 =head2 new
  92
  93 The constructor.  Takes a hash or hashref of the following arguments:
  94
  95 =over
  96
  97 =item * tradition - The Text::Tradition object to which the collation
  98 belongs. Required.
  99
 100 =item * linear - Whether the collation should be linear; that is, whether
 101 transposed readings should be treated as two linked readings rather than one,
 102 and therefore whether the collation graph is acyclic.  Defaults to true.
 103
 104 =item * baselabel - The default label for the path taken by a base text
 105 (if any). Defaults to 'base text'.
 106
 107 =item * wit_list_separator - The string to join a list of witnesses for
 108 purposes of making labels in display graphs.  Defaults to ', '.
 109
 110 =item * ac_label - The extra label to tack onto a witness sigil when
 111 representing another layer of path for the given witness - that is, when
 112 a text has more than one possible reading due to scribal corrections or
 113 the like.  Defaults to ' (a.c.)'.
 114
 115 =item * wordsep - The string used to separate words in the original text.
 116 Defaults to ' '.
 117
 118 =back
 119
 120 =head1 ACCESSORS
 121
 122 =head2 tradition
 123
 124 =head2 linear
 125
 126 =head2 wit_list_separator
 127
 128 =head2 baselabel
 129
 130 =head2 ac_label
 131
 132 =head2 wordsep
 133
 134 Simple accessors for collation attributes.
 135
 136 =head2 start
 137
 138 The meta-reading at the start of every witness path.
 139
 140 =head2 end
 141
 142 The meta-reading at the end of every witness path.
 143
 144 =head2 readings
 145
 146 Returns all Reading objects in the graph.
 147
 148 =head2 reading( $id )
 149
 150 Returns the Reading object corresponding to the given ID.
 151
 152 =head2 add_reading( $reading_args )
 153
 154 Adds a new reading object to the collation.
 155 See L<Text::Tradition::Collation::Reading> for the available arguments.
 156
 157 =head2 del_reading( $object_or_id )
 158
 159 Removes the given reading from the collation, implicitly removing its
 160 paths and relationships.
 161
 162 =head2 merge_readings( $main, $second, $concatenate, $with_str )
 163
 164 Merges the $second reading into the $main one. If $concatenate is true, then
 165 the merged node will carry the text of both readings, concatenated with either
 166 $with_str (if specified) or a sensible default (the empty string if the
 167 appropriate 'join_*' flag is set on either reading, or else $self->wordsep.)
 168
 169 The first two arguments may be either readings or reading IDs.
 170
 171 =head2 has_reading( $id )
 172
 173 Predicate to see whether a given reading ID is in the graph.
 174
 175 =head2 reading_witnesses( $object_or_id )
 176
 177 Returns a list of sigils whose witnesses contain the reading.
 178
 179 =head2 paths
 180
 181 Returns all reading paths within the document - that is, all edges in the
 182 collation graph.  Each path is an arrayref of [ $source, $target ] reading IDs.
 183
 184 =head2 add_path( $source, $target, $sigil )
 185
 186 Links the given readings in the collation in sequence, under the given witness
 187 sigil.  The readings may be specified by object or ID.
 188
 189 =head2 del_path( $source, $target, $sigil )
 190
 191 Links the given readings in the collation in sequence, under the given witness
 192 sigil.  The readings may be specified by object or ID.
 193
 194 =head2 has_path( $source, $target );
 195
 196 Returns true if the two readings are linked in sequence in any witness.
 197 The readings may be specified by object or ID.
 198
 199 =head2 relationships
 200
 201 Returns all Relationship objects in the collation.
 202
 203 =head2 add_relationship( $reading, $other_reading, $options )
 204
 205 Adds a new relationship of the type given in $options between the two readings,
 206 which may be specified by object or ID.  Returns a value of ( $status, @vectors)
 207 where $status is true on success, and @vectors is a list of relationship edges
 208 that were ultimately added.
 209 See L<Text::Tradition::Collation::Relationship> for the available options.
 210
 211 =cut
 212
 213 sub BUILDARGS {
 214         my ( $class, @args ) = @_;
 215         my %args = @args == 1 ? %{ $args[0] } : @args;
 216         # TODO determine these from the Moose::Meta object
 217         my @delegate_attrs = qw(sequence relations readings wit_list_separator baselabel
 218                 linear wordsep start end cached_table _graphcalc_done);
 219         my %data_args;
 220         for my $attr (@delegate_attrs) {
 221                 $data_args{$attr} = delete $args{$attr} if exists $args{$attr};
 222         }
 223         $args{_data} = Text::Tradition::Collation::Data->new(%data_args);
 224         return \%args;
 225 }
 226
 227 sub BUILD {
 228     my $self = shift;
 229     $self->_set_relations( Text::Tradition::Collation::RelationshipStore->new( 'collation' => $self ) );
 230     $self->_set_start( $self->add_reading(
 231         { 'collation' => $self, 'is_start' => 1, 'init' => 1 } ) );
 232     $self->_set_end( $self->add_reading(
 233         { 'collation' => $self, 'is_end' => 1, 'init' => 1 } ) );
 234 }
 235
 236 sub register_relationship_type {
 237         my $self = shift;
 238         my %args = @_ == 1 ? %{$_[0]} : @_;
 239         if( $self->relations->has_type( $args{name} ) ) {
 240                 throw( 'Relationship type ' . $args{name} . ' already registered' );
 241         }
 242         $self->relations->add_type( %args );
 243 }
 244
 245 ### Reading construct/destruct functions
 246
 247 sub add_reading {
 248         my( $self, $reading ) = @_;
 249         unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
 250                 my %args = %$reading;
 251                 if( $args{'init'} ) {
 252                         # If we are initializing an empty collation, don't assume that we
 253                         # have set a tradition.
 254                         delete $args{'init'};
 255                 } elsif( $self->tradition->can('language') && $self->tradition->has_language
 256                         && !exists $args{'language'} ) {
 257                         $args{'language'} = $self->tradition->language;
 258                 }
 259                 $reading = Text::Tradition::Collation::Reading->new(
 260                         'collation' => $self,
 261                         %args );
 262         }
 263         # First check to see if a reading with this ID exists.
 264         if( $self->reading( $reading->id ) ) {
 265                 throw( "Collation already has a reading with id " . $reading->id );
 266         }
 267         $self->_graphcalc_done(0);
 268         $self->_add_reading( $reading->id => $reading );
 269         # Once the reading has been added, put it in both graphs.
 270         $self->sequence->add_vertex( $reading->id );
 271         $self->relations->add_reading( $reading->id );
 272         return $reading;
 273 };
 274
 275 around del_reading => sub {
 276         my $orig = shift;
 277         my $self = shift;
 278         my $arg = shift;
 279
 280         if( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
 281                 $arg = $arg->id;
 282         }
 283         # Remove the reading from the graphs.
 284         $self->_graphcalc_done(0);
 285         $self->_clear_cache; # Explicitly clear caches to GC the reading
 286         $self->sequence->delete_vertex( $arg );
 287         $self->relations->delete_reading( $arg );
 288
 289         # Carry on.
 290         $self->$orig( $arg );
 291 };
 292
 293 =begin testing
 294
 295 use Text::Tradition;
 296
 297 my $cxfile = 't/data/Collatex-16.xml';
 298 my $t = Text::Tradition->new(
 299     'name'  => 'inline',
 300     'input' => 'CollateX',
 301     'file'  => $cxfile,
 302     );
 303 my $c = $t->collation;
 304
 305 my $rno = scalar $c->readings;
 306 # Split n21 for testing purposes
 307 my $new_r = $c->add_reading( { 'id' => 'n21p0', 'text' => 'un', 'join_next' => 1 } );
 308 my $old_r = $c->reading( 'n21' );
 309 $old_r->alter_text( 'to' );
 310 $c->del_path( 'n20', 'n21', 'A' );
 311 $c->add_path( 'n20', 'n21p0', 'A' );
 312 $c->add_path( 'n21p0', 'n21', 'A' );
 313 $c->flatten_ranks();
 314 ok( $c->reading( 'n21p0' ), "New reading exists" );
 315 is( scalar $c->readings, $rno, "Reading add offset by flatten_ranks" );
 316
 317 # Combine n3 and n4 ( with his )
 318 $c->merge_readings( 'n3', 'n4', 1 );
 319 ok( !$c->reading('n4'), "Reading n4 is gone" );
 320 is( $c->reading('n3')->text, 'with his', "Reading n3 has both words" );
 321
 322 # Collapse n9 and n10 ( rood / root )
 323 $c->merge_readings( 'n9', 'n10' );
 324 ok( !$c->reading('n10'), "Reading n10 is gone" );
 325 is( $c->reading('n9')->text, 'rood', "Reading n9 has an unchanged word" );
 326
 327 # Combine n21 and n21p0
 328 my $remaining = $c->reading('n21');
 329 $remaining ||= $c->reading('n22');  # one of these should still exist
 330 $c->merge_readings( 'n21p0', $remaining, 1 );
 331 ok( !$c->reading('n21'), "Reading $remaining is gone" );
 332 is( $c->reading('n21p0')->text, 'unto', "Reading n21p0 merged correctly" );
 333
 334 =end testing
 335
 336 =cut
 337
 338 sub merge_readings {
 339         my $self = shift;
 340
 341         # Sanity check
 342         my( $kept_obj, $del_obj, $combine, $combine_char ) = $self->_objectify_args( @_ );
 343         my $mergemeta = $kept_obj->is_meta;
 344         throw( "Cannot merge meta and non-meta reading" )
 345                 unless ( $mergemeta && $del_obj->is_meta )
 346                         || ( !$mergemeta && !$del_obj->is_meta );
 347         if( $mergemeta ) {
 348                 throw( "Cannot merge with start or end node" )
 349                         if( $kept_obj eq $self->start || $kept_obj eq $self->end
 350                                 || $del_obj eq $self->start || $del_obj eq $self->end );
 351                 throw( "Cannot combine text of meta readings" ) if $combine;
 352         }
 353         # We only need the IDs for adding paths to the graph, not the reading
 354         # objects themselves.
 355         my $kept = $kept_obj->id;
 356         my $deleted = $del_obj->id;
 357         $self->_graphcalc_done(0);
 358
 359     # The kept reading should inherit the paths and the relationships
 360     # of the deleted reading.
 361         foreach my $path ( $self->sequence->edges_at( $deleted ) ) {
 362                 my @vector = ( $kept );
 363                 push( @vector, $path->[1] ) if $path->[0] eq $deleted;
 364                 unshift( @vector, $path->[0] ) if $path->[1] eq $deleted;
 365                 next if $vector[0] eq $vector[1]; # Don't add a self loop
 366                 my %wits = %{$self->sequence->get_edge_attributes( @$path )};
 367                 $self->sequence->add_edge( @vector );
 368                 my $fwits = $self->sequence->get_edge_attributes( @vector );
 369                 @wits{keys %$fwits} = values %$fwits;
 370                 $self->sequence->set_edge_attributes( @vector, \%wits );
 371         }
 372         $self->relations->merge_readings( $kept, $deleted, $combine );
 373
 374         # Do the deletion deed.
 375         if( $combine ) {
 376                 # Combine the text of the readings
 377                 my $joinstr = $combine_char;
 378                 unless( defined $joinstr ) {
 379                         $joinstr = '' if $kept_obj->join_next || $del_obj->join_prior;
 380                         $joinstr = $self->wordsep unless defined $joinstr;
 381                 }
 382                 $kept_obj->_combine( $del_obj, $joinstr );
 383         }
 384         $self->del_reading( $deleted );
 385 }
 386
 387 =head2 compress_readings
 388
 389 Where possible in the graph, compresses plain sequences of readings into a
 390 single reading. The sequences must consist of readings with no
 391 relationships to other readings, with only a single witness path between
 392 them and no other witness paths from either that would skip the other. The
 393 readings must also not be marked as nonsense or bad grammar.
 394
 395 WARNING: This operation cannot be undone.
 396
 397 =cut
 398
 399 sub compress_readings {
 400         my $self = shift;
 401         # Anywhere in the graph that there is a reading that joins only to a single
 402         # successor, and neither of these have any relationships, just join the two
 403         # readings.
 404         foreach my $rdg ( sort { $a->rank <=> $b->rank } $self->readings ) {
 405                 # Now look for readings that can be joined to their successors.
 406                 next unless $rdg->is_combinable;
 407                 my %seen;
 408                 while( $self->sequence->successors( $rdg ) == 1 ) {
 409                         my( $next ) = $self->reading( $self->sequence->successors( $rdg ) );
 410                         throw( "Infinite loop" ) if $seen{$next->id};
 411                         $seen{$next->id} = 1;
 412                         last if $self->sequence->predecessors( $next ) > 1;
 413                         last unless $next->is_combinable;
 414                         say "Joining readings $rdg and $next";
 415                         $self->merge_readings( $rdg, $next, 1 );
 416                 }
 417         }
 418         # Make sure we haven't screwed anything up
 419         foreach my $wit ( $self->tradition->witnesses ) {
 420                 my $pathtext = $self->path_text( $wit->sigil );
 421                 my $origtext = join( ' ', @{$wit->text} );
 422                 throw( "Text differs for witness " . $wit->sigil )
 423                         unless $pathtext eq $origtext;
 424                 if( $wit->is_layered ) {
 425                         $pathtext = $self->path_text( $wit->sigil.$self->ac_label );
 426                         $origtext = join( ' ', @{$wit->layertext} );
 427                         throw( "Ante-corr text differs for witness " . $wit->sigil )
 428                                 unless $pathtext eq $origtext;
 429                 }
 430         }
 431
 432         $self->relations->rebuild_equivalence();
 433         $self->calculate_ranks();
 434 }
 435
 436 # Helper function for manipulating the graph.
 437 sub _stringify_args {
 438         my( $self, $first, $second, @args ) = @_;
 439     $first = $first->id
 440         if ref( $first ) eq 'Text::Tradition::Collation::Reading';
 441     $second = $second->id
 442         if ref( $second ) eq 'Text::Tradition::Collation::Reading';
 443     return( $first, $second, @args );
 444 }
 445
 446 # Helper function for manipulating the graph.
 447 sub _objectify_args {
 448         my( $self, $first, $second, $arg ) = @_;
 449     $first = $self->reading( $first )
 450         unless ref( $first ) eq 'Text::Tradition::Collation::Reading';
 451     $second = $self->reading( $second )
 452         unless ref( $second ) eq 'Text::Tradition::Collation::Reading';
 453     return( $first, $second, $arg );
 454 }
 455
 456 =head2 duplicate_reading( $reading, @witlist )
 457
 458 Split the given reading into two, so that the new reading is in the path for
 459 the witnesses given in @witlist. If the result is that certain non-colocated
 460 relationships (e.g. transpositions) are no longer valid, these will be removed.
 461 Returns the newly-created reading.
 462
 463 =begin testing
 464
 465 use Text::Tradition;
 466
 467 my $st = Text::Tradition->new( 'input' => 'Self', 'file' => 't/data/collatecorr.xml' );
 468 is( ref( $st ), 'Text::Tradition', "Got a tradition from test file" );
 469 ok( $st->has_witness('Ba96'), "Tradition has the affected witness" );
 470
 471 my $sc = $st->collation;
 472 my $numr = 17;
 473 ok( $sc->reading('n131'), "Tradition has the affected reading" );
 474 is( scalar( $sc->readings ), $numr, "There are $numr readings in the graph" );
 475 is( $sc->end->rank, 14, "There are fourteen ranks in the graph" );
 476
 477 # Detach the erroneously collated reading
 478 my $newr = $sc->duplicate_reading( 'n131', 'Ba96' );
 479 ok( $newr, "New reading was created" );
 480 ok( $sc->reading('n131_0'), "Detached the bad collation with a new reading" );
 481 is( scalar( $sc->readings ), $numr + 1, "A reading was added to the graph" );
 482 is( $sc->end->rank, 10, "There are now only ten ranks in the graph" );
 483
 484 # Check that the bad transposition is gone
 485 is( $sc->get_relationship( 'n130', 'n135' ), undef, "Bad transposition relationship is gone" );
 486
 487 # Fix the collation
 488 ok( $sc->add_relationship( 'n124', 'n131_0', { type => 'collated', scope => 'local' } ),
 489         "Collated the readings correctly" );
 490 $sc->calculate_ranks();
 491 $sc->flatten_ranks();
 492 is( $sc->end->rank, 11, "The ranks shifted appropriately" );
 493 is( scalar( $sc->readings ), $numr - 3, "Now we are collated correctly" );
 494
 495 =end testing
 496
 497 =cut
 498
 499 sub duplicate_reading {
 500         my( $self, $r, @wits ) = @_;
 501         # Add the new reading, duplicating $r.
 502         unless( ref( $r ) eq 'Text::Tradition::Collation::Reading' ) {
 503                 $r = $self->reading( $r );
 504         }
 505         throw( "Cannot duplicate a meta-reading" )
 506                 if $r->is_meta;
 507
 508         # Get all the reading attributes and duplicate them.
 509         my $rmeta = Text::Tradition::Collation::Reading->meta;
 510         my %args;
 511     foreach my $attr( $rmeta->get_all_attributes ) {
 512                 next if $attr->name =~ /^_/;
 513                 my $acc = $attr->get_read_method;
 514                 if( !$acc && $attr->has_applied_traits ) {
 515                         my $tr = $attr->applied_traits;
 516                         if( $tr->[0] =~ /::(Array|Hash)$/ ) {
 517                                 my $which = $1;
 518                                 my %methods = reverse %{$attr->handles};
 519                                 $acc = $methods{elements};
 520                                 $args{$attr->name} = $which eq 'Array'
 521                                         ? [ $r->$acc ] : { $r->$acc };
 522                         }
 523                 } else {
 524                         $args{$attr->name} = $r->$acc if $acc;
 525                 }
 526         }
 527         # By definition the new reading will no longer be common.
 528         $args{is_common} = 0;
 529         # The new reading also needs its own ID.
 530         $args{id} = $self->_generate_dup_id( $r->id );
 531
 532         # Try to make the new reading.
 533         my $newr = $self->add_reading( \%args );
 534         # The old reading is also no longer common.
 535         $r->is_common( 0 );
 536
 537         # For each of the witnesses, dissociate from the old reading and
 538         # associate with the new.
 539         foreach my $wit ( @wits ) {
 540                 my $prior = $self->prior_reading( $r, $wit );
 541                 my $next = $self->next_reading( $r, $wit );
 542                 $self->del_path( $prior, $r, $wit );
 543                 $self->add_path( $prior, $newr, $wit );
 544                 $self->del_path( $r, $next, $wit );
 545                 $self->add_path( $newr, $next, $wit );
 546         }
 547
 548         # Hash the reading ranks and find the closest common successor to our
 549         # two readings
 550         my %rrk;
 551         my $succ;
 552         if( $self->end->has_rank ) {
 553                 $succ = $self->common_successor( $r, $newr );
 554                 foreach my $rdg ( $self->readings ) {
 555                         $rrk{$rdg->id} = $rdg->rank;
 556                 }
 557         }
 558
 559         # Rebuild the equivalence graph and calculate the new ranks
 560         $self->relations->rebuild_equivalence();
 561         $self->calculate_ranks();
 562
 563         # Check for invalid non-colocated relationships among changed-rank readings
 564         # from where the ranks start changing up to $succ
 565         if( $self->end->has_rank ) {
 566                 my $lastrank = $succ->rank;
 567                 foreach my $rdg ( $self->readings ) {
 568                         next if $rdg->rank > $lastrank;
 569                         next if $rdg->rank == $rrk{$rdg->id};
 570                         my @noncolo = $rdg->related_readings( sub { !$_[0]->colocated } );
 571                         next unless @noncolo;
 572                         foreach my $nc ( @noncolo ) {
 573                                 $self->relations->verify_or_delete( $rdg, $nc );
 574                         }
 575                 }
 576         }
 577         return $newr;
 578 }
 579
 580 sub _generate_dup_id {
 581         my( $self, $rid ) = @_;
 582         my $newid;
 583         my $i = 0;
 584         while( !$newid ) {
 585                 $newid = $rid."_$i";
 586                 if( $self->has_reading( $newid ) ) {
 587                         $newid = '';
 588                         $i++;
 589                 }
 590         }
 591         return $newid;
 592 }
 593
 594 ### Path logic
 595
 596 sub add_path {
 597         my $self = shift;
 598
 599         # We only need the IDs for adding paths to the graph, not the reading
 600         # objects themselves.
 601     my( $source, $target, $wit ) = $self->_stringify_args( @_ );
 602
 603         $self->_graphcalc_done(0);
 604         # Connect the readings
 605         unless( $self->sequence->has_edge( $source, $target ) ) {
 606             $self->sequence->add_edge( $source, $target );
 607             $self->relations->add_equivalence_edge( $source, $target );
 608         }
 609     # Note the witness in question
 610     $self->sequence->set_edge_attribute( $source, $target, $wit, 1 );
 611 }
 612
 613 sub del_path {
 614         my $self = shift;
 615         my @args;
 616         if( ref( $_[0] ) eq 'ARRAY' ) {
 617                 my $e = shift @_;
 618                 @args = ( @$e, @_ );
 619         } else {
 620                 @args = @_;
 621         }
 622
 623         # We only need the IDs for removing paths from the graph, not the reading
 624         # objects themselves.
 625     my( $source, $target, $wit ) = $self->_stringify_args( @args );
 626
 627         $self->_graphcalc_done(0);
 628         if( $self->sequence->has_edge_attribute( $source, $target, $wit ) ) {
 629                 $self->sequence->delete_edge_attribute( $source, $target, $wit );
 630         }
 631         unless( $self->sequence->has_edge_attributes( $source, $target ) ) {
 632                 $self->sequence->delete_edge( $source, $target );
 633                 $self->relations->delete_equivalence_edge( $source, $target );
 634         }
 635 }
 636
 637
 638 # Extra graph-alike utility
 639 sub has_path {
 640         my $self = shift;
 641     my( $source, $target, $wit ) = $self->_stringify_args( @_ );
 642         return undef unless $self->sequence->has_edge( $source, $target );
 643         return $self->sequence->has_edge_attribute( $source, $target, $wit );
 644 }
 645
 646 =head2 clear_witness( @sigil_list )
 647
 648 Clear the given witnesses out of the collation entirely, removing references
 649 to them in paths, and removing readings that belong only to them.  Should only
 650 be called via $tradition->del_witness.
 651
 652 =cut
 653
 654 sub clear_witness {
 655         my( $self, @sigils ) = @_;
 656
 657         $self->_graphcalc_done(0);
 658         # Clear the witness(es) out of the paths
 659         foreach my $e ( $self->paths ) {
 660                 foreach my $sig ( @sigils ) {
 661                         $self->del_path( $e, $sig );
 662                 }
 663         }
 664
 665         # Clear out the newly unused readings
 666         foreach my $r ( $self->readings ) {
 667                 unless( $self->reading_witnesses( $r ) ) {
 668                         $self->del_reading( $r );
 669                 }
 670         }
 671 }
 672
 673 sub add_relationship {
 674         my $self = shift;
 675     my( $source, $target, $opts ) = $self->_stringify_args( @_ );
 676     my( @vectors ) = $self->relations->add_relationship( $source, $target, $opts );
 677     foreach my $v ( @vectors ) {
 678         next unless $self->get_relationship( $v )->colocated;
 679         if( $self->reading( $v->[0] )->has_rank && $self->reading( $v->[1] )->has_rank
 680                 && $self->reading( $v->[0] )->rank ne $self->reading( $v->[1] )->rank ) {
 681                         $self->_graphcalc_done(0);
 682                         $self->_clear_cache;
 683                         last;
 684         }
 685     }
 686     return @vectors;
 687 }
 688
 689 around qw/ get_relationship del_relationship / => sub {
 690         my $orig = shift;
 691         my $self = shift;
 692         my @args = @_;
 693         if( @args == 1 && ref( $args[0] ) eq 'ARRAY' ) {
 694                 @args = @{$_[0]};
 695         }
 696         my( $source, $target ) = $self->_stringify_args( @args );
 697         $self->$orig( $source, $target );
 698 };
 699
 700 =head2 reading_witnesses( $reading )
 701
 702 Return a list of sigils corresponding to the witnesses in which the reading appears.
 703
 704 =cut
 705
 706 sub reading_witnesses {
 707         my( $self, $reading ) = @_;
 708         # We need only check either the incoming or the outgoing edges; I have
 709         # arbitrarily chosen "incoming".  Thus, special-case the start node.
 710         if( $reading eq $self->start ) {
 711                 return map { $_->sigil } grep { $_->is_collated } $self->tradition->witnesses;
 712         }
 713         my %all_witnesses;
 714         foreach my $e ( $self->sequence->edges_to( $reading ) ) {
 715                 my $wits = $self->sequence->get_edge_attributes( @$e );
 716                 @all_witnesses{ keys %$wits } = 1;
 717         }
 718         my $acstr = $self->ac_label;
 719         foreach my $acwit ( grep { $_ =~ s/^(.*)\Q$acstr\E$/$1/ } keys %all_witnesses ) {
 720                 delete $all_witnesses{$acwit.$acstr} if exists $all_witnesses{$acwit};
 721         }
 722         return keys %all_witnesses;
 723 }
 724
 725 =head1 OUTPUT METHODS
 726
 727 =head2 as_svg( \%options )
 728
 729 Returns an SVG string that represents the graph, via as_dot and graphviz.
 730 See as_dot for a list of options.  Must have GraphViz (dot) installed to run.
 731
 732 =cut
 733
 734 sub as_svg {
 735     my( $self, $opts ) = @_;
 736     throw( "Need GraphViz installed to output SVG" )
 737         unless File::Which::which( 'dot' );
 738     my $want_subgraph = exists $opts->{'from'} || exists $opts->{'to'};
 739     $self->calculate_ranks()
 740         unless( $self->_graphcalc_done || $opts->{'nocalc'} || !$self->linear );
 741         my @cmd = qw/dot -Tsvg/;
 742         my( $svg, $err );
 743         my $dotfile = File::Temp->new();
 744         ## USE FOR DEBUGGING
 745         # $dotfile->unlink_on_destroy(0);
 746         binmode $dotfile, ':utf8';
 747         print $dotfile $self->as_dot( $opts );
 748         push( @cmd, $dotfile->filename );
 749         run( \@cmd, ">", binary(), \$svg );
 750         $svg = decode_utf8( $svg );
 751         return $svg;
 752 }
 753
 754
 755 =head2 as_dot( \%options )
 756
 757 Returns a string that is the collation graph expressed in dot
 758 (i.e. GraphViz) format.  Options include:
 759
 760 =over 4
 761
 762 =item * from
 763
 764 =item * to
 765
 766 =item * color_common
 767
 768 =back
 769
 770 =cut
 771
 772 sub as_dot {
 773     my( $self, $opts ) = @_;
 774     my $startrank = $opts->{'from'} if $opts;
 775     my $endrank = $opts->{'to'} if $opts;
 776     my $color_common = $opts->{'color_common'} if $opts;
 777     my $STRAIGHTENHACK = !$startrank && !$endrank && $self->end->rank
 778        && $self->end->rank > 100;
 779     $STRAIGHTENHACK = 1 if $opts->{'straight'}; # even for subgraphs or small graphs
 780
 781     # Check the arguments
 782     if( $startrank ) {
 783         return if $endrank && $startrank > $endrank;
 784         return if $startrank > $self->end->rank;
 785         }
 786         if( defined $endrank ) {
 787                 return if $endrank < 0;
 788                 $endrank = undef if $endrank == $self->end->rank;
 789         }
 790
 791     my $graph_name = $self->tradition->name;
 792     $graph_name =~ s/[^\w\s]//g;
 793     $graph_name = join( '_', split( /\s+/, $graph_name ) );
 794
 795     my %graph_attrs = (
 796         'rankdir' => 'LR',
 797         'bgcolor' => 'none',
 798         );
 799     my %node_attrs = (
 800         'fontsize' => 14,
 801         'fillcolor' => 'white',
 802         'style' => 'filled',
 803         'shape' => 'ellipse'
 804         );
 805     my %edge_attrs = (
 806         'arrowhead' => 'open',
 807         'color' => '#000000',
 808         'fontcolor' => '#000000',
 809         );
 810
 811     my $dot = sprintf( "digraph %s {\n", $graph_name );
 812     $dot .= "\tgraph " . _dot_attr_string( \%graph_attrs ) . ";\n";
 813     $dot .= "\tnode " . _dot_attr_string( \%node_attrs ) . ";\n";
 814
 815         # Output substitute start/end readings if necessary
 816         if( $startrank ) {
 817                 $dot .= "\t\"__SUBSTART__\" [ label=\"...\",id=\"__START__\" ];\n";
 818         }
 819         if( $endrank ) {
 820                 $dot .= "\t\"__SUBEND__\" [ label=\"...\",id=\"__END__\" ];\n";
 821         }
 822         if( $STRAIGHTENHACK ) {
 823                 ## HACK part 1
 824                 my $startlabel = $startrank ? '__SUBSTART__' : '__START__';
 825                 $dot .= "\tsubgraph { rank=same \"$startlabel\" \"#SILENT#\" }\n";
 826                 $dot .= "\t\"#SILENT#\" [ shape=diamond,color=white,penwidth=0,label=\"\" ];"
 827         }
 828         my %used;  # Keep track of the readings that actually appear in the graph
 829         # Sort the readings by rank if we have ranks; this speeds layout.
 830         my @all_readings = $self->end->has_rank
 831                 ? sort { $a->rank <=> $b->rank } $self->readings
 832                 : $self->readings;
 833         # TODO Refrain from outputting lacuna nodes - just grey out the edges.
 834     foreach my $reading ( @all_readings ) {
 835         # Only output readings within our rank range.
 836         next if $startrank && $reading->rank < $startrank;
 837         next if $endrank && $reading->rank > $endrank;
 838         $used{$reading->id} = 1;
 839         # Need not output nodes without separate labels
 840         next if $reading->id eq $reading->text;
 841         my $rattrs;
 842         my $label = $reading->text;
 843         $label .= '-' if $reading->join_next;
 844         $label = "-$label" if $reading->join_prior;
 845         $label =~ s/\"/\\\"/g;
 846                 $rattrs->{'label'} = $label;
 847                 $rattrs->{'id'} = $reading->id;
 848                 $rattrs->{'fillcolor'} = '#b3f36d' if $reading->is_common && $color_common;
 849         $dot .= sprintf( "\t\"%s\" %s;\n", $reading->id, _dot_attr_string( $rattrs ) );
 850     }
 851
 852         # Add the real edges. Need to weight one edge per rank jump, in a
 853         # continuous line.
 854         # my $weighted = $self->_add_edge_weights;
 855     my @edges = $self->paths;
 856         my( %substart, %subend );
 857     foreach my $edge ( @edges ) {
 858         # Do we need to output this edge?
 859         if( $used{$edge->[0]} && $used{$edge->[1]} ) {
 860                 my $label = $self->_path_display_label( $self->path_witnesses( $edge ) );
 861                         my $variables = { %edge_attrs, 'label' => $label };
 862
 863                         # Account for the rank gap if necessary
 864                         my $rank0 = $self->reading( $edge->[0] )->rank
 865                                 if $self->reading( $edge->[0] )->has_rank;
 866                         my $rank1 = $self->reading( $edge->[1] )->rank
 867                                 if $self->reading( $edge->[1] )->has_rank;
 868                         if( defined $rank0 && defined $rank1 && $rank1 - $rank0 > 1 ) {
 869                                 $variables->{'minlen'} = $rank1 - $rank0;
 870                         }
 871
 872                         # Add the calculated edge weights
 873                         # if( exists $weighted->{$edge->[0]}
 874                         #       && $weighted->{$edge->[0]} eq $edge->[1] ) {
 875                         #       # $variables->{'color'} = 'red';
 876                         #       $variables->{'weight'} = 3.0;
 877                         # }
 878
 879                         # EXPERIMENTAL: make edge width reflect no. of witnesses
 880                         my $extrawidth = scalar( $self->path_witnesses( $edge ) ) * 0.2;
 881                         $variables->{'penwidth'} = $extrawidth + 0.8; # gives 1 for a single wit
 882
 883                         my $varopts = _dot_attr_string( $variables );
 884                         $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
 885                                 $edge->[0], $edge->[1], $varopts );
 886         } elsif( $used{$edge->[0]} ) {
 887                 $subend{$edge->[0]} = $edge->[1];
 888         } elsif( $used{$edge->[1]} ) {
 889                 $substart{$edge->[1]} = $edge->[0];
 890         }
 891     }
 892
 893     # If we are asked to, add relationship links
 894     if( exists $opts->{show_relations} ) {
 895         my $filter = $opts->{show_relations}; # can be 'transposition' or 'all'
 896         if( $filter eq 'transposition' ) {
 897                 $filter =~ qr/^transposition$/;
 898         }
 899         foreach my $redge ( $self->relationships ) {
 900                 if( $used{$redge->[0]} && $used{$redge->[1]} ) {
 901                         if( $filter ne 'all' ) {
 902                                 my $rel = $self->get_relationship( $redge );
 903                                 next unless $rel->type =~ /$filter/;
 904                                         my $variables = {
 905                                                 arrowhead => 'none',
 906                                                 color => '#FFA14F',
 907                                                 constraint => 'false',
 908                                                 label => uc( substr( $rel->type, 0, 4 ) ),
 909                                                 penwidth => '3',
 910                                         };
 911                                         $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
 912                                                 $redge->[0], $redge->[1], _dot_attr_string( $variables ) );
 913                                 }
 914                 }
 915         }
 916     }
 917
 918     # Add substitute start and end edges if necessary
 919     foreach my $node ( keys %substart ) {
 920         my $witstr = $self->_path_display_label ( $self->path_witnesses( $substart{$node}, $node ) );
 921         my $variables = { %edge_attrs, 'label' => $witstr };
 922         my $nrdg = $self->reading( $node );
 923         if( $nrdg->has_rank && $nrdg->rank > $startrank ) {
 924                 # Substart is actually one lower than $startrank
 925                 $variables->{'minlen'} = $nrdg->rank - ( $startrank - 1 );
 926         }
 927         my $varopts = _dot_attr_string( $variables );
 928         $dot .= "\t\"__SUBSTART__\" -> \"$node\" $varopts;\n";
 929         }
 930     foreach my $node ( keys %subend ) {
 931         my $witstr = $self->_path_display_label ( $self->path_witnesses( $node, $subend{$node} ) );
 932         my $variables = { %edge_attrs, 'label' => $witstr };
 933         my $varopts = _dot_attr_string( $variables );
 934         $dot .= "\t\"$node\" -> \"__SUBEND__\" $varopts;\n";
 935         }
 936         # HACK part 2
 937         if( $STRAIGHTENHACK ) {
 938                 my $endlabel = $endrank ? '__SUBEND__' : '__END__';
 939                 $dot .= "\t\"$endlabel\" -> \"#SILENT#\" [ color=white,penwidth=0 ];\n";
 940         }
 941
 942     $dot .= "}\n";
 943     return $dot;
 944 }
 945
 946 sub _dot_attr_string {
 947         my( $hash ) = @_;
 948         my @attrs;
 949         foreach my $k ( sort keys %$hash ) {
 950                 my $v = $hash->{$k};
 951                 push( @attrs, $k.'="'.$v.'"' );
 952         }
 953         return( '[ ' . join( ', ', @attrs ) . ' ]' );
 954 }
 955
 956 sub _add_edge_weights {
 957         my $self = shift;
 958         # Walk the graph from START to END, choosing the successor node with
 959         # the largest number of witness paths each time.
 960         my $weighted = {};
 961         my $curr = $self->start->id;
 962         my $ranked = $self->end->has_rank;
 963         while( $curr ne $self->end->id ) {
 964                 my $rank = $ranked ? $self->reading( $curr )->rank : 0;
 965                 my @succ = sort { $self->path_witnesses( $curr, $a )
 966                                                         <=> $self->path_witnesses( $curr, $b ) }
 967                         $self->sequence->successors( $curr );
 968                 my $next = pop @succ;
 969                 my $nextrank = $ranked ? $self->reading( $next )->rank : 0;
 970                 # Try to avoid lacunae in the weighted path.
 971                 while( @succ &&
 972                            ( $self->reading( $next )->is_lacuna ||
 973                                  $nextrank - $rank > 1 ) ){
 974                         $next = pop @succ;
 975                 }
 976                 $weighted->{$curr} = $next;
 977                 $curr = $next;
 978         }
 979         return $weighted;
 980 }
 981
 982 =head2 path_witnesses( $edge )
 983
 984 Returns the list of sigils whose witnesses are associated with the given edge.
 985 The edge can be passed as either an array or an arrayref of ( $source, $target ).
 986
 987 =cut
 988
 989 sub path_witnesses {
 990         my( $self, @edge ) = @_;
 991         # If edge is an arrayref, cope.
 992         if( @edge == 1 && ref( $edge[0] ) eq 'ARRAY' ) {
 993                 my $e = shift @edge;
 994                 @edge = @$e;
 995         }
 996         my @wits = keys %{$self->sequence->get_edge_attributes( @edge )};
 997         return @wits;
 998 }
 999
1000 # Helper function. Make a display label for the given witnesses, showing a.c.
1001 # witnesses only where the main witness is not also in the list.
1002 sub _path_display_label {
1003         my $self = shift;
1004         my %wits;
1005         map { $wits{$_} = 1 } @_;
1006
1007         # If an a.c. wit is listed, remove it if the main wit is also listed.
1008         # Otherwise keep it for explicit listing.
1009         my $aclabel = $self->ac_label;
1010         my @disp_ac;
1011         foreach my $w ( sort keys %wits ) {
1012                 if( $w =~ /^(.*)\Q$aclabel\E$/ ) {
1013                         if( exists $wits{$1} ) {
1014                                 delete $wits{$w};
1015                         } else {
1016                                 push( @disp_ac, $w );
1017                         }
1018                 }
1019         }
1020
1021         # See if we are in a majority situation.
1022         my $maj = scalar( $self->tradition->witnesses ) * 0.6;
1023         $maj = $maj > 5 ? $maj : 5;
1024         if( scalar keys %wits > $maj ) {
1025                 unshift( @disp_ac, 'majority' );
1026                 return join( ', ', @disp_ac );
1027         } else {
1028                 return join( ', ', sort keys %wits );
1029         }
1030 }
1031
1032 =head2 readings_at_rank( $rank )
1033
1034 Returns a list of readings at a given rank, taken from the alignment table.
1035
1036 =cut
1037
1038 sub readings_at_rank {
1039         my( $self, $rank ) = @_;
1040         my $table = $self->alignment_table;
1041         # Table rank is real rank - 1.
1042         my @elements = map { $_->{'tokens'}->[$rank-1] } @{$table->{'alignment'}};
1043         my %readings;
1044         foreach my $e ( @elements ) {
1045                 next unless ref( $e ) eq 'HASH';
1046                 next unless exists $e->{'t'};
1047                 $readings{$e->{'t'}->id} = $e->{'t'};
1048         }
1049         return values %readings;
1050 }
1051
1052 =head2 as_graphml
1053
1054 Returns a GraphML representation of the collation.  The GraphML will contain
1055 two graphs. The first expresses the attributes of the readings and the witness
1056 paths that link them; the second expresses the relationships that link the
1057 readings.  This is the native transfer format for a tradition.
1058
1059 =begin testing
1060
1061 use Text::Tradition;
1062 use TryCatch;
1063
1064 my $READINGS = 311;
1065 my $PATHS = 361;
1066
1067 my $datafile = 't/data/florilegium_tei_ps.xml';
1068 my $tradition = Text::Tradition->new( 'input' => 'TEI',
1069                                       'name' => 'test0',
1070                                       'file' => $datafile,
1071                                       'linear' => 1 );
1072
1073 ok( $tradition, "Got a tradition object" );
1074 is( scalar $tradition->witnesses, 13, "Found all witnesses" );
1075 ok( $tradition->collation, "Tradition has a collation" );
1076
1077 my $c = $tradition->collation;
1078 is( scalar $c->readings, $READINGS, "Collation has all readings" );
1079 is( scalar $c->paths, $PATHS, "Collation has all paths" );
1080 is( scalar $c->relationships, 0, "Collation has all relationships" );
1081
1082 # Add a few relationships
1083 $c->add_relationship( 'w123', 'w125', { 'type' => 'collated' } );
1084 $c->add_relationship( 'w193', 'w196', { 'type' => 'collated' } );
1085 $c->add_relationship( 'w257', 'w262', { 'type' => 'transposition' } );
1086
1087 # Now write it to GraphML and parse it again.
1088
1089 my $graphml = $c->as_graphml;
1090 my $st = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml );
1091 is( scalar $st->collation->readings, $READINGS, "Reparsed collation has all readings" );
1092 is( scalar $st->collation->paths, $PATHS, "Reparsed collation has all paths" );
1093 is( scalar $st->collation->relationships, 3, "Reparsed collation has new relationships" );
1094
1095 # Now add a stemma, write to GraphML, and look at the output.
1096 SKIP: {
1097         skip "Analysis module not present", 3 unless $tradition->can( 'add_stemma' );
1098         my $stemma = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
1099         is( ref( $stemma ), 'Text::Tradition::Stemma', "Parsed dotfile into stemma" );
1100         is( $tradition->stemmata, 1, "Tradition now has the stemma" );
1101         $graphml = $c->as_graphml;
1102         like( $graphml, qr/digraph/, "Digraph declaration exists in GraphML" );
1103 }
1104
1105 =end testing
1106
1107 =cut
1108
1109 ## TODO MOVE this to Tradition.pm and modularize it better
1110 sub as_graphml {
1111     my( $self, $options ) = @_;
1112         $self->calculate_ranks unless $self->_graphcalc_done;
1113
1114         my $start = $options->{'from'}
1115                 ? $self->reading( $options->{'from'} ) : $self->start;
1116         my $end = $options->{'to'}
1117                 ? $self->reading( $options->{'to'} ) : $self->end;
1118         if( $start->has_rank && $end->has_rank && $end->rank < $start->rank ) {
1119                 throw( 'Start node must be before end node' );
1120         }
1121         # The readings need to be ranked for this to work.
1122         $start = $self->start unless $start->has_rank;
1123         $end = $self->end unless $end->has_rank;
1124         my $rankoffset = 0;
1125         unless( $start eq $self->start ) {
1126                 $rankoffset = $start->rank - 1;
1127         }
1128         my %use_readings;
1129
1130     # Some namespaces
1131     my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns';
1132     my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance';
1133     my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' .
1134         'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
1135
1136     # Create the document and root node
1137     require XML::LibXML;
1138     my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
1139     my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
1140     $graphml->setDocumentElement( $root );
1141     $root->setNamespace( $xsi_ns, 'xsi', 0 );
1142     $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
1143
1144     # List of attribute types to save on our objects and their corresponding
1145     # GraphML types
1146     my %save_types = (
1147         'Str' => 'string',
1148         'Int' => 'int',
1149         'Bool' => 'boolean',
1150         'ReadingID' => 'string',
1151         'RelationshipType' => 'string',
1152         'RelationshipScope' => 'string',
1153     );
1154
1155     # Add the data keys for the graph. Include an extra key 'version' for the
1156     # GraphML output version.
1157     my %graph_data_keys;
1158     my $gdi = 0;
1159     my %graph_attributes = ( 'version' => 'string' );
1160         # Graph attributes include those of Tradition and those of Collation.
1161         my %gattr_from;
1162         # TODO Use meta introspection method from duplicate_reading to do this
1163         # instead of naming custom keys.
1164         my $tmeta = $self->tradition->meta;
1165         my $cmeta = $self->meta;
1166         map { $gattr_from{$_->name} = 'Tradition' } $tmeta->get_all_attributes;
1167         map { $gattr_from{$_->name} = 'Collation' } $cmeta->get_all_attributes;
1168         foreach my $attr ( ( $tmeta->get_all_attributes, $cmeta->get_all_attributes ) ) {
1169                 next if $attr->name =~ /^_/;
1170                 next unless $save_types{$attr->type_constraint->name};
1171                 $graph_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1172         }
1173     # Extra custom keys for complex objects that should be saved in some form.
1174     # The subroutine should return a string, or undef/empty.
1175     if( $tmeta->has_method('stemmata') ) {
1176                 $graph_attributes{'stemmata'} = sub {
1177                         my @stemstrs;
1178                         map { push( @stemstrs, $_->editable( {linesep => ''} ) ) }
1179                                 $self->tradition->stemmata;
1180                         join( "\n", @stemstrs );
1181                 };
1182         }
1183
1184         if( $tmeta->has_method('user') ) {
1185                 $graph_attributes{'user'} = sub {
1186                         $self->tradition->user ? $self->tradition->user->id : undef
1187                 };
1188         }
1189
1190     foreach my $datum ( sort keys %graph_attributes ) {
1191         $graph_data_keys{$datum} = 'dg'.$gdi++;
1192         my $key = $root->addNewChild( $graphml_ns, 'key' );
1193         my $dtype = ref( $graph_attributes{$datum} ) ? 'string'
1194                 : $graph_attributes{$datum};
1195         $key->setAttribute( 'attr.name', $datum );
1196         $key->setAttribute( 'attr.type', $dtype );
1197         $key->setAttribute( 'for', 'graph' );
1198         $key->setAttribute( 'id', $graph_data_keys{$datum} );
1199     }
1200
1201     # Add the data keys for reading nodes
1202     my %reading_attributes;
1203     my $rmeta = Text::Tradition::Collation::Reading->meta;
1204     foreach my $attr( $rmeta->get_all_attributes ) {
1205                 next if $attr->name =~ /^_/;
1206                 next unless $save_types{$attr->type_constraint->name};
1207                 $reading_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1208         }
1209         if( $self->start->does('Text::Tradition::Morphology' ) ) {
1210                 # Extra custom key for the reading morphology
1211                 $reading_attributes{'lexemes'} = 'string';
1212         }
1213
1214     my %node_data_keys;
1215     my $ndi = 0;
1216     foreach my $datum ( sort keys %reading_attributes ) {
1217         $node_data_keys{$datum} = 'dn'.$ndi++;
1218         my $key = $root->addNewChild( $graphml_ns, 'key' );
1219         $key->setAttribute( 'attr.name', $datum );
1220         $key->setAttribute( 'attr.type', $reading_attributes{$datum} );
1221         $key->setAttribute( 'for', 'node' );
1222         $key->setAttribute( 'id', $node_data_keys{$datum} );
1223     }
1224
1225     # Add the data keys for edges, that is, paths and relationships. Path
1226     # data does not come from a Moose class so is here manually.
1227     my $edi = 0;
1228     my %edge_data_keys;
1229     my %edge_attributes = (
1230         witness => 'string',                    # ID/label for a path
1231         extra => 'boolean',                             # Path key
1232         );
1233     my @path_attributes = keys %edge_attributes; # track our manual additions
1234     my $pmeta = Text::Tradition::Collation::Relationship->meta;
1235     foreach my $attr( $pmeta->get_all_attributes ) {
1236                 next if $attr->name =~ /^_/;
1237                 next unless $save_types{$attr->type_constraint->name};
1238                 $edge_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1239         }
1240     foreach my $datum ( sort keys %edge_attributes ) {
1241         $edge_data_keys{$datum} = 'de'.$edi++;
1242         my $key = $root->addNewChild( $graphml_ns, 'key' );
1243         $key->setAttribute( 'attr.name', $datum );
1244         $key->setAttribute( 'attr.type', $edge_attributes{$datum} );
1245         $key->setAttribute( 'for', 'edge' );
1246         $key->setAttribute( 'id', $edge_data_keys{$datum} );
1247     }
1248
1249     # Add the collation graph itself. First, sanitize the name to a valid XML ID.
1250     my $xmlidname = $self->tradition->name;
1251     $xmlidname =~ s/(?!$xml10_namechar_rx)./_/g;
1252     if( $xmlidname !~ /^$xml10_namestartchar_rx/ ) {
1253         $xmlidname = '_'.$xmlidname;
1254     }
1255     my $sgraph = $root->addNewChild( $graphml_ns, 'graph' );
1256     $sgraph->setAttribute( 'edgedefault', 'directed' );
1257     $sgraph->setAttribute( 'id', $xmlidname );
1258     $sgraph->setAttribute( 'parse.edgeids', 'canonical' );
1259     $sgraph->setAttribute( 'parse.edges', 0 ); # fill in later
1260     $sgraph->setAttribute( 'parse.nodeids', 'canonical' );
1261     $sgraph->setAttribute( 'parse.nodes', 0 ); # fill in later
1262     $sgraph->setAttribute( 'parse.order', 'nodesfirst' );
1263
1264     # Tradition/collation attribute data
1265     foreach my $datum ( keys %graph_attributes ) {
1266         my $value;
1267         if( $datum eq 'version' ) {
1268                 $value = '3.2';
1269         } elsif( ref( $graph_attributes{$datum} ) ) {
1270                 my $sub = $graph_attributes{$datum};
1271                 $value = &$sub();
1272         } elsif( $gattr_from{$datum} eq 'Tradition' ) {
1273                 $value = $self->tradition->$datum;
1274         } else {
1275                 $value = $self->$datum;
1276         }
1277                 _add_graphml_data( $sgraph, $graph_data_keys{$datum}, $value );
1278         }
1279
1280     my $node_ctr = 0;
1281     my %node_hash;
1282     # Add our readings to the graph
1283     foreach my $n ( sort { $a->id cmp $b->id } $self->readings ) {
1284         next if $n->has_rank && $n ne $self->start && $n ne $self->end &&
1285                 ( $n->rank < $start->rank || $n->rank > $end->rank );
1286         $use_readings{$n->id} = 1;
1287         # Add to the main graph
1288         my $node_el = $sgraph->addNewChild( $graphml_ns, 'node' );
1289         my $node_xmlid = 'n' . $node_ctr++;
1290         $node_hash{ $n->id } = $node_xmlid;
1291         $node_el->setAttribute( 'id', $node_xmlid );
1292         foreach my $d ( keys %reading_attributes ) {
1293                 my $nval = $n->$d;
1294                 # Custom serialization
1295                 if( $d eq 'lexemes' ) {
1296                                 # If nval is a true value, we have lexemes so we need to
1297                                 # serialize them. Otherwise set nval to undef so that the
1298                                 # key is excluded from this reading.
1299                         $nval = $nval ? $n->_serialize_lexemes : undef;
1300                 } elsif( $d eq 'normal_form' && $n->normal_form eq $n->text ) {
1301                         $nval = undef;
1302                 }
1303                 if( $rankoffset && $d eq 'rank' && $n ne $self->start ) {
1304                         # Adjust the ranks within the subgraph.
1305                         $nval = $n eq $self->end ? $end->rank - $rankoffset + 1
1306                                 : $nval - $rankoffset;
1307                 }
1308                 _add_graphml_data( $node_el, $node_data_keys{$d}, $nval )
1309                         if defined $nval;
1310         }
1311     }
1312
1313     # Add the path edges to the sequence graph
1314     my $edge_ctr = 0;
1315     foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->sequence->edges() ) {
1316         # We add an edge in the graphml for every witness in $e.
1317         next unless( $use_readings{$e->[0]} || $use_readings{$e->[1]} );
1318         my @edge_wits = sort $self->path_witnesses( $e );
1319         $e->[0] = $self->start->id unless $use_readings{$e->[0]};
1320         $e->[1] = $self->end->id unless $use_readings{$e->[1]};
1321         # Skip any path from start to end; that witness is not in the subgraph.
1322         next if ( $e->[0] eq $self->start->id && $e->[1] eq $self->end->id );
1323         foreach my $wit ( @edge_wits ) {
1324                         my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
1325                                                                                 $node_hash{ $e->[0] },
1326                                                                                 $node_hash{ $e->[1] } );
1327                         my $edge_el = $sgraph->addNewChild( $graphml_ns, 'edge' );
1328                         $edge_el->setAttribute( 'source', $from );
1329                         $edge_el->setAttribute( 'target', $to );
1330                         $edge_el->setAttribute( 'id', $id );
1331
1332                         # It's a witness path, so add the witness
1333                         my $base = $wit;
1334                         my $key = $edge_data_keys{'witness'};
1335                         # Is this an ante-corr witness?
1336                         my $aclabel = $self->ac_label;
1337                         if( $wit =~ /^(.*)\Q$aclabel\E$/ ) {
1338                                 # Keep the base witness
1339                                 $base = $1;
1340                                 # ...and record that this is an 'extra' reading path
1341                                 _add_graphml_data( $edge_el, $edge_data_keys{'extra'}, $aclabel );
1342                         }
1343                         _add_graphml_data( $edge_el, $edge_data_keys{'witness'}, $base );
1344                 }
1345         }
1346
1347         # Report the actual number of nodes and edges that went in
1348         $sgraph->setAttribute( 'parse.edges', $edge_ctr );
1349         $sgraph->setAttribute( 'parse.nodes', $node_ctr );
1350
1351         # Add the relationship graph to the XML
1352         map { delete $edge_data_keys{$_} } @path_attributes;
1353         $self->relations->_as_graphml( $graphml_ns, $root, \%node_hash,
1354                 $node_data_keys{'id'}, \%edge_data_keys );
1355
1356     # Save and return the thing
1357     my $result = decode_utf8( $graphml->toString(1) );
1358     return $result;
1359 }
1360
1361 sub _add_graphml_data {
1362     my( $el, $key, $value ) = @_;
1363     return unless defined $value;
1364     my $data_el = $el->addNewChild( $el->namespaceURI, 'data' );
1365     $data_el->setAttribute( 'key', $key );
1366     $data_el->appendText( $value );
1367 }
1368
1369 =head2 as_csv
1370
1371 Returns a CSV alignment table representation of the collation graph, one
1372 row per witness (or witness uncorrected.)
1373
1374 =cut
1375
1376 sub as_csv {
1377     my( $self ) = @_;
1378     my $table = $self->alignment_table;
1379     my $csv = Text::CSV->new( { binary => 1, quote_null => 0 } );
1380     my @result;
1381     # Make the header row
1382     $csv->combine( map { $_->{'witness'} } @{$table->{'alignment'}} );
1383         push( @result, decode_utf8( $csv->string ) );
1384     # Make the rest of the rows
1385     foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
1386         my @rowobjs = map { $_->{'tokens'}->[$idx] } @{$table->{'alignment'}};
1387         my @row = map { $_ ? $_->{'t'}->text : $_ } @rowobjs;
1388         $csv->combine( @row );
1389         push( @result, decode_utf8( $csv->string ) );
1390     }
1391     return join( "\n", @result );
1392 }
1393
1394 =head2 alignment_table
1395
1396 Return a reference to an alignment table, in a slightly enhanced CollateX
1397 format which looks like this:
1398
1399  $table = { alignment => [ { witness => "SIGIL",
1400                              tokens => [ { t => "TEXT" }, ... ] },
1401                            { witness => "SIG2",
1402                              tokens => [ { t => "TEXT" }, ... ] },
1403                            ... ],
1404             length => TEXTLEN };
1405
1406 =cut
1407
1408 sub alignment_table {
1409     my( $self ) = @_;
1410     return $self->cached_table if $self->has_cached_table;
1411
1412     # Make sure we can do this
1413         throw( "Need a linear graph in order to make an alignment table" )
1414                 unless $self->linear;
1415     $self->calculate_ranks()
1416         unless $self->_graphcalc_done && $self->end->has_rank;
1417
1418     my $table = { 'alignment' => [], 'length' => $self->end->rank - 1 };
1419     my @all_pos = ( 1 .. $self->end->rank - 1 );
1420     foreach my $wit ( sort { $a->sigil cmp $b->sigil } $self->tradition->witnesses ) {
1421         # say STDERR "Making witness row(s) for " . $wit->sigil;
1422         my @wit_path = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
1423         my @row = _make_witness_row( \@wit_path, \@all_pos );
1424         my $witobj = { 'witness' => $wit->sigil, 'tokens' => \@row };
1425         $witobj->{'identifier'} = $wit->identifier if $wit->identifier;
1426         push( @{$table->{'alignment'}}, $witobj );
1427         if( $wit->is_layered ) {
1428                 my @wit_ac_path = $self->reading_sequence( $self->start, $self->end,
1429                         $wit->sigil.$self->ac_label );
1430             my @ac_row = _make_witness_row( \@wit_ac_path, \@all_pos );
1431             my $witacobj = { 'witness' => $wit->sigil.$self->ac_label,
1432                 'tokens' => \@ac_row };
1433             $witacobj->{'identifier'} = $wit->identifier if $wit->identifier;
1434                         push( @{$table->{'alignment'}}, $witacobj );
1435         }
1436     }
1437     $self->cached_table( $table );
1438     return $table;
1439 }
1440
1441 sub _make_witness_row {
1442     my( $path, $positions ) = @_;
1443     my %char_hash;
1444     map { $char_hash{$_} = undef } @$positions;
1445     my $debug = 0;
1446     foreach my $rdg ( @$path ) {
1447         say STDERR "rank " . $rdg->rank if $debug;
1448         # say STDERR "No rank for " . $rdg->id unless defined $rdg->rank;
1449         $char_hash{$rdg->rank} = { 't' => $rdg };
1450     }
1451     my @row = map { $char_hash{$_} } @$positions;
1452     # Fill in lacuna markers for undef spots in the row
1453     my $last_el = shift @row;
1454     my @filled_row = ( $last_el );
1455     foreach my $el ( @row ) {
1456         # If we are using node reference, make the lacuna node appear many times
1457         # in the table.  If not, use the lacuna tag.
1458         if( $last_el && $last_el->{'t'}->is_lacuna && !defined $el ) {
1459             $el = $last_el;
1460         }
1461         push( @filled_row, $el );
1462         $last_el = $el;
1463     }
1464     return @filled_row;
1465 }
1466
1467
1468 =head1 NAVIGATION METHODS
1469
1470 =head2 reading_sequence( $first, $last, $sigil, $backup )
1471
1472 Returns the ordered list of readings, starting with $first and ending
1473 with $last, for the witness given in $sigil. If a $backup sigil is
1474 specified (e.g. when walking a layered witness), it will be used wherever
1475 no $sigil path exists.  If there is a base text reading, that will be
1476 used wherever no path exists for $sigil or $backup.
1477
1478 =cut
1479
1480 # TODO Think about returning some lazy-eval iterator.
1481 # TODO Get rid of backup; we should know from what witness is whether we need it.
1482
1483 sub reading_sequence {
1484     my( $self, $start, $end, $witness ) = @_;
1485
1486     $witness = $self->baselabel unless $witness;
1487     my @readings = ( $start );
1488     my %seen;
1489     my $n = $start;
1490     while( $n && $n->id ne $end->id ) {
1491         if( exists( $seen{$n->id} ) ) {
1492             throw( "Detected loop for $witness at " . $n->id );
1493         }
1494         $seen{$n->id} = 1;
1495
1496         my $next = $self->next_reading( $n, $witness );
1497         unless( $next ) {
1498             throw( "Did not find any path for $witness from reading " . $n->id );
1499         }
1500         push( @readings, $next );
1501         $n = $next;
1502     }
1503     # Check that the last reading is our end reading.
1504     my $last = $readings[$#readings];
1505     throw( "Last reading found from " . $start->text .
1506         " for witness $witness is not the end!" ) # TODO do we get this far?
1507         unless $last->id eq $end->id;
1508
1509     return @readings;
1510 }
1511
1512 =head2 next_reading( $reading, $sigil );
1513
1514 Returns the reading that follows the given reading along the given witness
1515 path.
1516
1517 =cut
1518
1519 sub next_reading {
1520     # Return the successor via the corresponding path.
1521     my $self = shift;
1522     my $answer = $self->_find_linked_reading( 'next', @_ );
1523         return undef unless $answer;
1524     return $self->reading( $answer );
1525 }
1526
1527 =head2 prior_reading( $reading, $sigil )
1528
1529 Returns the reading that precedes the given reading along the given witness
1530 path.
1531
1532 =cut
1533
1534 sub prior_reading {
1535     # Return the predecessor via the corresponding path.
1536     my $self = shift;
1537     my $answer = $self->_find_linked_reading( 'prior', @_ );
1538     return $self->reading( $answer );
1539 }
1540
1541 sub _find_linked_reading {
1542     my( $self, $direction, $node, $path ) = @_;
1543
1544     # Get a backup if we are dealing with a layered witness
1545     my $alt_path;
1546     my $aclabel = $self->ac_label;
1547     if( $path && $path =~ /^(.*)\Q$aclabel\E$/ ) {
1548         $alt_path = $1;
1549     }
1550
1551     my @linked_paths = $direction eq 'next'
1552         ? $self->sequence->edges_from( $node )
1553         : $self->sequence->edges_to( $node );
1554     return undef unless scalar( @linked_paths );
1555
1556     # We have to find the linked path that contains all of the
1557     # witnesses supplied in $path.
1558     my( @path_wits, @alt_path_wits );
1559     @path_wits = sort( $self->_witnesses_of_label( $path ) ) if $path;
1560     @alt_path_wits = sort( $self->_witnesses_of_label( $alt_path ) ) if $alt_path;
1561     my $base_le;
1562     my $alt_le;
1563     foreach my $le ( @linked_paths ) {
1564         if( $self->sequence->has_edge_attribute( @$le, $self->baselabel ) ) {
1565             $base_le = $le;
1566         }
1567                 my @le_wits = sort $self->path_witnesses( $le );
1568                 if( _is_within( \@path_wits, \@le_wits ) ) {
1569                         # This is the right path.
1570                         return $direction eq 'next' ? $le->[1] : $le->[0];
1571                 } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) {
1572                         $alt_le = $le;
1573                 }
1574     }
1575     # Got this far? Return the alternate path if it exists.
1576     return $direction eq 'next' ? $alt_le->[1] : $alt_le->[0]
1577         if $alt_le;
1578
1579     # Got this far? Return the base path if it exists.
1580     return $direction eq 'next' ? $base_le->[1] : $base_le->[0]
1581         if $base_le;
1582
1583     # Got this far? We have no appropriate path.
1584     warn "Could not find $direction node from " . $node->id
1585         . " along path $path";
1586     return undef;
1587 }
1588
1589 # Some set logic.
1590 sub _is_within {
1591     my( $set1, $set2 ) = @_;
1592     my $ret = @$set1; # will be 0, i.e. false, if set1 is empty
1593     foreach my $el ( @$set1 ) {
1594         $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
1595     }
1596     return $ret;
1597 }
1598
1599 # Return the string that joins together a list of witnesses for
1600 # display on a single path.
1601 sub _witnesses_of_label {
1602     my( $self, $label ) = @_;
1603     my $regex = $self->wit_list_separator;
1604     my @answer = split( /\Q$regex\E/, $label );
1605     return @answer;
1606 }
1607
1608 =head2 common_readings
1609
1610 Returns the list of common readings in the graph (i.e. those readings that are
1611 shared by all non-lacunose witnesses.)
1612
1613 =cut
1614
1615 sub common_readings {
1616         my $self = shift;
1617         my @common = grep { $_->is_common } $self->readings;
1618         return @common;
1619 }
1620
1621 =head2 path_text( $sigil, [, $start, $end ] )
1622
1623 Returns the text of a witness (plus its backup, if we are using a layer)
1624 as stored in the collation.  The text is returned as a string, where the
1625 individual readings are joined with spaces and the meta-readings (e.g.
1626 lacunae) are omitted.  Optional specification of $start and $end allows
1627 the generation of a subset of the witness text.
1628
1629 =cut
1630
1631 sub path_text {
1632         my( $self, $wit, $start, $end ) = @_;
1633         $start = $self->start unless $start;
1634         $end = $self->end unless $end;
1635         my @path = grep { !$_->is_meta } $self->reading_sequence( $start, $end, $wit );
1636         my $pathtext = '';
1637         my $last;
1638         foreach my $r ( @path ) {
1639                 unless ( $r->join_prior || !$last || $last->join_next ) {
1640                         $pathtext .= ' ';
1641                 }
1642                 $pathtext .= $r->text;
1643                 $last = $r;
1644         }
1645         return $pathtext;
1646 }
1647
1648 =head1 INITIALIZATION METHODS
1649
1650 These are mostly for use by parsers.
1651
1652 =head2 make_witness_path( $witness )
1653
1654 Link the array of readings contained in $witness->path (and in
1655 $witness->uncorrected_path if it exists) into collation paths.
1656 Clear out the arrays when finished.
1657
1658 =head2 make_witness_paths
1659
1660 Call make_witness_path for all witnesses in the tradition.
1661
1662 =cut
1663
1664 # For use when a collation is constructed from a base text and an apparatus.
1665 # We have the sequences of readings and just need to add path edges.
1666 # When we are done, clear out the witness path attributes, as they are no
1667 # longer needed.
1668 # TODO Find a way to replace the witness path attributes with encapsulated functions?
1669
1670 sub make_witness_paths {
1671     my( $self ) = @_;
1672     foreach my $wit ( $self->tradition->witnesses ) {
1673         # say STDERR "Making path for " . $wit->sigil;
1674         $self->make_witness_path( $wit );
1675     }
1676 }
1677
1678 sub make_witness_path {
1679     my( $self, $wit ) = @_;
1680     my @chain = @{$wit->path};
1681     my $sig = $wit->sigil;
1682     # Add start and end if necessary
1683     unshift( @chain, $self->start ) unless $chain[0] eq $self->start;
1684     push( @chain, $self->end ) unless $chain[-1] eq $self->end;
1685     foreach my $idx ( 0 .. $#chain-1 ) {
1686         $self->add_path( $chain[$idx], $chain[$idx+1], $sig );
1687     }
1688     if( $wit->is_layered ) {
1689         @chain = @{$wit->uncorrected_path};
1690                 unshift( @chain, $self->start ) unless $chain[0] eq $self->start;
1691                 push( @chain, $self->end ) unless $chain[-1] eq $self->end;
1692         foreach my $idx( 0 .. $#chain-1 ) {
1693             my $source = $chain[$idx];
1694             my $target = $chain[$idx+1];
1695             $self->add_path( $source, $target, $sig.$self->ac_label )
1696                 unless $self->has_path( $source, $target, $sig );
1697         }
1698     }
1699     $wit->clear_path;
1700     $wit->clear_uncorrected_path;
1701 }
1702
1703 =head2 calculate_ranks
1704
1705 Calculate the reading ranks (that is, their aligned positions relative
1706 to each other) for the graph.  This can only be called on linear collations.
1707
1708 =begin testing
1709
1710 use Text::Tradition;
1711
1712 my $cxfile = 't/data/Collatex-16.xml';
1713 my $t = Text::Tradition->new(
1714     'name'  => 'inline',
1715     'input' => 'CollateX',
1716     'file'  => $cxfile,
1717     );
1718 my $c = $t->collation;
1719
1720 # Make an svg
1721 my $table = $c->alignment_table;
1722 ok( $c->has_cached_table, "Alignment table was cached" );
1723 is( $c->alignment_table, $table, "Cached table returned upon second call" );
1724 $c->calculate_ranks;
1725 is( $c->alignment_table, $table, "Cached table retained with no rank change" );
1726 $c->add_relationship( 'n13', 'n23', { type => 'repetition' } );
1727 is( $c->alignment_table, $table, "Alignment table unchanged after non-colo relationship add" );
1728 $c->add_relationship( 'n24', 'n23', { type => 'spelling' } );
1729 isnt( $c->alignment_table, $table, "Alignment table changed after colo relationship add" );
1730
1731 =end testing
1732
1733 =cut
1734
1735 sub calculate_ranks {
1736     my $self = shift;
1737     # Save the existing ranks, in case we need to invalidate the cached SVG.
1738     my %existing_ranks;
1739     map { $existing_ranks{$_} = $_->rank } $self->readings;
1740
1741     # Do the rankings based on the relationship equivalence graph, starting
1742     # with the start node.
1743     my ( $node_ranks, $rank_nodes ) = $self->relations->equivalence_ranks();
1744
1745     # Transfer our rankings from the topological graph to the real one.
1746     foreach my $r ( $self->readings ) {
1747         if( defined $node_ranks->{$self->equivalence( $r->id )} ) {
1748             $r->rank( $node_ranks->{$self->equivalence( $r->id )} );
1749         } else {
1750                 # Die. Find the last rank we calculated.
1751                 my @all_defined = sort { ( $node_ranks->{$self->equivalence( $a->id )}||-1 )
1752                                  <=> ( $node_ranks->{$self->equivalence( $b->id )}||-1 ) }
1753                         $self->readings;
1754                 my $last = pop @all_defined;
1755             throw( "Ranks not calculated after $last - do you have a cycle in the graph?" );
1756         }
1757     }
1758     # Do we need to invalidate the cached data?
1759     if( $self->has_cached_table ) {
1760         foreach my $r ( $self->readings ) {
1761                 next if defined( $existing_ranks{$r} )
1762                         && $existing_ranks{$r} == $r->rank;
1763                 # Something has changed, so clear the cache
1764                 $self->_clear_cache;
1765                         # ...and recalculate the common readings.
1766                         $self->calculate_common_readings();
1767                 last;
1768         }
1769     }
1770         # The graph calculation information is now up to date.
1771         $self->_graphcalc_done(1);
1772 }
1773
1774 sub _clear_cache {
1775         my $self = shift;
1776         $self->wipe_table if $self->has_cached_table;
1777 }
1778
1779
1780 =head2 flatten_ranks
1781
1782 A convenience method for parsing collation data.  Searches the graph for readings
1783 with the same text at the same rank, and merges any that are found.
1784
1785 =cut
1786
1787 sub flatten_ranks {
1788     my $self = shift;
1789     my %unique_rank_rdg;
1790     my $changed;
1791     foreach my $rdg ( $self->readings ) {
1792         next unless $rdg->has_rank;
1793         my $key = $rdg->rank . "||" . $rdg->text;
1794         if( exists $unique_rank_rdg{$key} ) {
1795                 # Make sure they don't have different grammatical forms
1796                         my $ur = $unique_rank_rdg{$key};
1797                 if( $rdg->is_identical( $ur ) ) {
1798                                 # Combine!
1799                                 #say STDERR "Combining readings at same rank: $key";
1800                                 $changed = 1;
1801                                 $self->merge_readings( $unique_rank_rdg{$key}, $rdg );
1802                                 # TODO see if this now makes a common point.
1803                         }
1804         } else {
1805             $unique_rank_rdg{$key} = $rdg;
1806         }
1807     }
1808     # If we merged readings, the ranks are still fine but the alignment
1809     # table is wrong. Wipe it.
1810     $self->wipe_table() if $changed;
1811 }
1812
1813
1814 =head2 calculate_common_readings
1815
1816 Goes through the graph identifying the readings that appear in every witness
1817 (apart from those with lacunae at that spot.) Marks them as common and returns
1818 the list.
1819
1820 =begin testing
1821
1822 use Text::Tradition;
1823
1824 my $cxfile = 't/data/Collatex-16.xml';
1825 my $t = Text::Tradition->new(
1826     'name'  => 'inline',
1827     'input' => 'CollateX',
1828     'file'  => $cxfile,
1829     );
1830 my $c = $t->collation;
1831
1832 my @common = $c->calculate_common_readings();
1833 is( scalar @common, 8, "Found correct number of common readings" );
1834 my @marked = sort $c->common_readings();
1835 is( scalar @common, 8, "All common readings got marked as such" );
1836 my @expected = qw/ n1 n11 n16 n19 n20 n5 n6 n7 /;
1837 is_deeply( \@marked, \@expected, "Found correct list of common readings" );
1838
1839 =end testing
1840
1841 =cut
1842
1843 sub calculate_common_readings {
1844         my $self = shift;
1845         my @common;
1846         map { $_->is_common( 0 ) } $self->readings;
1847         # Implicitly calls calculate_ranks
1848         my $table = $self->alignment_table;
1849         foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
1850                 my @row = map { $_->{'tokens'}->[$idx]
1851                                                         ? $_->{'tokens'}->[$idx]->{'t'} : '' }
1852                                         @{$table->{'alignment'}};
1853                 my %hash;
1854                 foreach my $r ( @row ) {
1855                         if( $r ) {
1856                                 $hash{$r->id} = $r unless $r->is_meta;
1857                         } else {
1858                                 $hash{'UNDEF'} = $r;
1859                         }
1860                 }
1861                 if( keys %hash == 1 && !exists $hash{'UNDEF'} ) {
1862                         my( $r ) = values %hash;
1863                         $r->is_common( 1 );
1864                         push( @common, $r );
1865                 }
1866         }
1867         return @common;
1868 }
1869
1870 =head2 text_from_paths
1871
1872 Calculate the text array for all witnesses from the path, for later consistency
1873 checking.  Only to be used if there is no non-graph-based way to know the
1874 original texts.
1875
1876 =cut
1877
1878 sub text_from_paths {
1879         my $self = shift;
1880     foreach my $wit ( $self->tradition->witnesses ) {
1881         my @readings = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
1882         my @text;
1883         foreach my $r ( @readings ) {
1884                 next if $r->is_meta;
1885                 push( @text, $r->text );
1886         }
1887         $wit->text( \@text );
1888         if( $wit->is_layered ) {
1889                         my @ucrdgs = $self->reading_sequence( $self->start, $self->end,
1890                                                                                                   $wit->sigil.$self->ac_label );
1891                         my @uctext;
1892                         foreach my $r ( @ucrdgs ) {
1893                                 next if $r->is_meta;
1894                                 push( @uctext, $r->text );
1895                         }
1896                         $wit->layertext( \@uctext );
1897         }
1898     }
1899 }
1900
1901 =head1 UTILITY FUNCTIONS
1902
1903 =head2 common_predecessor( $reading_a, $reading_b )
1904
1905 Find the last reading that occurs in sequence before both the given readings.
1906 At the very least this should be $self->start.
1907
1908 =head2 common_successor( $reading_a, $reading_b )
1909
1910 Find the first reading that occurs in sequence after both the given readings.
1911 At the very least this should be $self->end.
1912
1913 =begin testing
1914
1915 use Text::Tradition;
1916
1917 my $cxfile = 't/data/Collatex-16.xml';
1918 my $t = Text::Tradition->new(
1919     'name'  => 'inline',
1920     'input' => 'CollateX',
1921     'file'  => $cxfile,
1922     );
1923 my $c = $t->collation;
1924
1925 is( $c->common_predecessor( 'n24', 'n23' )->id,
1926     'n20', "Found correct common predecessor" );
1927 is( $c->common_successor( 'n24', 'n23' )->id,
1928     '__END__', "Found correct common successor" );
1929
1930 is( $c->common_predecessor( 'n19', 'n17' )->id,
1931     'n16', "Found correct common predecessor for readings on same path" );
1932 is( $c->common_successor( 'n21', 'n10' )->id,
1933     '__END__', "Found correct common successor for readings on same path" );
1934
1935 =end testing
1936
1937 =cut
1938
1939 ## Return the closest reading that is a predecessor of both the given readings.
1940 sub common_predecessor {
1941         my $self = shift;
1942         my( $r1, $r2 ) = $self->_objectify_args( @_ );
1943         return $self->_common_in_path( $r1, $r2, 'predecessors' );
1944 }
1945
1946 sub common_successor {
1947         my $self = shift;
1948         my( $r1, $r2 ) = $self->_objectify_args( @_ );
1949         return $self->_common_in_path( $r1, $r2, 'successors' );
1950 }
1951
1952
1953 # TODO think about how to do this without ranks...
1954 sub _common_in_path {
1955         my( $self, $r1, $r2, $dir ) = @_;
1956         my $iter = $self->end->rank;
1957         my @candidates;
1958         my @last_r1 = ( $r1 );
1959         my @last_r2 = ( $r2 );
1960         # my %all_seen = ( $r1 => 'r1', $r2 => 'r2' );
1961         my %all_seen;
1962         # say STDERR "Finding common $dir for $r1, $r2";
1963         while( !@candidates ) {
1964                 last unless $iter--;  # Avoid looping infinitely
1965                 # Iterate separately down the graph from r1 and r2
1966                 my( @new_lc1, @new_lc2 );
1967                 foreach my $lc ( @last_r1 ) {
1968                         foreach my $p ( $lc->$dir ) {
1969                                 if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r1' ) {
1970                                         # say STDERR "Path candidate $p from $lc";
1971                                         push( @candidates, $p );
1972                                 } elsif( !$all_seen{$p->id} ) {
1973                                         $all_seen{$p->id} = 'r1';
1974                                         push( @new_lc1, $p );
1975                                 }
1976                         }
1977                 }
1978                 foreach my $lc ( @last_r2 ) {
1979                         foreach my $p ( $lc->$dir ) {
1980                                 if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r2' ) {
1981                                         # say STDERR "Path candidate $p from $lc";
1982                                         push( @candidates, $p );
1983                                 } elsif( !$all_seen{$p->id} ) {
1984                                         $all_seen{$p->id} = 'r2';
1985                                         push( @new_lc2, $p );
1986                                 }
1987                         }
1988                 }
1989                 @last_r1 = @new_lc1;
1990                 @last_r2 = @new_lc2;
1991         }
1992         my @answer = sort { $a->rank <=> $b->rank } @candidates;
1993         return $dir eq 'predecessors' ? pop( @answer ) : shift ( @answer );
1994 }
1995
1996 sub throw {
1997         Text::Tradition::Error->throw(
1998                 'ident' => 'Collation error',
1999                 'message' => $_[0],
2000                 );
2001 }
2002
2003 no Moose;
2004 __PACKAGE__->meta->make_immutable;
2005
2006 =head1 BUGS/TODO
2007
2008 =over
2009
2010 =item * Rework XML serialization in a more modular way
2011
2012 =back
2013
2014 =head1 LICENSE
2015
2016 This package is free software and is provided "as is" without express
2017 or implied warranty.  You can redistribute it and/or modify it under
2018 the same terms as Perl itself.
2019
2020 =head1 AUTHOR
2021
2022 Tara L Andrews E<lt>aurum@cpan.orgE<gt>