base/lib/Text/Tradition/Collation.pm

   1 package Text::Tradition::Collation;
   2
   3 use feature 'say';
   4 use Encode qw( decode_utf8 );
   5 use File::Temp;
   6 use File::Which;
   7 use Graph;
   8 use IPC::Run qw( run binary );
   9 use Text::CSV;
  10 use Text::Tradition::Collation::Data;
  11 use Text::Tradition::Collation::Reading;
  12 use Text::Tradition::Collation::RelationshipStore;
  13 use Text::Tradition::Error;
  14 use XML::Easy::Syntax qw( $xml10_namestartchar_rx $xml10_namechar_rx );
  15 use XML::LibXML;
  16 use XML::LibXML::XPathContext;
  17 use Moose;
  18
  19 has _data => (
  20         isa      => 'Text::Tradition::Collation::Data',
  21         is       => 'ro',
  22         required => 1,
  23         handles  => [ qw(
  24                 sequence
  25                 paths
  26                 _set_relations
  27                 relations
  28                 _set_start
  29                 _set_end
  30                 ac_label
  31                 has_cached_table
  32                 relationships
  33                 related_readings
  34                 get_relationship
  35                 del_relationship
  36                 equivalence
  37                 equivalence_graph
  38                 readings
  39                 reading
  40                 _add_reading
  41                 del_reading
  42                 has_reading
  43                 wit_list_separator
  44                 baselabel
  45                 linear
  46                 wordsep
  47                 start
  48                 end
  49                 cached_table
  50                 _graphcalc_done
  51                 has_cached_svg
  52                 wipe_table
  53         )]
  54 );
  55
  56 has 'tradition' => (
  57     is => 'ro',
  58     isa => 'Text::Tradition',
  59     writer => '_set_tradition',
  60     weak_ref => 1,
  61     );
  62
  63 =head1 NAME
  64
  65 Text::Tradition::Collation - a software model for a text collation
  66
  67 =head1 SYNOPSIS
  68
  69   use Text::Tradition;
  70   my $t = Text::Tradition->new(
  71     'name' => 'this is a text',
  72     'input' => 'TEI',
  73     'file' => '/path/to/tei_parallel_seg_file.xml' );
  74
  75   my $c = $t->collation;
  76   my @readings = $c->readings;
  77   my @paths = $c->paths;
  78   my @relationships = $c->relationships;
  79
  80   my $svg_variant_graph = $t->collation->as_svg();
  81
  82 =head1 DESCRIPTION
  83
  84 Text::Tradition is a library for representation and analysis of collated
  85 texts, particularly medieval ones.  The Collation is the central feature of
  86 a Tradition, where the text, its sequence of readings, and its relationships
  87 between readings are actually kept.
  88
  89 =head1 CONSTRUCTOR
  90
  91 =head2 new
  92
  93 The constructor.  Takes a hash or hashref of the following arguments:
  94
  95 =over
  96
  97 =item * tradition - The Text::Tradition object to which the collation
  98 belongs. Required.
  99
 100 =item * linear - Whether the collation should be linear; that is, whether
 101 transposed readings should be treated as two linked readings rather than one,
 102 and therefore whether the collation graph is acyclic.  Defaults to true.
 103
 104 =item * baselabel - The default label for the path taken by a base text
 105 (if any). Defaults to 'base text'.
 106
 107 =item * wit_list_separator - The string to join a list of witnesses for
 108 purposes of making labels in display graphs.  Defaults to ', '.
 109
 110 =item * ac_label - The extra label to tack onto a witness sigil when
 111 representing another layer of path for the given witness - that is, when
 112 a text has more than one possible reading due to scribal corrections or
 113 the like.  Defaults to ' (a.c.)'.
 114
 115 =item * wordsep - The string used to separate words in the original text.
 116 Defaults to ' '.
 117
 118 =back
 119
 120 =head1 ACCESSORS
 121
 122 =head2 tradition
 123
 124 =head2 linear
 125
 126 =head2 wit_list_separator
 127
 128 =head2 baselabel
 129
 130 =head2 ac_label
 131
 132 =head2 wordsep
 133
 134 Simple accessors for collation attributes.
 135
 136 =head2 start
 137
 138 The meta-reading at the start of every witness path.
 139
 140 =head2 end
 141
 142 The meta-reading at the end of every witness path.
 143
 144 =head2 readings
 145
 146 Returns all Reading objects in the graph.
 147
 148 =head2 reading( $id )
 149
 150 Returns the Reading object corresponding to the given ID.
 151
 152 =head2 add_reading( $reading_args )
 153
 154 Adds a new reading object to the collation.
 155 See L<Text::Tradition::Collation::Reading> for the available arguments.
 156
 157 =head2 del_reading( $object_or_id )
 158
 159 Removes the given reading from the collation, implicitly removing its
 160 paths and relationships.
 161
 162 =head2 has_reading( $id )
 163
 164 Predicate to see whether a given reading ID is in the graph.
 165
 166 =head2 reading_witnesses( $object_or_id )
 167
 168 Returns a list of sigils whose witnesses contain the reading.
 169
 170 =head2 paths
 171
 172 Returns all reading paths within the document - that is, all edges in the
 173 collation graph.  Each path is an arrayref of [ $source, $target ] reading IDs.
 174
 175 =head2 add_path( $source, $target, $sigil )
 176
 177 Links the given readings in the collation in sequence, under the given witness
 178 sigil.  The readings may be specified by object or ID.
 179
 180 =head2 del_path( $source, $target, $sigil )
 181
  182 Removes the link between the given readings in the collation for the given
  183 witness sigil.  The readings may be specified by object or ID.
 184
 185 =head2 has_path( $source, $target );
 186
 187 Returns true if the two readings are linked in sequence in any witness.
 188 The readings may be specified by object or ID.
 189
 190 =head2 relationships
 191
 192 Returns all Relationship objects in the collation.
 193
 194 =head2 add_relationship( $reading, $other_reading, $options )
 195
 196 Adds a new relationship of the type given in $options between the two readings,
 197 which may be specified by object or ID.  Returns a value of ( $status, @vectors)
 198 where $status is true on success, and @vectors is a list of relationship edges
 199 that were ultimately added.
 200 See L<Text::Tradition::Collation::Relationship> for the available options.
 201
 202 =cut
 203
 204 sub BUILDARGS {
 205         my ( $class, @args ) = @_;
 206         my %args = @args == 1 ? %{ $args[0] } : @args;
 207         # TODO determine these from the Moose::Meta object
 208         my @delegate_attrs = qw(sequence relations readings wit_list_separator baselabel
 209                 linear wordsep start end cached_table _graphcalc_done);
 210         my %data_args;
 211         for my $attr (@delegate_attrs) {
 212                 $data_args{$attr} = delete $args{$attr} if exists $args{$attr};
 213         }
 214         $args{_data} = Text::Tradition::Collation::Data->new(%data_args);
 215         return \%args;
 216 }
 217
 218 sub BUILD {
 219     my $self = shift;
 220     $self->_set_relations( Text::Tradition::Collation::RelationshipStore->new( 'collation' => $self ) );
 221     $self->_set_start( $self->add_reading(
 222         { 'collation' => $self, 'is_start' => 1, 'init' => 1 } ) );
 223     $self->_set_end( $self->add_reading(
 224         { 'collation' => $self, 'is_end' => 1, 'init' => 1 } ) );
 225 }
 226
 227 sub register_relationship_type {
 228         my $self = shift;
 229         my %args = @_ == 1 ? %{$_[0]} : @_;
 230         if( $self->relations->has_type( $args{name} ) ) {
 231                 throw( 'Relationship type ' . $args{name} . ' already registered' );
 232         }
 233         $self->relations->add_type( %args );
 234 }
 235
 236 sub get_relationship_type {
 237         my( $self, $name ) = @_;
 238                 return $self->relations->has_type( $name )
 239                         ? $self->relations->type( $name ) : undef;
 240 }
 241
 242 ### Reading construct/destruct functions
 243
 244 sub add_reading {
 245         my( $self, $reading ) = @_;
 246         unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
 247                 my %args = %$reading;
 248                 if( $args{'init'} ) {
 249                         # If we are initializing an empty collation, don't assume that we
 250                         # have set a tradition.
 251                         delete $args{'init'};
 252                 } elsif( $self->tradition->can('language') && $self->tradition->has_language
 253                         && !exists $args{'language'} ) {
 254                         $args{'language'} = $self->tradition->language;
 255                 }
 256                 $reading = Text::Tradition::Collation::Reading->new(
 257                         'collation' => $self,
 258                         %args );
 259         }
 260         # First check to see if a reading with this ID exists.
 261         if( $self->reading( $reading->id ) ) {
 262                 throw( "Collation already has a reading with id " . $reading->id );
 263         }
 264         $self->_graphcalc_done(0);
 265         $self->_add_reading( $reading->id => $reading );
 266         # Once the reading has been added, put it in both graphs.
 267         $self->sequence->add_vertex( $reading->id );
 268         $self->relations->add_reading( $reading->id );
 269         return $reading;
 270 };
 271
 272 around del_reading => sub {
 273         my $orig = shift;
 274         my $self = shift;
 275         my $arg = shift;
 276
 277         if( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
 278                 $arg = $arg->id;
 279         }
 280         # Remove the reading from the graphs.
 281         $self->_graphcalc_done(0);
 282         $self->_clear_cache; # Explicitly clear caches to GC the reading
 283         $self->sequence->delete_vertex( $arg );
 284         $self->relations->delete_reading( $arg );
 285
 286         # Carry on.
 287         $self->$orig( $arg );
 288 };
 289
 290 =head2 merge_readings( $main, $second, $concatenate, $with_str )
 291
 292 Merges the $second reading into the $main one. If $concatenate is true, then
 293 the merged node will carry the text of both readings, concatenated with either
 294 $with_str (if specified) or a sensible default (the empty string if the
 295 appropriate 'join_*' flag is set on either reading, or else $self->wordsep.)
 296
 297 The first two arguments may be either readings or reading IDs.
 298
 299 =begin testing
 300
 301 use Text::Tradition;
 302
 303 my $cxfile = 't/data/Collatex-16.xml';
 304 my $t = Text::Tradition->new(
 305     'name'  => 'inline',
 306     'input' => 'CollateX',
 307     'file'  => $cxfile,
 308     );
 309 my $c = $t->collation;
 310
 311 my $rno = scalar $c->readings;
 312 # Split n21 ('unto') for testing purposes
 313 my $new_r = $c->add_reading( { 'id' => 'n21p0', 'text' => 'un', 'join_next' => 1 } );
 314 my $old_r = $c->reading( 'n21' );
 315 $old_r->alter_text( 'to' );
 316 $c->del_path( 'n20', 'n21', 'A' );
 317 $c->add_path( 'n20', 'n21p0', 'A' );
 318 $c->add_path( 'n21p0', 'n21', 'A' );
 319 $c->add_relationship( 'n21', 'n22', { type => 'collated', scope => 'local' } );
 320 $c->flatten_ranks();
 321 ok( $c->reading( 'n21p0' ), "New reading exists" );
 322 is( scalar $c->readings, $rno, "Reading add offset by flatten_ranks" );
 323
 324 # Combine n3 and n4 ( with his )
 325 $c->merge_readings( 'n3', 'n4', 1 );
 326 ok( !$c->reading('n4'), "Reading n4 is gone" );
 327 is( $c->reading('n3')->text, 'with his', "Reading n3 has both words" );
 328
 329 # Collapse n9 and n10 ( rood / root )
 330 $c->merge_readings( 'n9', 'n10' );
 331 ok( !$c->reading('n10'), "Reading n10 is gone" );
 332 is( $c->reading('n9')->text, 'rood', "Reading n9 has an unchanged word" );
 333
 334 # Combine n21 and n21p0
 335 my $remaining = $c->reading('n21');
 336 $remaining ||= $c->reading('n22');  # one of these should still exist
 337 $c->merge_readings( 'n21p0', $remaining, 1 );
 338 ok( !$c->reading('n21'), "Reading $remaining is gone" );
 339 is( $c->reading('n21p0')->text, 'unto', "Reading n21p0 merged correctly" );
 340
 341 =end testing
 342
 343 =cut
 344
 345 sub merge_readings {
 346         my $self = shift;
 347
 348         # Sanity check
 349         my( $kept_obj, $del_obj, $combine, $combine_char ) = $self->_objectify_args( @_ );
 350         my $mergemeta = $kept_obj->is_meta;
 351         throw( "Cannot merge meta and non-meta reading" )
 352                 unless ( $mergemeta && $del_obj->is_meta )
 353                         || ( !$mergemeta && !$del_obj->is_meta );
 354         if( $mergemeta ) {
 355                 throw( "Cannot merge with start or end node" )
 356                         if( $kept_obj eq $self->start || $kept_obj eq $self->end
 357                                 || $del_obj eq $self->start || $del_obj eq $self->end );
 358                 throw( "Cannot combine text of meta readings" ) if $combine;
 359         }
 360         # We can only merge readings in a linear graph if:
 361         # - they are contiguous with only one edge between them, OR
 362         # - they are at equivalent ranks in the graph.
 363         if( $self->linear ) {
 364                 my @delpred = $del_obj->predecessors;
 365                 my @keptsuc = $kept_obj->successors;
 366                 unless ( @delpred == 1 && $delpred[0] eq $kept_obj
 367                         && @keptsuc == 1 && $keptsuc[0] eq $del_obj ) {
 368                         my( $is_ok, $msg ) = $self->relations->relationship_valid(
 369                                 $kept_obj, $del_obj, 'collated' );
 370                         unless( $is_ok ) {
 371                                 throw( "Readings $kept_obj and $del_obj can be neither concatenated nor collated" );
 372                         }
 373                 }
 374         }
 375
 376         # We only need the IDs for adding paths to the graph, not the reading
 377         # objects themselves.
 378         my $kept = $kept_obj->id;
 379         my $deleted = $del_obj->id;
 380         $self->_graphcalc_done(0);
 381
 382     # The kept reading should inherit the paths and the relationships
 383     # of the deleted reading.
 384         foreach my $path ( $self->sequence->edges_at( $deleted ) ) {
 385                 my @vector = ( $kept );
 386                 push( @vector, $path->[1] ) if $path->[0] eq $deleted;
 387                 unshift( @vector, $path->[0] ) if $path->[1] eq $deleted;
 388                 next if $vector[0] eq $vector[1]; # Don't add a self loop
 389                 my %wits = %{$self->sequence->get_edge_attributes( @$path )};
 390                 $self->sequence->add_edge( @vector );
 391                 my $fwits = $self->sequence->get_edge_attributes( @vector );
 392                 @wits{keys %$fwits} = values %$fwits;
 393                 $self->sequence->set_edge_attributes( @vector, \%wits );
 394         }
 395         $self->relations->merge_readings( $kept, $deleted, $combine );
 396
 397         # Do the deletion deed.
 398         if( $combine ) {
 399                 # Combine the text of the readings
 400                 my $joinstr = $combine_char;
 401                 unless( defined $joinstr ) {
 402                         $joinstr = '' if $kept_obj->join_next || $del_obj->join_prior;
 403                         $joinstr = $self->wordsep unless defined $joinstr;
 404                 }
 405                 $kept_obj->_combine( $del_obj, $joinstr );
 406         }
 407         $self->del_reading( $deleted );
 408 }
 409
 410 =head2 merge_related( @relationship_types )
 411
 412 Merge all readings linked with the relationship types given. If any of the selected type(s) is not a colocation, the graph will no longer be linear. The majority/plurality reading in each case will be the one kept.
 413
 414 WARNING: This operation cannot be undone.
 415
 416 =cut
 417
 418 =begin testing
 419
 420 use Text::Tradition;
 421 use TryCatch;
 422
 423 my $t = Text::Tradition->new(
 424     'name'  => 'inline',
 425     'input' => 'Self',
 426     'file'  => 't/data/legendfrag.xml',
 427     );
 428 my $c = $t->collation;
 429
 430 my %rdg_ids;
 431 map { $rdg_ids{$_} = 1 } $c->readings;
 432 $c->merge_related( 'orthographic' );
 433 is( scalar( $c->readings ), keys( %rdg_ids ) - 8,
 434         "Successfully collapsed orthographic variation" );
 435 map { $rdg_ids{$_} = undef } qw/ r13.3 r11.4 r8.5 r8.2 r7.7 r7.5 r7.4 r7.1 /;
 436 foreach my $rid ( keys %rdg_ids ) {
 437         my $exp = $rdg_ids{$rid};
 438         is( !$c->reading( $rid ), !$exp, "Reading $rid correctly " .
 439                 ( $exp ? "retained" : "removed" ) );
 440 }
 441 ok( $c->linear, "Graph is still linear" );
 442 try {
 443         $c->calculate_ranks; # This should succeed
 444         ok( 1, "Can still calculate ranks on the new graph" );
 445 } catch {
 446         ok( 0, "Rank calculation on merged graph failed: $@" );
 447 }
 448
 449 # Now add some transpositions
 450 $c->add_relationship( 'r8.4', 'r10.4', { type => 'transposition' } );
 451 $c->merge_related( 'transposition' );
 452 is( scalar( $c->readings ), keys( %rdg_ids ) - 9,
 453         "Transposed relationship is merged away" );
 454 ok( !$c->reading('r8.4'), "Correct transposed reading removed" );
 455 ok( !$c->linear, "Graph is no longer linear" );
 456 try {
 457         $c->calculate_ranks; # This should fail
 458         ok( 0, "Rank calculation happened on nonlinear graph?!" );
 459 } catch ( Text::Tradition::Error $e ) {
 460         is( $e->message, 'Cannot calculate ranks on a non-linear graph',
 461                 "Rank calculation on merged graph threw an error" );
 462 }
 463
 464
 465
 466 =end testing
 467
 468 =cut
 469
 470 # TODO: there should be a way to display merged without affecting the underlying data!
 471
 472 sub merge_related {
 473         my $self = shift;
 474         my %reltypehash;
 475         map { $reltypehash{$_} = 1 } @_;
 476
 477         # Set up the filter for finding related readings
 478         my $filter = sub {
 479                 exists $reltypehash{$_[0]->type};
 480         };
 481
 482         my $linear = 1;
 483         # Go through all readings looking for related ones
 484         foreach my $r ( $self->readings ) {
 485                 next unless $self->reading( "$r" ); # might have been deleted meanwhile
 486                 my @related = $self->related_readings( $r, $filter );
 487                 if( @related ) {
 488                         push( @related, $r );
 489                         @related = sort {
 490                                         scalar $b->witnesses <=> scalar $a->witnesses
 491                                 } @related;
 492                         my $keep = shift @related;
 493                         foreach my $delr ( @related ) {
 494                                 $linear = undef
 495                                         unless( $self->get_relationship( $keep, $delr )->colocated );
 496                                 $self->merge_readings( $keep, $delr );
 497                         }
 498                 }
 499         }
 500         $self->linear( $linear );
 501 }
 502
 503 =head2 compress_readings
 504
 505 Where possible in the graph, compresses plain sequences of readings into a
 506 single reading. The sequences must consist of readings with no
 507 relationships to other readings, with only a single witness path between
 508 them and no other witness paths from either that would skip the other. The
 509 readings must also not be marked as nonsense or bad grammar.
 510
 511 WARNING: This operation cannot be undone.
 512
 513 =cut
 514
 515 sub compress_readings {
 516         my $self = shift;
 517         # Anywhere in the graph that there is a reading that joins only to a single
 518         # successor, and neither of these have any relationships, just join the two
 519         # readings.
 520         foreach my $rdg ( sort { $a->rank <=> $b->rank } $self->readings ) {
 521                 # Now look for readings that can be joined to their successors.
 522                 next unless $rdg->is_combinable;
 523                 my %seen;
 524                 while( $self->sequence->successors( $rdg ) == 1 ) {
 525                         my( $next ) = $self->reading( $self->sequence->successors( $rdg ) );
 526                         throw( "Infinite loop" ) if $seen{$next->id};
 527                         $seen{$next->id} = 1;
 528                         last if $self->sequence->predecessors( $next ) > 1;
 529                         last unless $next->is_combinable;
 530                         say "Joining readings $rdg and $next";
 531                         $self->merge_readings( $rdg, $next, 1 );
 532                 }
 533         }
 534         # Make sure we haven't screwed anything up
 535         foreach my $wit ( $self->tradition->witnesses ) {
 536                 my $pathtext = $self->path_text( $wit->sigil );
 537                 my $origtext = join( ' ', @{$wit->text} );
 538                 throw( "Text differs for witness " . $wit->sigil )
 539                         unless $pathtext eq $origtext;
 540                 if( $wit->is_layered ) {
 541                         $pathtext = $self->path_text( $wit->sigil.$self->ac_label );
 542                         $origtext = join( ' ', @{$wit->layertext} );
 543                         throw( "Ante-corr text differs for witness " . $wit->sigil )
 544                                 unless $pathtext eq $origtext;
 545                 }
 546         }
 547
 548         $self->relations->rebuild_equivalence();
 549         $self->calculate_ranks();
 550 }
 551
 552 # Helper function for manipulating the graph.
 553 sub _stringify_args {
 554         my( $self, $first, $second, @args ) = @_;
 555     $first = $first->id
 556         if ref( $first ) eq 'Text::Tradition::Collation::Reading';
 557     $second = $second->id
 558         if ref( $second ) eq 'Text::Tradition::Collation::Reading';
 559     return( $first, $second, @args );
 560 }
 561
 562 # Helper function for manipulating the graph.
 563 sub _objectify_args {
 564         my( $self, $first, $second, $arg ) = @_;
 565     $first = $self->reading( $first )
 566         unless ref( $first ) eq 'Text::Tradition::Collation::Reading';
 567     $second = $self->reading( $second )
 568         unless ref( $second ) eq 'Text::Tradition::Collation::Reading';
 569     return( $first, $second, $arg );
 570 }
 571
 572 =head2 duplicate_reading( $reading, @witlist )
 573
 574 Split the given reading into two, so that the new reading is in the path for
 575 the witnesses given in @witlist. If the result is that certain non-colocated
 576 relationships (e.g. transpositions) are no longer valid, these will be removed.
 577 Returns the newly-created reading.
 578
 579 =begin testing
 580
 581 use Test::More::UTF8;
 582 use Text::Tradition;
 583 use TryCatch;
 584
 585 my $st = Text::Tradition->new( 'input' => 'Self', 'file' => 't/data/collatecorr.xml' );
 586 is( ref( $st ), 'Text::Tradition', "Got a tradition from test file" );
 587 ok( $st->has_witness('Ba96'), "Tradition has the affected witness" );
 588
 589 my $sc = $st->collation;
 590 my $numr = 17;
 591 ok( $sc->reading('n131'), "Tradition has the affected reading" );
 592 is( scalar( $sc->readings ), $numr, "There are $numr readings in the graph" );
 593 is( $sc->end->rank, 14, "There are fourteen ranks in the graph" );
 594
 595 # Detach the erroneously collated reading
 596 my( $newr, @del_rdgs ) = $sc->duplicate_reading( 'n131', 'Ba96' );
 597 ok( $newr, "New reading was created" );
 598 ok( $sc->reading('n131_0'), "Detached the bad collation with a new reading" );
 599 is( scalar( $sc->readings ), $numr + 1, "A reading was added to the graph" );
 600 is( $sc->end->rank, 10, "There are now only ten ranks in the graph" );
 601 my $csucc = $sc->common_successor( 'n131', 'n131_0' );
 602 is( $csucc->id, 'n136', "Found correct common successor to duped reading" );
 603
 604 # Check that the bad transposition is gone
 605 is( scalar @del_rdgs, 1, "Deleted reading was returned by API call" );
 606 is( $sc->get_relationship( 'n130', 'n135' ), undef, "Bad transposition relationship is gone" );
 607
 608 # The collation should not be fixed
 609 my @pairs = $sc->identical_readings();
 610 is( scalar @pairs, 0, "Not re-collated yet" );
 611 # Fix the collation
 612 ok( $sc->merge_readings( 'n124', 'n131_0' ), "Collated the readings correctly" );
 613 @pairs = $sc->identical_readings( start => 'n124', end => $csucc->id );
 614 is( scalar @pairs, 3, "Found three more identical readings" );
 615 is( $sc->end->rank, 11, "The ranks shifted appropriately" );
 616 $sc->flatten_ranks();
 617 is( scalar( $sc->readings ), $numr - 3, "Now we are collated correctly" );
 618
 619 # Check that we can't "duplicate" a reading with no wits or with all wits
 620 try {
 621         my( $badr, @del_rdgs ) = $sc->duplicate_reading( 'n124' );
 622         ok( 0, "Reading duplication without witnesses throws an error" );
 623 } catch( Text::Tradition::Error $e ) {
 624         like( $e->message, qr/Must specify one or more witnesses/,
 625                 "Reading duplication without witnesses throws the expected error" );
 626 } catch {
 627         ok( 0, "Reading duplication without witnesses threw the wrong error" );
 628 }
 629
 630 try {
 631         my( $badr, @del_rdgs ) = $sc->duplicate_reading( 'n124', 'Ba96', 'Mü11475' );
 632         ok( 0, "Reading duplication with all witnesses throws an error" );
 633 } catch( Text::Tradition::Error $e ) {
 634         like( $e->message, qr/Cannot join all witnesses/,
 635                 "Reading duplication with all witnesses throws the expected error" );
 636 } catch {
 637         ok( 0, "Reading duplication with all witnesses threw the wrong error" );
 638 }
 639
 640 =end testing
 641
 642 =cut
 643
 644 sub duplicate_reading {
 645         my( $self, $r, @wits ) = @_;
 646         # Check that we are not doing anything unwise.
 647         throw( "Must specify one or more witnesses for the duplicated reading" )
 648                 unless @wits;
 649         unless( ref( $r ) eq 'Text::Tradition::Collation::Reading' ) {
 650                 $r = $self->reading( $r );
 651         }
 652         throw( "Cannot duplicate a meta-reading" )
 653                 if $r->is_meta;
 654         throw( "Cannot join all witnesses to the new reading" )
 655                 if scalar( @wits ) == scalar( $r->witnesses );
 656
 657         # Get all the reading attributes and duplicate them.
 658         my $rmeta = Text::Tradition::Collation::Reading->meta;
 659         my %args;
 660     foreach my $attr( $rmeta->get_all_attributes ) {
 661                 next if $attr->name =~ /^_/;
 662                 my $acc = $attr->get_read_method;
 663                 if( !$acc && $attr->has_applied_traits ) {
 664                         my $tr = $attr->applied_traits;
 665                         if( $tr->[0] =~ /::(Array|Hash)$/ ) {
 666                                 my $which = $1;
 667                                 my %methods = reverse %{$attr->handles};
 668                                 $acc = $methods{elements};
 669                                 $args{$attr->name} = $which eq 'Array'
 670                                         ? [ $r->$acc ] : { $r->$acc };
 671                         }
 672                 } else {
 673                         $args{$attr->name} = $r->$acc if $acc;
 674                 }
 675         }
 676         # By definition the new reading will no longer be common.
 677         $args{is_common} = 0;
 678         # The new reading also needs its own ID.
 679         $args{id} = $self->_generate_dup_id( $r->id );
 680
 681         # Try to make the new reading.
 682         my $newr = $self->add_reading( \%args );
 683         # The old reading is also no longer common.
 684         $r->is_common( 0 );
 685
 686         # For each of the witnesses, dissociate from the old reading and
 687         # associate with the new.
 688         foreach my $wit ( @wits ) {
 689                 my $prior = $self->prior_reading( $r, $wit );
 690                 my $next = $self->next_reading( $r, $wit );
 691                 $self->del_path( $prior, $r, $wit );
 692                 $self->add_path( $prior, $newr, $wit );
 693                 $self->del_path( $r, $next, $wit );
 694                 $self->add_path( $newr, $next, $wit );
 695         }
 696
 697         # If the graph is ranked, we need to look for relationships that are now
 698         # invalid (i.e. 'non-colocation' types that might now be colocated) and
 699         # remove them. If not, we can skip it.
 700         my $succ;
 701         my %rrk;
 702         my @deleted_relations;
 703         if( $self->end->has_rank ) {
 704                 # Find the point where we can stop checking
 705                 $succ = $self->common_successor( $r, $newr );
 706
 707                 # Hash the existing ranks
 708                 foreach my $rdg ( $self->readings ) {
 709                         $rrk{$rdg->id} = $rdg->rank;
 710                 }
 711                 # Calculate the new ranks
 712                 $self->calculate_ranks();
 713
 714                 # Check for invalid non-colocated relationships among changed-rank readings
 715                 # from where the ranks start changing up to $succ
 716                 my $lastrank = $succ->rank;
 717                 foreach my $rdg ( $self->readings ) {
 718                         next if $rdg->rank > $lastrank;
 719                         next if $rdg->rank == $rrk{$rdg->id};
 720                         my @noncolo = $rdg->related_readings( sub { !$_[0]->colocated } );
 721                         next unless @noncolo;
 722                         foreach my $nc ( @noncolo ) {
 723                                 unless( $self->relations->verify_or_delete( $rdg, $nc ) ) {
 724                                         push( @deleted_relations, [ $rdg->id, $nc->id ] );
 725                                 }
 726                         }
 727                 }
 728         }
 729         return ( $newr, @deleted_relations );
 730 }
 731
 732 sub _generate_dup_id {
 733         my( $self, $rid ) = @_;
 734         my $newid;
 735         my $i = 0;
 736         while( !$newid ) {
 737                 $newid = $rid."_$i";
 738                 if( $self->has_reading( $newid ) ) {
 739                         $newid = '';
 740                         $i++;
 741                 }
 742         }
 743         return $newid;
 744 }
 745
 746 ### Path logic
 747
 748 sub add_path {
 749         my $self = shift;
 750
 751         # We only need the IDs for adding paths to the graph, not the reading
 752         # objects themselves.
 753     my( $source, $target, $wit ) = $self->_stringify_args( @_ );
 754
 755         $self->_graphcalc_done(0);
 756         # Connect the readings
 757         unless( $self->sequence->has_edge( $source, $target ) ) {
 758             $self->sequence->add_edge( $source, $target );
 759             $self->relations->add_equivalence_edge( $source, $target );
 760         }
 761     # Note the witness in question
 762     $self->sequence->set_edge_attribute( $source, $target, $wit, 1 );
 763 }
 764
 765 sub del_path {
 766         my $self = shift;
 767         my @args;
 768         if( ref( $_[0] ) eq 'ARRAY' ) {
 769                 my $e = shift @_;
 770                 @args = ( @$e, @_ );
 771         } else {
 772                 @args = @_;
 773         }
 774
 775         # We only need the IDs for removing paths from the graph, not the reading
 776         # objects themselves.
 777     my( $source, $target, $wit ) = $self->_stringify_args( @args );
 778
 779         $self->_graphcalc_done(0);
 780         if( $self->sequence->has_edge_attribute( $source, $target, $wit ) ) {
 781                 $self->sequence->delete_edge_attribute( $source, $target, $wit );
 782         }
 783         unless( $self->sequence->has_edge_attributes( $source, $target ) ) {
 784                 $self->sequence->delete_edge( $source, $target );
 785                 $self->relations->delete_equivalence_edge( $source, $target );
 786         }
 787 }
 788
 789
 790 # Extra graph-alike utility
 791 sub has_path {
 792         my $self = shift;
 793     my( $source, $target, $wit ) = $self->_stringify_args( @_ );
 794         return undef unless $self->sequence->has_edge( $source, $target );
 795         return $self->sequence->has_edge_attribute( $source, $target, $wit );
 796 }
 797
 798 =head2 clear_witness( @sigil_list )
 799
 800 Clear the given witnesses out of the collation entirely, removing references
 801 to them in paths, and removing readings that belong only to them.  Should only
 802 be called via $tradition->del_witness.
 803
 804 =cut
 805
 806 sub clear_witness {
 807         my( $self, @sigils ) = @_;
 808
 809         $self->_graphcalc_done(0);
 810         # Clear the witness(es) out of the paths
 811         foreach my $e ( $self->paths ) {
 812                 foreach my $sig ( @sigils ) {
 813                         $self->del_path( $e, $sig );
 814                 }
 815         }
 816
 817         # Clear out the newly unused readings
 818         foreach my $r ( $self->readings ) {
 819                 unless( $self->reading_witnesses( $r ) ) {
 820                         $self->del_reading( $r );
 821                 }
 822         }
 823 }
 824
 825 sub add_relationship {
 826         my $self = shift;
 827     my( $source, $target, $opts ) = $self->_stringify_args( @_ );
 828     my( @vectors ) = $self->relations->add_relationship( $source, $target, $opts );
 829     foreach my $v ( @vectors ) {
 830         next unless $self->get_relationship( $v )->colocated;
 831         if( $self->reading( $v->[0] )->has_rank && $self->reading( $v->[1] )->has_rank
 832                 && $self->reading( $v->[0] )->rank ne $self->reading( $v->[1] )->rank ) {
 833                         $self->_graphcalc_done(0);
 834                         $self->_clear_cache;
 835                         last;
 836         }
 837     }
 838     return @vectors;
 839 }
 840
 841 around qw/ get_relationship del_relationship / => sub {
 842         my $orig = shift;
 843         my $self = shift;
 844         my @args = @_;
 845         if( @args == 1 && ref( $args[0] ) eq 'ARRAY' ) {
 846                 @args = @{$_[0]};
 847         }
 848         my @stringargs = $self->_stringify_args( @args );
 849         $self->$orig( @stringargs );
 850 };
 851
 852 =head2 reading_witnesses( $reading )
 853
 854 Return a list of sigils corresponding to the witnesses in which the reading appears.
 855
 856 =cut
 857
 858 sub reading_witnesses {
 859         my( $self, $reading ) = @_;
 860         # We need only check either the incoming or the outgoing edges; I have
 861         # arbitrarily chosen "incoming".  Thus, special-case the start node.
 862         if( $reading eq $self->start ) {
 863                 return map { $_->sigil } grep { $_->is_collated } $self->tradition->witnesses;
 864         }
 865         my %all_witnesses;
 866         foreach my $e ( $self->sequence->edges_to( $reading ) ) {
 867                 my $wits = $self->sequence->get_edge_attributes( @$e );
 868                 @all_witnesses{ keys %$wits } = 1;
 869         }
 870         my $acstr = $self->ac_label;
 871         foreach my $acwit ( grep { $_ =~ s/^(.*)\Q$acstr\E$/$1/ } keys %all_witnesses ) {
 872                 delete $all_witnesses{$acwit.$acstr} if exists $all_witnesses{$acwit};
 873         }
 874         return keys %all_witnesses;
 875 }
 876
 877 =head1 OUTPUT METHODS
 878
 879 =head2 as_svg( \%options )
 880
 881 Returns an SVG string that represents the graph, via as_dot and graphviz.
 882 See as_dot for a list of options.  Must have GraphViz (dot) installed to run.
 883
 884 =cut
 885
 886 sub as_svg {
 887     my( $self, $opts ) = @_;
 888     throw( "Need GraphViz installed to output SVG" )
 889         unless File::Which::which( 'dot' );
 890     my $want_subgraph = exists $opts->{'from'} || exists $opts->{'to'};
 891     $self->calculate_ranks()
 892         unless( $self->_graphcalc_done || $opts->{'nocalc'} || !$self->linear );
 893         my @cmd = qw/dot -Tsvg/;
 894         my( $svg, $err );
 895         my $dotfile = File::Temp->new();
 896         ## USE FOR DEBUGGING
 897         # $dotfile->unlink_on_destroy(0);
 898         binmode $dotfile, ':utf8';
 899         print $dotfile $self->as_dot( $opts );
 900         push( @cmd, $dotfile->filename );
 901         run( \@cmd, ">", binary(), \$svg );
 902         $svg = decode_utf8( $svg );
 903         return $svg;
 904 }
 905
 906
 907 =head2 as_dot( \%options )
 908
 909 Returns a string that is the collation graph expressed in dot
 910 (i.e. GraphViz) format.  Options include:
 911
 912 =over 4
 913
 914 =item * from
 915
 916 =item * to
 917
 918 =item * color_common
 919
 920 =back
 921
 922 =cut
 923
 924 sub as_dot {
 925     my( $self, $opts ) = @_;
 926     my $startrank = $opts->{'from'} if $opts;
 927     my $endrank = $opts->{'to'} if $opts;
 928     my $color_common = $opts->{'color_common'} if $opts;
 929     my $STRAIGHTENHACK = !$startrank && !$endrank && $self->end->rank
 930        && $self->end->rank > 100;
 931     $STRAIGHTENHACK = 1 if $opts->{'straight'}; # even for subgraphs or small graphs
 932
 933     # Check the arguments
 934     if( $startrank ) {
 935         return if $endrank && $startrank > $endrank;
 936         return if $startrank > $self->end->rank;
 937         }
 938         if( defined $endrank ) {
 939                 return if $endrank < 0;
 940                 $endrank = undef if $endrank == $self->end->rank;
 941         }
 942
 943     my $graph_name = $self->tradition->name;
 944     $graph_name =~ s/[^\w\s]//g;
 945     $graph_name = join( '_', split( /\s+/, $graph_name ) );
 946
 947     my %graph_attrs = (
 948         'rankdir' => 'LR',
 949         'bgcolor' => 'none',
 950         );
 951     my %node_attrs = (
 952         'fontsize' => 14,
 953         'fillcolor' => 'white',
 954         'style' => 'filled',
 955         'shape' => 'ellipse'
 956         );
 957     my %edge_attrs = (
 958         'arrowhead' => 'open',
 959         'color' => '#000000',
 960         'fontcolor' => '#000000',
 961         );
 962
 963     my $dot = sprintf( "digraph %s {\n", $graph_name );
 964     $dot .= "\tgraph " . _dot_attr_string( \%graph_attrs ) . ";\n";
 965     $dot .= "\tnode " . _dot_attr_string( \%node_attrs ) . ";\n";
 966
 967         # Output substitute start/end readings if necessary
 968         if( $startrank ) {
 969                 $dot .= "\t\"__SUBSTART__\" [ label=\"...\",id=\"__START__\" ];\n";
 970         }
 971         if( $endrank ) {
 972                 $dot .= "\t\"__SUBEND__\" [ label=\"...\",id=\"__END__\" ];\n";
 973         }
 974         if( $STRAIGHTENHACK ) {
 975                 ## HACK part 1
 976                 my $startlabel = $startrank ? '__SUBSTART__' : '__START__';
 977                 $dot .= "\tsubgraph { rank=same \"$startlabel\" \"#SILENT#\" }\n";
 978                 $dot .= "\t\"#SILENT#\" [ shape=diamond,color=white,penwidth=0,label=\"\" ];"
 979         }
 980         my %used;  # Keep track of the readings that actually appear in the graph
 981         # Sort the readings by rank if we have ranks; this speeds layout.
 982         my @all_readings = $self->end->has_rank
 983                 ? sort { $a->rank <=> $b->rank } $self->readings
 984                 : $self->readings;
 985         # TODO Refrain from outputting lacuna nodes - just grey out the edges.
 986     foreach my $reading ( @all_readings ) {
 987         # Only output readings within our rank range.
 988         next if $startrank && $reading->rank < $startrank;
 989         next if $endrank && $reading->rank > $endrank;
 990         $used{$reading->id} = 1;
 991         # Need not output nodes without separate labels
 992         next if $reading->id eq $reading->text;
 993         my $rattrs;
 994         my $label = $reading->text;
 995         $label .= '-' if $reading->join_next;
 996         $label = "-$label" if $reading->join_prior;
 997         $label =~ s/\"/\\\"/g;
 998                 $rattrs->{'label'} = $label;
 999                 $rattrs->{'id'} = $reading->id;
1000                 $rattrs->{'fillcolor'} = '#b3f36d' if $reading->is_common && $color_common;
1001         $dot .= sprintf( "\t\"%s\" %s;\n", $reading->id, _dot_attr_string( $rattrs ) );
1002     }
1003
1004         # Add the real edges. Need to weight one edge per rank jump, in a
1005         # continuous line.
1006         # my $weighted = $self->_add_edge_weights;
1007     my @edges = $self->paths;
1008         my( %substart, %subend );
1009     foreach my $edge ( @edges ) {
1010         # Do we need to output this edge?
1011         if( $used{$edge->[0]} && $used{$edge->[1]} ) {
1012                 my $label = $self->_path_display_label( $opts,
1013                         $self->path_witnesses( $edge ) );
1014                         my $variables = { %edge_attrs, 'label' => $label };
1015
1016                         # Account for the rank gap if necessary
1017                         my $rank0 = $self->reading( $edge->[0] )->rank
1018                                 if $self->reading( $edge->[0] )->has_rank;
1019                         my $rank1 = $self->reading( $edge->[1] )->rank
1020                                 if $self->reading( $edge->[1] )->has_rank;
1021                         if( defined $rank0 && defined $rank1 && $rank1 - $rank0 > 1 ) {
1022                                 $variables->{'minlen'} = $rank1 - $rank0;
1023                         }
1024
1025                         # Add the calculated edge weights
1026                         # if( exists $weighted->{$edge->[0]}
1027                         #       && $weighted->{$edge->[0]} eq $edge->[1] ) {
1028                         #       # $variables->{'color'} = 'red';
1029                         #       $variables->{'weight'} = 3.0;
1030                         # }
1031
1032                         # EXPERIMENTAL: make edge width reflect no. of witnesses
1033                         my $extrawidth = scalar( $self->path_witnesses( $edge ) ) * 0.2;
1034                         $variables->{'penwidth'} = $extrawidth + 0.8; # gives 1 for a single wit
1035
1036                         my $varopts = _dot_attr_string( $variables );
1037                         $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
1038                                 $edge->[0], $edge->[1], $varopts );
1039         } elsif( $used{$edge->[0]} ) {
1040                 $subend{$edge->[0]} = $edge->[1];
1041         } elsif( $used{$edge->[1]} ) {
1042                 $substart{$edge->[1]} = $edge->[0];
1043         }
1044     }
1045
1046     # If we are asked to, add relationship links
1047     if( exists $opts->{show_relations} ) {
1048         my $filter = $opts->{show_relations}; # can be 'transposition' or 'all'
1049         if( $filter eq 'transposition' ) {
1050                 $filter =~ qr/^transposition$/;
1051         }
1052         my %typecolors;
1053         my @types = sort( map { $_->name } $self->relations->types );
1054         if( exists $opts->{graphcolors} ) {
1055                 foreach my $tdx ( 0 .. $#types ) {
1056                         $typecolors{$types[$tdx]} = $opts->{graphcolors}->[$tdx];
1057                 }
1058         } else {
1059                 map { $typecolors{$_} = '#FFA14F' } @types;
1060         }
1061         foreach my $redge ( $self->relationships ) {
1062                 if( $used{$redge->[0]} && $used{$redge->[1]} ) {
1063                                 my $rel = $self->get_relationship( $redge );
1064                                 next unless $filter eq 'all' || $rel->type =~ /$filter/;
1065                                 my $variables = {
1066                                         arrowhead => 'none',
1067                                         color => $typecolors{$rel->type},
1068                                         constraint => 'false',
1069                                         penwidth => '3',
1070                                 };
1071                                 unless( exists $opts->{graphcolors} ) {
1072                                         $variables->{label} = uc( substr( $rel->type, 0, 4 ) ),
1073                                 }
1074                                 $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
1075                                         $redge->[0], $redge->[1], _dot_attr_string( $variables ) );
1076                 }
1077         }
1078     }
1079
1080     # Add substitute start and end edges if necessary
1081     foreach my $node ( keys %substart ) {
1082         my $witstr = $self->_path_display_label( $opts,
1083                 $self->path_witnesses( $substart{$node}, $node ) );
1084         my $variables = { %edge_attrs, 'label' => $witstr };
1085         my $nrdg = $self->reading( $node );
1086         if( $nrdg->has_rank && $nrdg->rank > $startrank ) {
1087                 # Substart is actually one lower than $startrank
1088                 $variables->{'minlen'} = $nrdg->rank - ( $startrank - 1 );
1089         }
1090         my $varopts = _dot_attr_string( $variables );
1091         $dot .= "\t\"__SUBSTART__\" -> \"$node\" $varopts;\n";
1092         }
1093     foreach my $node ( keys %subend ) {
1094         my $witstr = $self->_path_display_label( $opts,
1095                 $self->path_witnesses( $node, $subend{$node} ) );
1096         my $variables = { %edge_attrs, 'label' => $witstr };
1097         my $varopts = _dot_attr_string( $variables );
1098         $dot .= "\t\"$node\" -> \"__SUBEND__\" $varopts;\n";
1099         }
1100         # HACK part 2
1101         if( $STRAIGHTENHACK ) {
1102                 my $endlabel = $endrank ? '__SUBEND__' : '__END__';
1103                 $dot .= "\t\"$endlabel\" -> \"#SILENT#\" [ color=white,penwidth=0 ];\n";
1104         }
1105
1106     $dot .= "}\n";
1107     return $dot;
1108 }
1109
1110 sub _dot_attr_string {
1111         my( $hash ) = @_;
1112         my @attrs;
1113         foreach my $k ( sort keys %$hash ) {
1114                 my $v = $hash->{$k};
1115                 push( @attrs, $k.'="'.$v.'"' );
1116         }
1117         return( '[ ' . join( ', ', @attrs ) . ' ]' );
1118 }
1119
1120 sub _add_edge_weights {
1121         my $self = shift;
1122         # Walk the graph from START to END, choosing the successor node with
1123         # the largest number of witness paths each time.
1124         my $weighted = {};
1125         my $curr = $self->start->id;
1126         my $ranked = $self->end->has_rank;
1127         while( $curr ne $self->end->id ) {
1128                 my $rank = $ranked ? $self->reading( $curr )->rank : 0;
1129                 my @succ = sort { $self->path_witnesses( $curr, $a )
1130                                                         <=> $self->path_witnesses( $curr, $b ) }
1131                         $self->sequence->successors( $curr );
1132                 my $next = pop @succ;
1133                 my $nextrank = $ranked ? $self->reading( $next )->rank : 0;
1134                 # Try to avoid lacunae in the weighted path.
1135                 while( @succ &&
1136                            ( $self->reading( $next )->is_lacuna ||
1137                                  $nextrank - $rank > 1 ) ){
1138                         $next = pop @succ;
1139                 }
1140                 $weighted->{$curr} = $next;
1141                 $curr = $next;
1142         }
1143         return $weighted;
1144 }
1145
1146 =head2 path_witnesses( $edge )
1147
1148 Returns the list of sigils whose witnesses are associated with the given edge.
1149 The edge can be passed as either an array or an arrayref of ( $source, $target ).
1150
1151 =cut
1152
1153 sub path_witnesses {
1154         my( $self, @edge ) = @_;
1155         # If edge is an arrayref, cope.
1156         if( @edge == 1 && ref( $edge[0] ) eq 'ARRAY' ) {
1157                 my $e = shift @edge;
1158                 @edge = @$e;
1159         }
1160         my @wits = keys %{$self->sequence->get_edge_attributes( @edge )};
1161         return @wits;
1162 }
1163
1164 # Helper function. Make a display label for the given witnesses, showing a.c.
1165 # witnesses only where the main witness is not also in the list.
1166 sub _path_display_label {
1167         my $self = shift;
1168         my $opts = shift;
1169         my %wits;
1170         map { $wits{$_} = 1 } @_;
1171
1172         # If an a.c. wit is listed, remove it if the main wit is also listed.
1173         # Otherwise keep it for explicit listing.
1174         my $aclabel = $self->ac_label;
1175         my @disp_ac;
1176         foreach my $w ( sort keys %wits ) {
1177                 if( $w =~ /^(.*)\Q$aclabel\E$/ ) {
1178                         if( exists $wits{$1} ) {
1179                                 delete $wits{$w};
1180                         } else {
1181                                 push( @disp_ac, $w );
1182                         }
1183                 }
1184         }
1185
1186         if( $opts->{'explicit_wits'} ) {
1187                 return join( ', ', sort keys %wits );
1188         } else {
1189                 # See if we are in a majority situation.
1190                 my $maj = scalar( $self->tradition->witnesses ) * 0.6;
1191                 $maj = $maj > 5 ? $maj : 5;
1192                 if( scalar keys %wits > $maj ) {
1193                         unshift( @disp_ac, 'majority' );
1194                         return join( ', ', @disp_ac );
1195                 } else {
1196                         return join( ', ', sort keys %wits );
1197                 }
1198         }
1199 }
1200
1201 =head2 readings_at_rank( $rank )
1202
1203 Returns a list of readings at a given rank, taken from the alignment table.
1204
1205 =cut
1206
1207 sub readings_at_rank {
1208         my( $self, $rank ) = @_;
1209         my $table = $self->alignment_table;
1210         # Table rank is real rank - 1.
1211         my @elements = map { $_->{'tokens'}->[$rank-1] } @{$table->{'alignment'}};
1212         my %readings;
1213         foreach my $e ( @elements ) {
1214                 next unless ref( $e ) eq 'HASH';
1215                 next unless exists $e->{'t'};
1216                 $readings{$e->{'t'}->id} = $e->{'t'};
1217         }
1218         return values %readings;
1219 }
1220
1221 =head2 as_graphml
1222
1223 Returns a GraphML representation of the collation.  The GraphML will contain
1224 two graphs. The first expresses the attributes of the readings and the witness
1225 paths that link them; the second expresses the relationships that link the
1226 readings.  This is the native transfer format for a tradition.
1227
1228 =begin testing
1229
1230 use Text::Tradition;
1231 use TryCatch;
1232
1233 my $READINGS = 311;
1234 my $PATHS = 361;
1235
1236 my $datafile = 't/data/florilegium_tei_ps.xml';
1237 my $tradition = Text::Tradition->new( 'input' => 'TEI',
1238                                       'name' => 'test0',
1239                                       'file' => $datafile,
1240                                       'linear' => 1 );
1241
1242 ok( $tradition, "Got a tradition object" );
1243 is( scalar $tradition->witnesses, 13, "Found all witnesses" );
1244 ok( $tradition->collation, "Tradition has a collation" );
1245
1246 my $c = $tradition->collation;
1247 is( scalar $c->readings, $READINGS, "Collation has all readings" );
1248 is( scalar $c->paths, $PATHS, "Collation has all paths" );
1249 is( scalar $c->relationships, 0, "Collation has all relationships" );
1250
1251 # Add a few relationships
1252 $c->add_relationship( 'w123', 'w125', { 'type' => 'collated' } );
1253 $c->add_relationship( 'w193', 'w196', { 'type' => 'collated' } );
1254 $c->add_relationship( 'w257', 'w262', { 'type' => 'transposition' } );
1255
1256 # Now write it to GraphML and parse it again.
1257
1258 my $graphml = $c->as_graphml;
1259 my $st = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml );
1260 is( scalar $st->collation->readings, $READINGS, "Reparsed collation has all readings" );
1261 is( scalar $st->collation->paths, $PATHS, "Reparsed collation has all paths" );
1262 is( scalar $st->collation->relationships, 3, "Reparsed collation has new relationships" );
1263
1264 # Now add a stemma, write to GraphML, and look at the output.
1265 SKIP: {
1266         skip "Analysis module not present", 3 unless $tradition->can( 'add_stemma' );
1267         my $stemma = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
1268         is( ref( $stemma ), 'Text::Tradition::Stemma', "Parsed dotfile into stemma" );
1269         is( $tradition->stemmata, 1, "Tradition now has the stemma" );
1270         $graphml = $c->as_graphml;
1271         like( $graphml, qr/digraph/, "Digraph declaration exists in GraphML" );
1272 }
1273
1274 =end testing
1275
1276 =cut
1277
1278 ## TODO MOVE this to Tradition.pm and modularize it better
1279 sub as_graphml {
1280     my( $self, $options ) = @_;
1281         $self->calculate_ranks unless $self->_graphcalc_done;
1282
1283         my $start = $options->{'from'}
1284                 ? $self->reading( $options->{'from'} ) : $self->start;
1285         my $end = $options->{'to'}
1286                 ? $self->reading( $options->{'to'} ) : $self->end;
1287         if( $start->has_rank && $end->has_rank && $end->rank < $start->rank ) {
1288                 throw( 'Start node must be before end node' );
1289         }
1290         # The readings need to be ranked for this to work.
1291         $start = $self->start unless $start->has_rank;
1292         $end = $self->end unless $end->has_rank;
1293         my $rankoffset = 0;
1294         unless( $start eq $self->start ) {
1295                 $rankoffset = $start->rank - 1;
1296         }
1297         my %use_readings;
1298
1299     # Some namespaces
1300     my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns';
1301     my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance';
1302     my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' .
1303         'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
1304
1305     # Create the document and root node
1306     require XML::LibXML;
1307     my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
1308     my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
1309     $graphml->setDocumentElement( $root );
1310     $root->setNamespace( $xsi_ns, 'xsi', 0 );
1311     $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
1312
1313     # List of attribute types to save on our objects and their corresponding
1314     # GraphML types
1315     my %save_types = (
1316         'Str' => 'string',
1317         'Int' => 'int',
1318         'Bool' => 'boolean',
1319         'ReadingID' => 'string',
1320         'RelationshipType' => 'string',
1321         'RelationshipScope' => 'string',
1322     );
1323
1324     # Add the data keys for the graph. Include an extra key 'version' for the
1325     # GraphML output version.
1326     my %graph_data_keys;
1327     my $gdi = 0;
1328     my %graph_attributes = ( 'version' => 'string' );
1329         # Graph attributes include those of Tradition and those of Collation.
1330         my %gattr_from;
1331         # TODO Use meta introspection method from duplicate_reading to do this
1332         # instead of naming custom keys.
1333         my $tmeta = $self->tradition->meta;
1334         my $cmeta = $self->meta;
1335         map { $gattr_from{$_->name} = 'Tradition' } $tmeta->get_all_attributes;
1336         map { $gattr_from{$_->name} = 'Collation' } $cmeta->get_all_attributes;
1337         foreach my $attr ( ( $tmeta->get_all_attributes, $cmeta->get_all_attributes ) ) {
1338                 next if $attr->name =~ /^_/;
1339                 next unless $save_types{$attr->type_constraint->name};
1340                 $graph_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1341         }
1342     # Extra custom keys for complex objects that should be saved in some form.
1343     # The subroutine should return a string, or undef/empty.
1344     if( $tmeta->has_method('stemmata') ) {
1345                 $graph_attributes{'stemmata'} = sub {
1346                         my @stemstrs;
1347                         map { push( @stemstrs, $_->editable( {linesep => ''} ) ) }
1348                                 $self->tradition->stemmata;
1349                         join( "\n", @stemstrs );
1350                 };
1351         }
1352
1353         if( $tmeta->has_method('user') ) {
1354                 $graph_attributes{'user'} = sub {
1355                         $self->tradition->user ? $self->tradition->user->id : undef
1356                 };
1357         }
1358
1359     foreach my $datum ( sort keys %graph_attributes ) {
1360         $graph_data_keys{$datum} = 'dg'.$gdi++;
1361         my $key = $root->addNewChild( $graphml_ns, 'key' );
1362         my $dtype = ref( $graph_attributes{$datum} ) ? 'string'
1363                 : $graph_attributes{$datum};
1364         $key->setAttribute( 'attr.name', $datum );
1365         $key->setAttribute( 'attr.type', $dtype );
1366         $key->setAttribute( 'for', 'graph' );
1367         $key->setAttribute( 'id', $graph_data_keys{$datum} );
1368     }
1369
1370     # Add the data keys for reading nodes
1371     my %reading_attributes;
1372     my $rmeta = Text::Tradition::Collation::Reading->meta;
1373     foreach my $attr( $rmeta->get_all_attributes ) {
1374                 next if $attr->name =~ /^_/;
1375                 next unless $save_types{$attr->type_constraint->name};
1376                 $reading_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1377         }
1378         if( $self->start->does('Text::Tradition::Morphology' ) ) {
1379                 # Extra custom key for the reading morphology
1380                 $reading_attributes{'lexemes'} = 'string';
1381         }
1382
1383     my %node_data_keys;
1384     my $ndi = 0;
1385     foreach my $datum ( sort keys %reading_attributes ) {
1386         $node_data_keys{$datum} = 'dn'.$ndi++;
1387         my $key = $root->addNewChild( $graphml_ns, 'key' );
1388         $key->setAttribute( 'attr.name', $datum );
1389         $key->setAttribute( 'attr.type', $reading_attributes{$datum} );
1390         $key->setAttribute( 'for', 'node' );
1391         $key->setAttribute( 'id', $node_data_keys{$datum} );
1392     }
1393
1394     # Add the data keys for edges, that is, paths and relationships. Path
1395     # data does not come from a Moose class so is here manually.
1396     my $edi = 0;
1397     my %edge_data_keys;
1398     my %edge_attributes = (
1399         witness => 'string',                    # ID/label for a path
1400         extra => 'boolean',                             # Path key
1401         );
1402     my @path_attributes = keys %edge_attributes; # track our manual additions
1403     my $pmeta = Text::Tradition::Collation::Relationship->meta;
1404     foreach my $attr( $pmeta->get_all_attributes ) {
1405                 next if $attr->name =~ /^_/;
1406                 next unless $save_types{$attr->type_constraint->name};
1407                 $edge_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1408         }
1409     foreach my $datum ( sort keys %edge_attributes ) {
1410         $edge_data_keys{$datum} = 'de'.$edi++;
1411         my $key = $root->addNewChild( $graphml_ns, 'key' );
1412         $key->setAttribute( 'attr.name', $datum );
1413         $key->setAttribute( 'attr.type', $edge_attributes{$datum} );
1414         $key->setAttribute( 'for', 'edge' );
1415         $key->setAttribute( 'id', $edge_data_keys{$datum} );
1416     }
1417
1418     # Add the collation graph itself. First, sanitize the name to a valid XML ID.
1419     my $xmlidname = $self->tradition->name;
1420     $xmlidname =~ s/(?!$xml10_namechar_rx)./_/g;
1421     if( $xmlidname !~ /^$xml10_namestartchar_rx/ ) {
1422         $xmlidname = '_'.$xmlidname;
1423     }
1424     my $sgraph = $root->addNewChild( $graphml_ns, 'graph' );
1425     $sgraph->setAttribute( 'edgedefault', 'directed' );
1426     $sgraph->setAttribute( 'id', $xmlidname );
1427     $sgraph->setAttribute( 'parse.edgeids', 'canonical' );
1428     $sgraph->setAttribute( 'parse.edges', 0 ); # fill in later
1429     $sgraph->setAttribute( 'parse.nodeids', 'canonical' );
1430     $sgraph->setAttribute( 'parse.nodes', 0 ); # fill in later
1431     $sgraph->setAttribute( 'parse.order', 'nodesfirst' );
1432
1433     # Tradition/collation attribute data
1434     foreach my $datum ( keys %graph_attributes ) {
1435         my $value;
1436         if( $datum eq 'version' ) {
1437                 $value = '3.2';
1438         } elsif( ref( $graph_attributes{$datum} ) ) {
1439                 my $sub = $graph_attributes{$datum};
1440                 $value = &$sub();
1441         } elsif( $gattr_from{$datum} eq 'Tradition' ) {
1442                 $value = $self->tradition->$datum;
1443         } else {
1444                 $value = $self->$datum;
1445         }
1446                 _add_graphml_data( $sgraph, $graph_data_keys{$datum}, $value );
1447         }
1448
1449     my $node_ctr = 0;
1450     my %node_hash;
1451     # Add our readings to the graph
1452     foreach my $n ( sort { $a->id cmp $b->id } $self->readings ) {
1453         next if $n->has_rank && $n ne $self->start && $n ne $self->end &&
1454                 ( $n->rank < $start->rank || $n->rank > $end->rank );
1455         $use_readings{$n->id} = 1;
1456         # Add to the main graph
1457         my $node_el = $sgraph->addNewChild( $graphml_ns, 'node' );
1458         my $node_xmlid = 'n' . $node_ctr++;
1459         $node_hash{ $n->id } = $node_xmlid;
1460         $node_el->setAttribute( 'id', $node_xmlid );
1461         foreach my $d ( keys %reading_attributes ) {
1462                 my $nval = $n->$d;
1463                 # Custom serialization
1464                 if( $d eq 'lexemes' ) {
1465                                 # If nval is a true value, we have lexemes so we need to
1466                                 # serialize them. Otherwise set nval to undef so that the
1467                                 # key is excluded from this reading.
1468                         $nval = $nval ? $n->_serialize_lexemes : undef;
1469                 } elsif( $d eq 'normal_form' && $n->normal_form eq $n->text ) {
1470                         $nval = undef;
1471                 }
1472                 if( $rankoffset && $d eq 'rank' && $n ne $self->start ) {
1473                         # Adjust the ranks within the subgraph.
1474                         $nval = $n eq $self->end ? $end->rank - $rankoffset + 1
1475                                 : $nval - $rankoffset;
1476                 }
1477                 _add_graphml_data( $node_el, $node_data_keys{$d}, $nval )
1478                         if defined $nval;
1479         }
1480     }
1481
1482     # Add the path edges to the sequence graph
1483     my $edge_ctr = 0;
1484     foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->sequence->edges() ) {
1485         # We add an edge in the graphml for every witness in $e.
1486         next unless( $use_readings{$e->[0]} || $use_readings{$e->[1]} );
1487         my @edge_wits = sort $self->path_witnesses( $e );
1488         $e->[0] = $self->start->id unless $use_readings{$e->[0]};
1489         $e->[1] = $self->end->id unless $use_readings{$e->[1]};
1490         # Skip any path from start to end; that witness is not in the subgraph.
1491         next if ( $e->[0] eq $self->start->id && $e->[1] eq $self->end->id );
1492         foreach my $wit ( @edge_wits ) {
1493                         my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
1494                                                                                 $node_hash{ $e->[0] },
1495                                                                                 $node_hash{ $e->[1] } );
1496                         my $edge_el = $sgraph->addNewChild( $graphml_ns, 'edge' );
1497                         $edge_el->setAttribute( 'source', $from );
1498                         $edge_el->setAttribute( 'target', $to );
1499                         $edge_el->setAttribute( 'id', $id );
1500
1501                         # It's a witness path, so add the witness
1502                         my $base = $wit;
1503                         my $key = $edge_data_keys{'witness'};
1504                         # Is this an ante-corr witness?
1505                         my $aclabel = $self->ac_label;
1506                         if( $wit =~ /^(.*)\Q$aclabel\E$/ ) {
1507                                 # Keep the base witness
1508                                 $base = $1;
1509                                 # ...and record that this is an 'extra' reading path
1510                                 _add_graphml_data( $edge_el, $edge_data_keys{'extra'}, $aclabel );
1511                         }
1512                         _add_graphml_data( $edge_el, $edge_data_keys{'witness'}, $base );
1513                 }
1514         }
1515
1516         # Report the actual number of nodes and edges that went in
1517         $sgraph->setAttribute( 'parse.edges', $edge_ctr );
1518         $sgraph->setAttribute( 'parse.nodes', $node_ctr );
1519
1520         # Add the relationship graph to the XML
1521         map { delete $edge_data_keys{$_} } @path_attributes;
1522         $self->relations->_as_graphml( $graphml_ns, $root, \%node_hash,
1523                 $node_data_keys{'id'}, \%edge_data_keys );
1524
1525     # Save and return the thing
1526     my $result = decode_utf8( $graphml->toString(1) );
1527     return $result;
1528 }
1529
# Append a <data> child element holding $value to the element $el and tag it
# with the GraphML key ID $key.  The child is created in the namespace of
# $el.  When $value is undefined nothing is added, so that optional
# attributes are simply left out of the output.
sub _add_graphml_data {
    my $el    = shift;
    my $key   = shift;
    my $value = shift;
    defined $value or return;
    my $holder = $el->addNewChild( $el->namespaceURI, 'data' );
    $holder->setAttribute( 'key', $key );
    return $holder->appendText( $value );
}
1537
1538 =head2 as_csv
1539
Returns a CSV alignment table representation of the collation graph, one
column per witness (or per uncorrected witness layer), headed by its sigil.
1542
1543 =head2 as_tsv
1544
Returns a tab-separated alignment table representation of the collation graph,
one column per witness (or per uncorrected witness layer), headed by its sigil.
1547
1548 =begin testing
1549
1550 use Text::Tradition;
1551 use Text::CSV;
1552
1553 my $READINGS = 311;
1554 my $PATHS = 361;
1555 my $WITS = 13;
1556 my $WITAC = 4;
1557
1558 my $datafile = 't/data/florilegium_tei_ps.xml';
1559 my $tradition = Text::Tradition->new( 'input' => 'TEI',
1560                                       'name' => 'test0',
1561                                       'file' => $datafile,
1562                                       'linear' => 1 );
1563
1564 my $c = $tradition->collation;
1565 # Export the thing to CSV
1566 my $csvstr = $c->as_csv();
1567 # Count the columns
1568 my $csv = Text::CSV->new({ sep_char => ',', binary => 1 });
1569 my @lines = split(/\n/, $csvstr );
1570 ok( $csv->parse( $lines[0] ), "Successfully parsed first line of CSV" );
1571 is( scalar( $csv->fields ), $WITS + $WITAC, "CSV has correct number of witness columns" );
1572 my @q_ac = grep { $_ eq 'Q'.$c->ac_label } $csv->fields;
1573 ok( @q_ac, "Found a layered witness" );
1574
1575 my $t2 = Text::Tradition->new( input => 'Tabular',
1576                                                            name => 'test2',
1577                                                            string => $csvstr,
1578                                                            sep_char => ',' );
1579 is( scalar $t2->collation->readings, $READINGS, "Reparsed CSV collation has all readings" );
1580 is( scalar $t2->collation->paths, $PATHS, "Reparsed CSV collation has all paths" );
1581
1582 # Now do it with TSV
1583 my $tsvstr = $c->as_tsv();
1584 my $t3 = Text::Tradition->new( input => 'Tabular',
1585                                                            name => 'test3',
1586                                                            string => $tsvstr,
1587                                                            sep_char => "\t" );
1588 is( scalar $t3->collation->readings, $READINGS, "Reparsed TSV collation has all readings" );
1589 is( scalar $t3->collation->paths, $PATHS, "Reparsed TSV collation has all paths" );
1590
1591 my $table = $c->alignment_table;
1592 my $noaccsv = $c->as_csv({ noac => 1 });
1593 my @noaclines = split(/\n/, $noaccsv );
1594 ok( $csv->parse( $noaclines[0] ), "Successfully parsed first line of no-ac CSV" );
1595 is( scalar( $csv->fields ), $WITS, "CSV has correct number of witness columns" );
1596 is( $c->alignment_table, $table, "Request for CSV did not alter the alignment table" );
1597
1598 my $safecsv = $c->as_csv({ safe_ac => 1});
1599 my @safelines = split(/\n/, $safecsv );
1600 ok( $csv->parse( $safelines[0] ), "Successfully parsed first line of safe CSV" );
1601 is( scalar( $csv->fields ), $WITS + $WITAC, "CSV has correct number of witness columns" );
1602 @q_ac = grep { $_ eq 'Q__L' } $csv->fields;
1603 ok( @q_ac, "Found a sanitized layered witness" );
1604 is( $c->alignment_table, $table, "Request for CSV did not alter the alignment table" );
1605
1606 =end testing
1607
1608 =cut
1609
# Serialize the alignment table as delimited text.  $opts is passed through
# to alignment_table and must also carry 'fieldsep', the field separator.
# The first line holds the witness labels; every following line holds the
# reading texts found at one position of the table, with an empty field
# wherever a witness has no token.  Lines are joined with "\n".
sub _tabular {
    my $self = shift;
    my $opts = shift;
    my $table = $self->alignment_table( $opts );

    my $separator = $opts->{fieldsep};
    my %csv_args = ( binary => 1, quote_null => 0, sep_char => $separator );
    if( $separator eq "\t" ) {
        # If it is really tab separated, nothing is an escape char.
        @csv_args{ 'quote_char', 'escape_char' } = ( undef, '' );
    }
    my $writer = Text::CSV->new( \%csv_args );

    # Header line: one label per table column
    my @columns = @{ $table->{'alignment'} };
    $writer->combine( map { $_->{'witness'} } @columns );
    my @lines = ( $writer->string );

    # One line per table position
    for my $pos ( 0 .. $table->{'length'} - 1 ) {
        my @cells;
        for my $col ( @columns ) {
            my $token = $col->{'tokens'}->[$pos];
            push( @cells, $token ? $token->{'t'}->text : $token );
        }
        $writer->combine( @cells );
        push( @lines, $writer->string );
    }
    return join( "\n", @lines );
}
1634
# Return the alignment table as comma-separated text.  The optional hashref
# $opts is forwarded (via _tabular) to alignment_table; the test suite
# exercises 'noac' and 'safe_ac'.
sub as_csv {
    my( $self, $opts ) = @_;
    # Work on a shallow copy, so that the caller's option hash does not
    # silently acquire a 'fieldsep' key.
    my %tab_opts = ( %{ $opts || {} }, fieldsep => ',' );
    return $self->_tabular( \%tab_opts );
}
1641
# Return the alignment table as tab-separated text.  The optional hashref
# $opts is forwarded (via _tabular) to alignment_table.
sub as_tsv {
    my( $self, $opts ) = @_;
    # Work on a shallow copy, so that the caller's option hash does not
    # silently acquire a 'fieldsep' key.
    my %tab_opts = ( %{ $opts || {} }, fieldsep => "\t" );
    return $self->_tabular( \%tab_opts );
}
1648
1649 =head2 alignment_table
1650
1651 Return a reference to an alignment table, in a slightly enhanced CollateX
1652 format which looks like this:
1653
1654  $table = { alignment => [ { witness => "SIGIL",
1655                              tokens => [ { t => "TEXT" }, ... ] },
1656                            { witness => "SIG2",
1657                              tokens => [ { t => "TEXT" }, ... ] },
1658                            ... ],
1659             length => TEXTLEN };
1660
1661 =cut
1662
# Build the alignment table for the collation, or return the cached one.
# Recognized keys of the optional hashref $opts:
#   noac    - leave out the columns for uncorrected (a.c.) witness layers
#   safe_ac - label layer columns SIGIL__L instead of SIGIL . ac_label
# A table built with either option is neither taken from nor stored in the
# cache.  Throws if the collation is not linear.
sub alignment_table {
    my $self = shift;
    my $opts = shift;

    if( $self->has_cached_table && !( $opts->{noac} || $opts->{safe_ac} ) ) {
        return $self->cached_table;
    }

    # Make sure we can do this
    $self->linear
        or throw( "Need a linear graph in order to make an alignment table" );
    unless( $self->_graphcalc_done && $self->end->has_rank ) {
        $self->calculate_ranks();
    }

    # Every rank strictly between start and end is one table position.
    my $length = $self->end->rank - 1;
    my @positions = ( 1 .. $length );
    my @columns;
    my @witnesses = sort { $a->sigil cmp $b->sigil } $self->tradition->witnesses;
    foreach my $witness ( @witnesses ) {
        my $sigil = $witness->sigil;
        my @path = $self->reading_sequence( $self->start, $self->end, $sigil );
        my %column = (
            'witness' => $sigil,
            'tokens'  => [ _make_witness_row( \@path, \@positions ) ],
        );
        $column{'identifier'} = $witness->identifier if $witness->identifier;
        push( @columns, \%column );

        # A layered witness gets a second column for its uncorrected text.
        next unless $witness->is_layered && !$opts->{noac};
        my $ac_sigil = $sigil . $self->ac_label;
        my @ac_path = $self->reading_sequence( $self->start, $self->end, $ac_sigil );
        my %ac_column = (
            'witness' => ( $opts->{safe_ac} ? $sigil . '__L' : $ac_sigil ),
            'tokens'  => [ _make_witness_row( \@ac_path, \@positions ) ],
        );
        $ac_column{'identifier'} = $witness->identifier if $witness->identifier;
        push( @columns, \%ac_column );
    }

    my $table = { 'alignment' => \@columns, 'length' => $length };
    $self->cached_table( $table )
        unless $opts->{noac} || $opts->{safe_ac};
    return $table;
}
1702
# Turn one witness path into a table row.
#   $path      - arrayref of reading objects (each must answer ->rank and
#                ->is_lacuna)
#   $positions - arrayref of the ranks that make up the table columns
# Returns a list with exactly one entry per position: { t => $reading } where
# the witness has a reading at that rank, otherwise undef.  An undef slot
# that follows a lacuna reading is filled with that lacuna's token, so that
# the lacuna appears at every position it spans.  Readings whose rank is not
# among $positions are ignored.  An empty position list gives an empty row.
sub _make_witness_row {
    my( $path, $positions ) = @_;

    # Index the witness's tokens by rank
    my %token_at;
    foreach my $rdg ( @$path ) {
        $token_at{$rdg->rank} = { 't' => $rdg };
    }

    my @filled_row;
    my $last_el;
    foreach my $pos ( @$positions ) {
        my $el = $token_at{$pos};
        # Carry a preceding lacuna forward over the empty spots
        if( !defined $el && $last_el && $last_el->{'t'}->is_lacuna ) {
            $el = $last_el;
        }
        push( @filled_row, $el );
        $last_el = $el;
    }
    return @filled_row;
}
1728
1729
1730 =head1 NAVIGATION METHODS
1731
=head2 reading_sequence( $first, $last, $sigil )

Returns the ordered list of readings, starting with $first and ending
with $last, for the witness given in $sigil. If $sigil names a witness
layer (that is, it ends in the collation's ac_label), the path of the
underlying witness is used wherever no path exists for the layer itself.
If there is a base text reading, that will be used wherever no path exists
for either of them.  If no $sigil is given, the base text label is used.
1739
1740 =cut
1741
1742 # TODO Think about returning some lazy-eval iterator.
1743 # TODO Get rid of backup; we should know from what witness is whether we need it.
1744
# Walk the witness path from $start to $end, following next_reading, and
# return every reading met on the way (both endpoints included).  $witness
# defaults to the base text label.  Throws if the path revisits a reading,
# if it stops before $end is reached, or if the walk ends anywhere else
# than at $end.
sub reading_sequence {
    my $self = shift;
    my( $start, $end, $witness ) = @_;
    $witness ||= $self->baselabel;

    my @sequence = ( $start );
    my %visited;
    my $current = $start;
    while( $current && $current->id ne $end->id ) {
        throw( "Detected loop for $witness at " . $current->id )
            if exists $visited{$current->id};
        $visited{$current->id} = 1;

        my $following = $self->next_reading( $current, $witness );
        throw( "Did not find any path for $witness from reading " . $current->id )
            unless $following;
        push( @sequence, $following );
        $current = $following;
    }

    # Check that the last reading is our end reading.
    my $final = $sequence[-1];
    $final->id eq $end->id
        or throw( "Last reading found from " . $start->text .
            " for witness $witness is not the end!" );

    return @sequence;
}
1773
1774 =head2 next_reading( $reading, $sigil );
1775
1776 Returns the reading that follows the given reading along the given witness
1777 path.
1778
1779 =cut
1780
# Look up the successor of a reading along a witness path.  The arguments
# ( $reading, $sigil ) are handed to _find_linked_reading unchanged; the ID
# it reports is turned into a reading via ->reading.  Returns undef when no
# successor is found.
sub next_reading {
    my( $self, @where ) = @_;
    my $next_id = $self->_find_linked_reading( 'next', @where );
    return $next_id ? $self->reading( $next_id ) : undef;
}
1788
1789 =head2 prior_reading( $reading, $sigil )
1790
1791 Returns the reading that precedes the given reading along the given witness
1792 path.
1793
1794 =cut
1795
# Look up the predecessor of a reading along a witness path.  The arguments
# ( $reading, $sigil ) are handed to _find_linked_reading unchanged; the ID
# it reports is turned into a reading via ->reading.  Returns undef when no
# predecessor is found.
sub prior_reading {
    my $self = shift;
    my $answer = $self->_find_linked_reading( 'prior', @_ );
    # Mirror next_reading: without a linked reading there is nothing to
    # look up, so don't call ->reading with an undefined ID.
    return undef unless $answer;
    return $self->reading( $answer );
}
1802
# Find the reading linked to $node along the witness path $path.
#   $direction - 'next' follows outgoing sequence edges; anything else
#                follows incoming ones
#   $node      - the reading to start from
#   $path      - witness label, or several labels joined with the witness
#                list separator
# Returns whatever the sequence graph stores at the far end of the chosen
# edge, or undef when $node has no edge in that direction or no edge fits.
# Edges are preferred in this order: one carrying every witness in $path;
# else (for a label ending in ac_label) the last one carrying the label
# without that suffix; else the last one flagged with the base label.
sub _find_linked_reading {
    my( $self, $direction, $node, $path ) = @_;
    my $forward = $direction eq 'next';
    # Position, within an edge pair, of the reading we are looking for
    my $far_end = $forward ? 1 : 0;

    # Get a backup if we are dealing with a layered witness
    my $alt_path;
    my $aclabel = $self->ac_label;
    if( $path && $path =~ /^(.*)\Q$aclabel\E$/ ) {
        $alt_path = $1;
    }

    my @candidates = $forward
        ? $self->sequence->edges_from( $node )
        : $self->sequence->edges_to( $node );
    return undef unless @candidates;

    # The edge we want is the one that carries all the requested witnesses.
    my( @wanted, @wanted_alt );
    @wanted = sort( $self->_witnesses_of_label( $path ) ) if $path;
    @wanted_alt = sort( $self->_witnesses_of_label( $alt_path ) ) if $alt_path;

    my( $base_edge, $alt_edge );
    foreach my $edge ( @candidates ) {
        $base_edge = $edge
            if $self->sequence->has_edge_attribute( @$edge, $self->baselabel );
        my @edge_wits = sort $self->path_witnesses( $edge );
        # An exact match wins immediately...
        return $edge->[$far_end] if _is_within( \@wanted, \@edge_wits );
        # ...otherwise remember an edge that fits the backup label.
        $alt_edge = $edge if _is_within( \@wanted_alt, \@edge_wits );
    }

    # No exact match: fall back to the backup path, then to the base path.
    foreach my $fallback ( $alt_edge, $base_edge ) {
        return $fallback->[$far_end] if $fallback;
    }

    # Got this far? We have no appropriate path.
    warn "Could not find $direction node from " . $node->id
        . " along path $path";
    return undef;
}
1850
# Some set logic.  Returns the number of elements in @$set1 if every one of
# them also occurs (as an exactly equal string) in @$set2, and 0 otherwise.
# An empty @$set1 therefore yields 0, i.e. false.
sub _is_within {
    my( $set1, $set2 ) = @_;
    # Exact-match lookup table; unlike an anchored regex this does not
    # treat "A" and "A\n" as the same element.
    my %in_set2 = map { $_ => 1 } @$set2;
    foreach my $el ( @$set1 ) {
        return 0 unless $in_set2{$el};
    }
    return scalar @$set1;
}
1860
# Split a path label into the individual witness labels it is made of.  The
# label is cut at every literal occurrence of the collation's witness list
# separator; the resulting pieces are returned as a list.
sub _witnesses_of_label {
    my $self  = shift;
    my $label = shift;
    my $separator = quotemeta( $self->wit_list_separator );
    my @witnesses = split( /$separator/, $label );
    return @witnesses;
}
1869
1870 =head2 common_readings
1871
1872 Returns the list of common readings in the graph (i.e. those readings that are
1873 shared by all non-lacunose witnesses.)
1874
1875 =cut
1876
# Return, in the order given by ->readings, every reading whose is_common
# flag is currently set.
sub common_readings {
    my( $self ) = @_;
    my @shared;
    foreach my $rdg ( $self->readings ) {
        push( @shared, $rdg ) if $rdg->is_common;
    }
    return @shared;
}
1882
=head2 path_text( $sigil [, $start, $end ] )
1884
1885 Returns the text of a witness (plus its backup, if we are using a layer)
1886 as stored in the collation.  The text is returned as a string, where the
1887 individual readings are joined with spaces and the meta-readings (e.g.
1888 lacunae) are omitted.  Optional specification of $start and $end allows
1889 the generation of a subset of the witness text.
1890
1891 =cut
1892
# Assemble the text of witness $wit between $start and $end (by default the
# collation's start and end readings).  Meta readings are skipped.  A single
# space goes between two consecutive readings unless the later one has
# join_prior set or the earlier one has join_next set.
sub path_text {
    my $self = shift;
    my( $wit, $start, $end ) = @_;
    $start ||= $self->start;
    $end   ||= $self->end;

    my $text = '';
    my $previous;
    foreach my $rdg ( $self->reading_sequence( $start, $end, $wit ) ) {
        next if $rdg->is_meta;
        if( !$rdg->join_prior && $previous && !$previous->join_next ) {
            $text .= ' ';
        }
        $text .= $rdg->text;
        $previous = $rdg;
    }
    return $text;
}
1909
1910 =head1 INITIALIZATION METHODS
1911
1912 These are mostly for use by parsers.
1913
1914 =head2 make_witness_path( $witness )
1915
1916 Link the array of readings contained in $witness->path (and in
1917 $witness->uncorrected_path if it exists) into collation paths.
1918 Clear out the arrays when finished.
1919
1920 =head2 make_witness_paths
1921
1922 Call make_witness_path for all witnesses in the tradition.
1923
1924 =cut
1925
1926 # For use when a collation is constructed from a base text and an apparatus.
1927 # We have the sequences of readings and just need to add path edges.
1928 # When we are done, clear out the witness path attributes, as they are no
1929 # longer needed.
1930 # TODO Find a way to replace the witness path attributes with encapsulated functions?
1931
# Call make_witness_path once for each witness of the tradition.
sub make_witness_paths {
    my $self = shift;
    my @witnesses = $self->tradition->witnesses;
    foreach my $witness ( @witnesses ) {
        $self->make_witness_path( $witness );
    }
}
1939
# Turn the reading chain(s) stored on witness $wit into collation paths.
# The main chain ($wit->path) is linked reading by reading under the
# witness sigil.  For a layered witness the uncorrected chain is linked
# under sigil . ac_label, but only over those steps that the main sigil
# does not already cover.  Both chains are extended to begin at the
# collation start and finish at the collation end if they do not already.
# The witness's stored chains are cleared afterwards.
sub make_witness_path {
    my $self = shift;
    my $wit  = shift;

    # Make a chain run from the start reading to the end reading
    my $anchored = sub {
        my @seq = @_;
        unshift( @seq, $self->start ) unless $seq[0] eq $self->start;
        push( @seq, $self->end ) unless $seq[-1] eq $self->end;
        return @seq;
    };

    my @main = $anchored->( @{$wit->path} );
    my $sigil = $wit->sigil;
    foreach my $i ( 1 .. $#main ) {
        $self->add_path( $main[$i-1], $main[$i], $sigil );
    }

    if( $wit->is_layered ) {
        my @layer = $anchored->( @{$wit->uncorrected_path} );
        foreach my $i ( 1 .. $#layer ) {
            my( $from, $to ) = @layer[ $i-1, $i ];
            next if $self->has_path( $from, $to, $sigil );
            $self->add_path( $from, $to, $sigil.$self->ac_label );
        }
    }

    $wit->clear_path;
    $wit->clear_uncorrected_path;
}
1964
1965 =head2 calculate_ranks
1966
1967 Calculate the reading ranks (that is, their aligned positions relative
1968 to each other) for the graph.  This can only be called on linear collations.
1969
1970 =begin testing
1971
1972 use Text::Tradition;
1973
1974 my $cxfile = 't/data/Collatex-16.xml';
1975 my $t = Text::Tradition->new(
1976     'name'  => 'inline',
1977     'input' => 'CollateX',
1978     'file'  => $cxfile,
1979     );
1980 my $c = $t->collation;
1981
1982 # Make an svg
1983 my $table = $c->alignment_table;
1984 ok( $c->has_cached_table, "Alignment table was cached" );
1985 is( $c->alignment_table, $table, "Cached table returned upon second call" );
1986 $c->calculate_ranks;
1987 is( $c->alignment_table, $table, "Cached table retained with no rank change" );
1988 $c->add_relationship( 'n13', 'n23', { type => 'repetition' } );
1989 is( $c->alignment_table, $table, "Alignment table unchanged after non-colo relationship add" );
1990 $c->add_relationship( 'n24', 'n23', { type => 'spelling' } );
1991 isnt( $c->alignment_table, $table, "Alignment table changed after colo relationship add" );
1992
1993 =end testing
1994
1995 =cut
1996
# (Re)assign a rank to every reading.  The ranks come from the relationship
# store's equivalence_ranks, looked up through each reading's equivalence
# node.  Throws if the collation is not linear, or if some reading did not
# get a rank.  If an alignment table is cached and the rank of any reading
# differs from what it was on entry, the cache is cleared and the common
# readings are recalculated.  Finally the graph calculation flag is set.
sub calculate_ranks {
    my( $self ) = @_;
    $self->linear
        or throw( "Cannot calculate ranks on a non-linear graph" );

    # Remember the ranks as they are now, so that a change can be detected.
    my %rank_before;
    foreach my $rdg ( $self->readings ) {
        $rank_before{$rdg} = $rdg->rank;
    }

    # Do the rankings based on the relationship equivalence graph.
    my( $node_ranks ) = $self->relations->equivalence_ranks();

    # Transfer our rankings from the topological graph to the real one.
    foreach my $rdg ( $self->readings ) {
        my $new_rank = $node_ranks->{$self->equivalence( $rdg->id )};
        if( defined $new_rank ) {
            $rdg->rank( $new_rank );
            next;
        }
        # Die. Find the last rank we calculated.
        my $rank_of = sub {
            return $node_ranks->{$self->equivalence( $_[0]->id )} || -1;
        };
        my @by_rank = sort { $rank_of->( $a ) <=> $rank_of->( $b ) }
            $self->readings;
        my $last = pop @by_rank;
        throw( "Ranks not calculated after $last - do you have a cycle in the graph?" );
    }

    # Do we need to invalidate the cached data?
    if( $self->has_cached_table ) {
        my $rank_changed;
        foreach my $rdg ( $self->readings ) {
            my $old = $rank_before{$rdg};
            unless( defined( $old ) && $old == $rdg->rank ) {
                $rank_changed = 1;
                last;
            }
        }
        if( $rank_changed ) {
            # Something has changed, so clear the cache
            $self->_clear_cache;
            # ...and recalculate the common readings.
            $self->calculate_common_readings();
        }
    }
    # The graph calculation information is now up to date.
    $self->_graphcalc_done(1);
}
2037
# Discard the cached alignment table, if there is one.
sub _clear_cache {
    my( $self ) = @_;
    $self->has_cached_table and $self->wipe_table;
}
2042
2043
2044 =head2 flatten_ranks
2045
2046 A convenience method for parsing collation data.  Searches the graph for readings
2047 with the same text at the same rank, and merges any that are found.
2048
2049 =cut
2050
# Merge every pair of readings that identical_readings reports.  Any named
# arguments (start / end / startrank / endrank) are handed on to
# identical_readings, so the merging can be confined to part of the graph.
# The cached alignment table is wiped if anything was merged.
sub flatten_ranks {
    # Take the whole argument list: a bare 'shift' here would yield only
    # $self and leave %args permanently empty.
    my( $self, %args ) = @_;
    my $changed;
    foreach my $p ( $self->identical_readings( %args ) ) {
        # say STDERR "Combining readings at same rank: @$p";
        $changed = 1;
        $self->merge_readings( @$p );
        # TODO see if this now makes a common point.
    }
    # If we merged readings, the ranks are still fine but the alignment
    # table is wrong. Wipe it.
    $self->wipe_table() if $changed;
}
2065
=head2 identical_readings

=head2 identical_readings( start => $startnode, end => $endnode )

=head2 identical_readings( startrank => $startrank, endrank => $endrank )
2069
2070 Goes through the graph identifying all pairs of readings that appear to be
2071 identical, and therefore able to be merged into a single reading. Returns the
2072 relevant identical pairs. Can be restricted to run over only a part of the
2073 graph, specified either by node or by rank.
2074
2075 =cut
2076
# Find pairs of readings that sit at the same rank, have the same text and
# pass is_identical.  Named arguments:
#   start / end         - readings (as accepted by ->reading) bounding the
#                         rank range to search
#   startrank / endrank - the same bounds given directly as ranks
# Without them the range runs from rank 0 to the rank of the end reading.
# Returns a list of [ $first_seen, $duplicate ] array refs; each duplicate
# is paired with the first reading seen for its rank and text.  Throws if a
# given start or end reading is unknown or unranked.
sub identical_readings {
    my( $self, %args ) = @_;

    # Make sure the ranks are correct.  This has to happen before any rank
    # is read: otherwise the default end rank and the start/end checks
    # below would look at ranks that are missing or out of date.
    unless( $self->_graphcalc_done ) {
        $self->calculate_ranks;
    }

    # Find where we should start and end.
    my $startrank = $args{startrank} || 0;
    if( $args{start} ) {
        my $first = $self->reading( $args{start} );
        throw( "Starting reading has no rank" )
            unless $first && $first->has_rank;
        $startrank = $first->rank;
    }
    my $endrank = $args{endrank} || $self->end->rank;
    if( $args{end} ) {
        my $final = $self->reading( $args{end} );
        throw( "Ending reading has no rank" )
            unless $final && $final->has_rank;
        $endrank = $final->rank;
    }

    # Go through the readings looking for duplicates.
    my %unique_rank_rdg;
    my @pairs;
    foreach my $rdg ( $self->readings ) {
        next unless $rdg->has_rank;
        my $rk = $rdg->rank;
        next if $rk > $endrank || $rk < $startrank;
        my $key = $rk . "||" . $rdg->text;
        if( exists $unique_rank_rdg{$key} ) {
            # Make sure they don't have different grammatical forms
            my $ur = $unique_rank_rdg{$key};
            push( @pairs, [ $ur, $rdg ] ) if $rdg->is_identical( $ur );
        } else {
            $unique_rank_rdg{$key} = $rdg;
        }
    }

    return @pairs;
}
2118
2119
2120 =head2 calculate_common_readings
2121
2122 Goes through the graph identifying the readings that appear in every witness
2123 (apart from those with lacunae at that spot.) Marks them as common and returns
2124 the list.
2125
2126 =begin testing
2127
2128 use Text::Tradition;
2129
2130 my $cxfile = 't/data/Collatex-16.xml';
2131 my $t = Text::Tradition->new(
2132     'name'  => 'inline',
2133     'input' => 'CollateX',
2134     'file'  => $cxfile,
2135     );
2136 my $c = $t->collation;
2137
2138 my @common = $c->calculate_common_readings();
2139 is( scalar @common, 8, "Found correct number of common readings" );
2140 my @marked = sort $c->common_readings();
2141 is( scalar @common, 8, "All common readings got marked as such" );
2142 my @expected = qw/ n1 n11 n16 n19 n20 n5 n6 n7 /;
2143 is_deeply( \@marked, \@expected, "Found correct list of common readings" );
2144
2145 =end testing
2146
2147 =cut
2148
# Recompute the is_common flag of every reading and return the common ones.
# All flags are cleared first.  Then, for each position of the alignment
# table, the readings of all witnesses are collected, leaving meta readings
# out; if exactly one distinct reading remains and no witness has an empty
# slot at that position, the reading is flagged common and added to the
# result.
sub calculate_common_readings {
    my( $self ) = @_;
    foreach my $rdg ( $self->readings ) {
        $rdg->is_common( 0 );
    }

    # Implicitly calls calculate_ranks
    my $table = $self->alignment_table;
    my @columns = @{ $table->{'alignment'} };
    my @common;
    foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
        # Distinct readings at this position, keyed by ID; the key 'UNDEF'
        # records that some witness has nothing here.
        my %found;
        foreach my $col ( @columns ) {
            my $token = $col->{'tokens'}->[$idx];
            my $rdg = $token ? $token->{'t'} : '';
            if( !$rdg ) {
                $found{'UNDEF'} = $rdg;
            } elsif( !$rdg->is_meta ) {
                $found{$rdg->id} = $rdg;
            }
        }
        next unless keys %found == 1;
        next if exists $found{'UNDEF'};
        my( $only ) = values %found;
        $only->is_common( 1 );
        push( @common, $only );
    }
    return @common;
}
2175
2176 =head2 text_from_paths
2177
2178 Calculate the text array for all witnesses from the path, for later consistency
2179 checking.  Only to be used if there is no non-graph-based way to know the
2180 original texts.
2181
2182 =cut
2183
# For every witness, store the texts of the non-meta readings on its path
# (from the collation start to the collation end) as an array ref via
# $wit->text.  For a layered witness the same is done for the path labelled
# sigil . ac_label, stored via $wit->layertext.
sub text_from_paths {
    my( $self ) = @_;

    # Texts of the non-meta readings along the path with the given label
    my $words_on = sub {
        my $label = shift;
        my @words = map { $_->text } grep { !$_->is_meta }
            $self->reading_sequence( $self->start, $self->end, $label );
        return \@words;
    };

    foreach my $wit ( $self->tradition->witnesses ) {
        $wit->text( $words_on->( $wit->sigil ) );
        next unless $wit->is_layered;
        $wit->layertext( $words_on->( $wit->sigil.$self->ac_label ) );
    }
}
2206
2207 =head1 UTILITY FUNCTIONS
2208
2209 =head2 common_predecessor( $reading_a, $reading_b )
2210
2211 Find the last reading that occurs in sequence before both the given readings.
2212 At the very least this should be $self->start.
2213
2214 =head2 common_successor( $reading_a, $reading_b )
2215
2216 Find the first reading that occurs in sequence after both the given readings.
2217 At the very least this should be $self->end.
2218
2219 =begin testing
2220
2221 use Text::Tradition;
2222
2223 my $cxfile = 't/data/Collatex-16.xml';
2224 my $t = Text::Tradition->new(
2225     'name'  => 'inline',
2226     'input' => 'CollateX',
2227     'file'  => $cxfile,
2228     );
2229 my $c = $t->collation;
2230
2231 is( $c->common_predecessor( 'n24', 'n23' )->id,
2232     'n20', "Found correct common predecessor" );
2233 is( $c->common_successor( 'n24', 'n23' )->id,
2234     '__END__', "Found correct common successor" );
2235
2236 is( $c->common_predecessor( 'n19', 'n17' )->id,
2237     'n16', "Found correct common predecessor for readings on same path" );
2238 is( $c->common_successor( 'n21', 'n10' )->id,
2239     '__END__', "Found correct common successor for readings on same path" );
2240
2241 =end testing
2242
2243 =cut
2244
## Return the closest reading that precedes both of the given readings.
## The two arguments are first passed through _objectify_args.
sub common_predecessor {
    my( $self, @readings ) = @_;
    my( $first, $second ) = $self->_objectify_args( @readings );
    return $self->_common_in_path( $first, $second, 'predecessors' );
}
2251
## Return the closest reading that follows both of the given readings.
## The two arguments are first passed through _objectify_args.
sub common_successor {
    my( $self, @readings ) = @_;
    my( $first, $second ) = $self->_objectify_args( @readings );
    return $self->_common_in_path( $first, $second, 'successors' );
}
2257
2258
# TODO think about how to do this without ranks...
# Search outward from $r1 and $r2 at the same time, one step per round, in
# the direction named by $dir ('predecessors' or 'successors', called as a
# method on each reading).  Every reading reached is marked with the side
# that reached it first; a reading reached from the other side as well is a
# candidate.  The search stops after the first round that produces
# candidates, or after as many rounds as the rank of the end reading.  Of
# the candidates, the one with the highest rank is returned for
# predecessors and the one with the lowest rank for successors (undef if
# there are none).
sub _common_in_path {
    my( $self, $r1, $r2, $dir ) = @_;
    my $rounds_left = $self->end->rank;
    my %frontier = ( 'r1' => [ $r1 ], 'r2' => [ $r2 ] );
    my %reached_by;
    my @candidates;
    while( !@candidates ) {
        last unless $rounds_left--;  # Avoid looping infinitely
        # Advance one step from r1's frontier, then one step from r2's
        foreach my $side ( 'r1', 'r2' ) {
            my @advance;
            foreach my $from ( @{ $frontier{$side} } ) {
                foreach my $nbr ( $from->$dir ) {
                    my $owner = $reached_by{$nbr->id};
                    if( !$owner ) {
                        $reached_by{$nbr->id} = $side;
                        push( @advance, $nbr );
                    } elsif( $owner ne $side ) {
                        push( @candidates, $nbr );
                    }
                }
            }
            $frontier{$side} = \@advance;
        }
    }
    my @ordered = sort { $a->rank <=> $b->rank } @candidates;
    return $dir eq 'predecessors' ? pop( @ordered ) : shift( @ordered );
}
2301
# Raise a Text::Tradition::Error whose ident is 'Collation error' and whose
# message is the single argument given.
sub throw {
    my( $message ) = @_;
    Text::Tradition::Error->throw(
        'ident'   => 'Collation error',
        'message' => $message,
    );
}
2308
# The class is fully declared: unimport the keywords Moose exported into
# this package (such as 'has').
no Moose;
# Make the metaclass immutable, as the Moose manual recommends for finished
# classes.
__PACKAGE__->meta->make_immutable;
2311
2312 =head1 BUGS/TODO
2313
2314 =over
2315
2316 =item * Rework XML serialization in a more modular way
2317
2318 =back
2319
2320 =head1 LICENSE
2321
2322 This package is free software and is provided "as is" without express
2323 or implied warranty.  You can redistribute it and/or modify it under
2324 the same terms as Perl itself.
2325
2326 =head1 AUTHOR
2327
2328 Tara L Andrews E<lt>aurum@cpan.orgE<gt>