base/lib/Text/Tradition/Collation.pm

   1 package Text::Tradition::Collation;
   2
   3 use feature 'say';
   4 use Encode qw( decode_utf8 );
   5 use File::Temp;
   6 use File::Which;
   7 use Graph;
   8 use IPC::Run qw( run binary );
   9 use Text::CSV;
  10 use Text::Tradition::Collation::Data;
  11 use Text::Tradition::Collation::Reading;
  12 use Text::Tradition::Collation::RelationshipStore;
  13 use Text::Tradition::Error;
  14 use XML::Easy::Syntax qw( $xml10_namestartchar_rx $xml10_namechar_rx );
  15 use XML::LibXML;
  16 use XML::LibXML::XPathContext;
  17 use Moose;
  18
  19 has _data => (
  20         isa      => 'Text::Tradition::Collation::Data',
  21         is       => 'ro',
  22         required => 1,
  23         handles  => [ qw(
  24                 sequence
  25                 paths
  26                 _set_relations
  27                 relations
  28                 _set_start
  29                 _set_end
  30                 ac_label
  31                 has_cached_table
  32                 relationships
  33                 related_readings
  34                 get_relationship
  35                 del_relationship
  36                 equivalence
  37                 equivalence_graph
  38                 readings
  39                 reading
  40                 _add_reading
  41                 del_reading
  42                 has_reading
  43                 wit_list_separator
  44                 baselabel
  45                 linear
  46                 wordsep
  47                 start
  48                 end
  49                 cached_table
  50                 _graphcalc_done
  51                 has_cached_svg
  52                 wipe_table
  53         )]
  54 );
  55
  56 has 'tradition' => (
  57     is => 'ro',
  58     isa => 'Text::Tradition',
  59     writer => '_set_tradition',
  60     weak_ref => 1,
  61     );
  62
  63 =head1 NAME
  64
  65 Text::Tradition::Collation - a software model for a text collation
  66
  67 =head1 SYNOPSIS
  68
  69   use Text::Tradition;
  70   my $t = Text::Tradition->new(
  71     'name' => 'this is a text',
  72     'input' => 'TEI',
  73     'file' => '/path/to/tei_parallel_seg_file.xml' );
  74
  75   my $c = $t->collation;
  76   my @readings = $c->readings;
  77   my @paths = $c->paths;
  78   my @relationships = $c->relationships;
  79
  80   my $svg_variant_graph = $t->collation->as_svg();
  81
  82 =head1 DESCRIPTION
  83
  84 Text::Tradition is a library for representation and analysis of collated
  85 texts, particularly medieval ones.  The Collation is the central feature of
  86 a Tradition, where the text, its sequence of readings, and its relationships
  87 between readings are actually kept.
  88
  89 =head1 CONSTRUCTOR
  90
  91 =head2 new
  92
  93 The constructor.  Takes a hash or hashref of the following arguments:
  94
  95 =over
  96
  97 =item * tradition - The Text::Tradition object to which the collation
  98 belongs. Required.
  99
 100 =item * linear - Whether the collation should be linear; that is, whether
 101 transposed readings should be treated as two linked readings rather than one,
 102 and therefore whether the collation graph is acyclic.  Defaults to true.
 103
 104 =item * baselabel - The default label for the path taken by a base text
 105 (if any). Defaults to 'base text'.
 106
 107 =item * wit_list_separator - The string to join a list of witnesses for
 108 purposes of making labels in display graphs.  Defaults to ', '.
 109
 110 =item * ac_label - The extra label to tack onto a witness sigil when
 111 representing another layer of path for the given witness - that is, when
 112 a text has more than one possible reading due to scribal corrections or
 113 the like.  Defaults to ' (a.c.)'.
 114
 115 =item * wordsep - The string used to separate words in the original text.
 116 Defaults to ' '.
 117
 118 =back
 119
 120 =head1 ACCESSORS
 121
 122 =head2 tradition
 123
 124 =head2 linear
 125
 126 =head2 wit_list_separator
 127
 128 =head2 baselabel
 129
 130 =head2 ac_label
 131
 132 =head2 wordsep
 133
 134 Simple accessors for collation attributes.
 135
 136 =head2 start
 137
 138 The meta-reading at the start of every witness path.
 139
 140 =head2 end
 141
 142 The meta-reading at the end of every witness path.
 143
 144 =head2 readings
 145
 146 Returns all Reading objects in the graph.
 147
 148 =head2 reading( $id )
 149
 150 Returns the Reading object corresponding to the given ID.
 151
 152 =head2 add_reading( $reading_args )
 153
 154 Adds a new reading object to the collation.
 155 See L<Text::Tradition::Collation::Reading> for the available arguments.
 156
 157 =head2 del_reading( $object_or_id )
 158
 159 Removes the given reading from the collation, implicitly removing its
 160 paths and relationships.
 161
 162 =head2 has_reading( $id )
 163
 164 Predicate to see whether a given reading ID is in the graph.
 165
 166 =head2 reading_witnesses( $object_or_id )
 167
 168 Returns a list of sigils whose witnesses contain the reading.
 169
 170 =head2 paths
 171
 172 Returns all reading paths within the document - that is, all edges in the
 173 collation graph.  Each path is an arrayref of [ $source, $target ] reading IDs.
 174
 175 =head2 add_path( $source, $target, $sigil )
 176
 177 Links the given readings in the collation in sequence, under the given witness
 178 sigil.  The readings may be specified by object or ID.
 179
 180 =head2 del_path( $source, $target, $sigil )
 181
 182 Links the given readings in the collation in sequence, under the given witness
 183 sigil.  The readings may be specified by object or ID.
 184
 185 =head2 has_path( $source, $target );
 186
 187 Returns true if the two readings are linked in sequence in any witness.
 188 The readings may be specified by object or ID.
 189
 190 =head2 relationships
 191
 192 Returns all Relationship objects in the collation.
 193
 194 =head2 add_relationship( $reading, $other_reading, $options )
 195
 196 Adds a new relationship of the type given in $options between the two readings,
 197 which may be specified by object or ID.  Returns a value of ( $status, @vectors)
 198 where $status is true on success, and @vectors is a list of relationship edges
 199 that were ultimately added.
 200 See L<Text::Tradition::Collation::Relationship> for the available options.
 201
 202 =cut
 203
 204 sub BUILDARGS {
 205         my ( $class, @args ) = @_;
 206         my %args = @args == 1 ? %{ $args[0] } : @args;
 207         # TODO determine these from the Moose::Meta object
 208         my @delegate_attrs = qw(sequence relations readings wit_list_separator baselabel
 209                 linear wordsep start end cached_table _graphcalc_done);
 210         my %data_args;
 211         for my $attr (@delegate_attrs) {
 212                 $data_args{$attr} = delete $args{$attr} if exists $args{$attr};
 213         }
 214         $args{_data} = Text::Tradition::Collation::Data->new(%data_args);
 215         return \%args;
 216 }
 217
 218 sub BUILD {
 219     my $self = shift;
 220     $self->_set_relations( Text::Tradition::Collation::RelationshipStore->new( 'collation' => $self ) );
 221     $self->_set_start( $self->add_reading(
 222         { 'collation' => $self, 'is_start' => 1, 'init' => 1 } ) );
 223     $self->_set_end( $self->add_reading(
 224         { 'collation' => $self, 'is_end' => 1, 'init' => 1 } ) );
 225 }
 226
 227 sub register_relationship_type {
 228         my $self = shift;
 229         my %args = @_ == 1 ? %{$_[0]} : @_;
 230         if( $self->relations->has_type( $args{name} ) ) {
 231                 throw( 'Relationship type ' . $args{name} . ' already registered' );
 232         }
 233         $self->relations->add_type( %args );
 234 }
 235
 236 sub get_relationship_type {
 237         my( $self, $name ) = @_;
 238                 return $self->relations->has_type( $name )
 239                         ? $self->relations->type( $name ) : undef;
 240 }
 241
 242 ### Reading construct/destruct functions
 243
 244 sub add_reading {
 245         my( $self, $reading ) = @_;
 246         unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
 247                 my %args = %$reading;
 248                 if( $args{'init'} ) {
 249                         # If we are initializing an empty collation, don't assume that we
 250                         # have set a tradition.
 251                         delete $args{'init'};
 252                 } elsif( $self->tradition->can('language') && $self->tradition->has_language
 253                         && !exists $args{'language'} ) {
 254                         $args{'language'} = $self->tradition->language;
 255                 }
 256                 $reading = Text::Tradition::Collation::Reading->new(
 257                         'collation' => $self,
 258                         %args );
 259         }
 260         # First check to see if a reading with this ID exists.
 261         if( $self->reading( $reading->id ) ) {
 262                 throw( "Collation already has a reading with id " . $reading->id );
 263         }
 264         $self->_graphcalc_done(0);
 265         $self->_add_reading( $reading->id => $reading );
 266         # Once the reading has been added, put it in both graphs.
 267         $self->sequence->add_vertex( $reading->id );
 268         $self->relations->add_reading( $reading->id );
 269         return $reading;
 270 };
 271
 272 around del_reading => sub {
 273         my $orig = shift;
 274         my $self = shift;
 275         my $arg = shift;
 276
 277         if( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
 278                 $arg = $arg->id;
 279         }
 280         # Remove the reading from the graphs.
 281         $self->_graphcalc_done(0);
 282         $self->_clear_cache; # Explicitly clear caches to GC the reading
 283         $self->sequence->delete_vertex( $arg );
 284         $self->relations->delete_reading( $arg );
 285
 286         # Carry on.
 287         $self->$orig( $arg );
 288 };
 289
 290 =head2 merge_readings( $main, $second, $concatenate, $with_str )
 291
 292 Merges the $second reading into the $main one. If $concatenate is true, then
 293 the merged node will carry the text of both readings, concatenated with either
 294 $with_str (if specified) or a sensible default (the empty string if the
 295 appropriate 'join_*' flag is set on either reading, or else $self->wordsep.)
 296
 297 The first two arguments may be either readings or reading IDs.
 298
 299 =begin testing
 300
 301 use Text::Tradition;
 302
 303 my $cxfile = 't/data/Collatex-16.xml';
 304 my $t = Text::Tradition->new(
 305     'name'  => 'inline',
 306     'input' => 'CollateX',
 307     'file'  => $cxfile,
 308     );
 309 my $c = $t->collation;
 310
 311 my $rno = scalar $c->readings;
 312 # Split n21 ('unto') for testing purposes
 313 my $new_r = $c->add_reading( { 'id' => 'n21p0', 'text' => 'un', 'join_next' => 1 } );
 314 my $old_r = $c->reading( 'n21' );
 315 $old_r->alter_text( 'to' );
 316 $c->del_path( 'n20', 'n21', 'A' );
 317 $c->add_path( 'n20', 'n21p0', 'A' );
 318 $c->add_path( 'n21p0', 'n21', 'A' );
 319 $c->add_relationship( 'n21', 'n22', { type => 'collated', scope => 'local' } );
 320 $c->flatten_ranks();
 321 ok( $c->reading( 'n21p0' ), "New reading exists" );
 322 is( scalar $c->readings, $rno, "Reading add offset by flatten_ranks" );
 323
 324 # Combine n3 and n4 ( with his )
 325 $c->merge_readings( 'n3', 'n4', 1 );
 326 ok( !$c->reading('n4'), "Reading n4 is gone" );
 327 is( $c->reading('n3')->text, 'with his', "Reading n3 has both words" );
 328
 329 # Collapse n9 and n10 ( rood / root )
 330 $c->merge_readings( 'n9', 'n10' );
 331 ok( !$c->reading('n10'), "Reading n10 is gone" );
 332 is( $c->reading('n9')->text, 'rood', "Reading n9 has an unchanged word" );
 333
 334 # Combine n21 and n21p0
 335 my $remaining = $c->reading('n21');
 336 $remaining ||= $c->reading('n22');  # one of these should still exist
 337 $c->merge_readings( 'n21p0', $remaining, 1 );
 338 ok( !$c->reading('n21'), "Reading $remaining is gone" );
 339 is( $c->reading('n21p0')->text, 'unto', "Reading n21p0 merged correctly" );
 340
 341 =end testing
 342
 343 =cut
 344
 345 sub merge_readings {
 346         my $self = shift;
 347
 348         # Sanity check
 349         my( $kept_obj, $del_obj, $combine, $combine_char ) = $self->_objectify_args( @_ );
 350         my $mergemeta = $kept_obj->is_meta;
 351         throw( "Cannot merge meta and non-meta reading" )
 352                 unless ( $mergemeta && $del_obj->is_meta )
 353                         || ( !$mergemeta && !$del_obj->is_meta );
 354         if( $mergemeta ) {
 355                 throw( "Cannot merge with start or end node" )
 356                         if( $kept_obj eq $self->start || $kept_obj eq $self->end
 357                                 || $del_obj eq $self->start || $del_obj eq $self->end );
 358                 throw( "Cannot combine text of meta readings" ) if $combine;
 359         }
 360         # We only need the IDs for adding paths to the graph, not the reading
 361         # objects themselves.
 362         my $kept = $kept_obj->id;
 363         my $deleted = $del_obj->id;
 364         $self->_graphcalc_done(0);
 365
 366     # The kept reading should inherit the paths and the relationships
 367     # of the deleted reading.
 368         foreach my $path ( $self->sequence->edges_at( $deleted ) ) {
 369                 my @vector = ( $kept );
 370                 push( @vector, $path->[1] ) if $path->[0] eq $deleted;
 371                 unshift( @vector, $path->[0] ) if $path->[1] eq $deleted;
 372                 next if $vector[0] eq $vector[1]; # Don't add a self loop
 373                 my %wits = %{$self->sequence->get_edge_attributes( @$path )};
 374                 $self->sequence->add_edge( @vector );
 375                 my $fwits = $self->sequence->get_edge_attributes( @vector );
 376                 @wits{keys %$fwits} = values %$fwits;
 377                 $self->sequence->set_edge_attributes( @vector, \%wits );
 378         }
 379         $self->relations->merge_readings( $kept, $deleted, $combine );
 380
 381         # Do the deletion deed.
 382         if( $combine ) {
 383                 # Combine the text of the readings
 384                 my $joinstr = $combine_char;
 385                 unless( defined $joinstr ) {
 386                         $joinstr = '' if $kept_obj->join_next || $del_obj->join_prior;
 387                         $joinstr = $self->wordsep unless defined $joinstr;
 388                 }
 389                 $kept_obj->_combine( $del_obj, $joinstr );
 390         }
 391         $self->del_reading( $deleted );
 392 }
 393
 394 =head2 merge_related( @relationship_types )
 395
 396 Merge all readings linked with the relationship types given. If any of the selected type(s) is not a colocation, the graph will no longer be linear. The majority/plurality reading in each case will be the one kept.
 397
 398 WARNING: This operation cannot be undone.
 399
 400 =cut
 401
 402 =begin testing
 403
 404 use Text::Tradition;
 405 use TryCatch;
 406
 407 my $t = Text::Tradition->new(
 408     'name'  => 'inline',
 409     'input' => 'Self',
 410     'file'  => 't/data/legendfrag.xml',
 411     );
 412 my $c = $t->collation;
 413
 414 my %rdg_ids;
 415 map { $rdg_ids{$_} = 1 } $c->readings;
 416 $c->merge_related( 'orthographic' );
 417 is( scalar( $c->readings ), keys( %rdg_ids ) - 8,
 418         "Successfully collapsed orthographic variation" );
 419 map { $rdg_ids{$_} = undef } qw/ r13.3 r11.4 r8.5 r8.2 r7.7 r7.5 r7.4 r7.1 /;
 420 foreach my $rid ( keys %rdg_ids ) {
 421         my $exp = $rdg_ids{$rid};
 422         is( !$c->reading( $rid ), !$exp, "Reading $rid correctly " .
 423                 ( $exp ? "retained" : "removed" ) );
 424 }
 425 ok( $c->linear, "Graph is still linear" );
 426 try {
 427         $c->calculate_ranks; # This should succeed
 428         ok( 1, "Can still calculate ranks on the new graph" );
 429 } catch {
 430         ok( 0, "Rank calculation on merged graph failed: $@" );
 431 }
 432
 433 # Now add some transpositions
 434 $c->add_relationship( 'r8.4', 'r10.4', { type => 'transposition' } );
 435 $c->merge_related( 'transposition' );
 436 is( scalar( $c->readings ), keys( %rdg_ids ) - 9,
 437         "Transposed relationship is merged away" );
 438 ok( !$c->reading('r8.4'), "Correct transposed reading removed" );
 439 ok( !$c->linear, "Graph is no longer linear" );
 440 try {
 441         $c->calculate_ranks; # This should fail
 442         ok( 0, "Rank calculation happened on nonlinear graph?!" );
 443 } catch ( Text::Tradition::Error $e ) {
 444         is( $e->message, 'Cannot calculate ranks on a non-linear graph',
 445                 "Rank calculation on merged graph threw an error" );
 446 }
 447
 448
 449
 450 =end testing
 451
 452 =cut
 453
 454 # TODO: there should be a way to display merged without affecting the underlying data!
 455
 456 sub merge_related {
 457         my $self = shift;
 458         my %reltypehash;
 459         map { $reltypehash{$_} = 1 } @_;
 460
 461         # Set up the filter for finding related readings
 462         my $filter = sub {
 463                 exists $reltypehash{$_[0]->type};
 464         };
 465
 466         my $linear = 1;
 467         # Go through all readings looking for related ones
 468         foreach my $r ( $self->readings ) {
 469                 next unless $self->reading( "$r" ); # might have been deleted meanwhile
 470                 my @related = $self->related_readings( $r, $filter );
 471                 if( @related ) {
 472                         push( @related, $r );
 473                         @related = sort {
 474                                         scalar $b->witnesses <=> scalar $a->witnesses
 475                                 } @related;
 476                         my $keep = shift @related;
 477                         foreach my $delr ( @related ) {
 478                                 $linear = undef
 479                                         unless( $self->get_relationship( $keep, $delr )->colocated );
 480                                 $self->merge_readings( $keep, $delr );
 481                         }
 482                 }
 483         }
 484         $self->linear( $linear );
 485 }
 486
 487 =head2 compress_readings
 488
 489 Where possible in the graph, compresses plain sequences of readings into a
 490 single reading. The sequences must consist of readings with no
 491 relationships to other readings, with only a single witness path between
 492 them and no other witness paths from either that would skip the other. The
 493 readings must also not be marked as nonsense or bad grammar.
 494
 495 WARNING: This operation cannot be undone.
 496
 497 =cut
 498
 499 sub compress_readings {
 500         my $self = shift;
 501         # Anywhere in the graph that there is a reading that joins only to a single
 502         # successor, and neither of these have any relationships, just join the two
 503         # readings.
 504         foreach my $rdg ( sort { $a->rank <=> $b->rank } $self->readings ) {
 505                 # Now look for readings that can be joined to their successors.
 506                 next unless $rdg->is_combinable;
 507                 my %seen;
 508                 while( $self->sequence->successors( $rdg ) == 1 ) {
 509                         my( $next ) = $self->reading( $self->sequence->successors( $rdg ) );
 510                         throw( "Infinite loop" ) if $seen{$next->id};
 511                         $seen{$next->id} = 1;
 512                         last if $self->sequence->predecessors( $next ) > 1;
 513                         last unless $next->is_combinable;
 514                         say "Joining readings $rdg and $next";
 515                         $self->merge_readings( $rdg, $next, 1 );
 516                 }
 517         }
 518         # Make sure we haven't screwed anything up
 519         foreach my $wit ( $self->tradition->witnesses ) {
 520                 my $pathtext = $self->path_text( $wit->sigil );
 521                 my $origtext = join( ' ', @{$wit->text} );
 522                 throw( "Text differs for witness " . $wit->sigil )
 523                         unless $pathtext eq $origtext;
 524                 if( $wit->is_layered ) {
 525                         $pathtext = $self->path_text( $wit->sigil.$self->ac_label );
 526                         $origtext = join( ' ', @{$wit->layertext} );
 527                         throw( "Ante-corr text differs for witness " . $wit->sigil )
 528                                 unless $pathtext eq $origtext;
 529                 }
 530         }
 531
 532         $self->relations->rebuild_equivalence();
 533         $self->calculate_ranks();
 534 }
 535
 536 # Helper function for manipulating the graph.
 537 sub _stringify_args {
 538         my( $self, $first, $second, @args ) = @_;
 539     $first = $first->id
 540         if ref( $first ) eq 'Text::Tradition::Collation::Reading';
 541     $second = $second->id
 542         if ref( $second ) eq 'Text::Tradition::Collation::Reading';
 543     return( $first, $second, @args );
 544 }
 545
 546 # Helper function for manipulating the graph.
 547 sub _objectify_args {
 548         my( $self, $first, $second, $arg ) = @_;
 549     $first = $self->reading( $first )
 550         unless ref( $first ) eq 'Text::Tradition::Collation::Reading';
 551     $second = $self->reading( $second )
 552         unless ref( $second ) eq 'Text::Tradition::Collation::Reading';
 553     return( $first, $second, $arg );
 554 }
 555
 556 =head2 duplicate_reading( $reading, @witlist )
 557
 558 Split the given reading into two, so that the new reading is in the path for
 559 the witnesses given in @witlist. If the result is that certain non-colocated
 560 relationships (e.g. transpositions) are no longer valid, these will be removed.
 561 Returns the newly-created reading.
 562
 563 =begin testing
 564
 565 use Test::More::UTF8;
 566 use Text::Tradition;
 567 use TryCatch;
 568
 569 my $st = Text::Tradition->new( 'input' => 'Self', 'file' => 't/data/collatecorr.xml' );
 570 is( ref( $st ), 'Text::Tradition', "Got a tradition from test file" );
 571 ok( $st->has_witness('Ba96'), "Tradition has the affected witness" );
 572
 573 my $sc = $st->collation;
 574 my $numr = 17;
 575 ok( $sc->reading('n131'), "Tradition has the affected reading" );
 576 is( scalar( $sc->readings ), $numr, "There are $numr readings in the graph" );
 577 is( $sc->end->rank, 14, "There are fourteen ranks in the graph" );
 578
 579 # Detach the erroneously collated reading
 580 my( $newr, @del_rdgs ) = $sc->duplicate_reading( 'n131', 'Ba96' );
 581 ok( $newr, "New reading was created" );
 582 ok( $sc->reading('n131_0'), "Detached the bad collation with a new reading" );
 583 is( scalar( $sc->readings ), $numr + 1, "A reading was added to the graph" );
 584 is( $sc->end->rank, 10, "There are now only ten ranks in the graph" );
 585 my $csucc = $sc->common_successor( 'n131', 'n131_0' );
 586 is( $csucc->id, 'n136', "Found correct common successor to duped reading" );
 587
 588 # Check that the bad transposition is gone
 589 is( scalar @del_rdgs, 1, "Deleted reading was returned by API call" );
 590 is( $sc->get_relationship( 'n130', 'n135' ), undef, "Bad transposition relationship is gone" );
 591
 592 # The collation should not be fixed
 593 my @pairs = $sc->identical_readings();
 594 is( scalar @pairs, 0, "Not re-collated yet" );
 595 # Fix the collation
 596 ok( $sc->merge_readings( 'n124', 'n131_0' ), "Collated the readings correctly" );
 597 @pairs = $sc->identical_readings( start => 'n124', end => $csucc->id );
 598 is( scalar @pairs, 3, "Found three more identical readings" );
 599 is( $sc->end->rank, 11, "The ranks shifted appropriately" );
 600 $sc->flatten_ranks();
 601 is( scalar( $sc->readings ), $numr - 3, "Now we are collated correctly" );
 602
 603 # Check that we can't "duplicate" a reading with no wits or with all wits
 604 try {
 605         my( $badr, @del_rdgs ) = $sc->duplicate_reading( 'n124' );
 606         ok( 0, "Reading duplication without witnesses throws an error" );
 607 } catch( Text::Tradition::Error $e ) {
 608         like( $e->message, qr/Must specify one or more witnesses/,
 609                 "Reading duplication without witnesses throws the expected error" );
 610 } catch {
 611         ok( 0, "Reading duplication without witnesses threw the wrong error" );
 612 }
 613
 614 try {
 615         my( $badr, @del_rdgs ) = $sc->duplicate_reading( 'n124', 'Ba96', 'Mü11475' );
 616         ok( 0, "Reading duplication with all witnesses throws an error" );
 617 } catch( Text::Tradition::Error $e ) {
 618         like( $e->message, qr/Cannot join all witnesses/,
 619                 "Reading duplication with all witnesses throws the expected error" );
 620 } catch {
 621         ok( 0, "Reading duplication with all witnesses threw the wrong error" );
 622 }
 623
 624 =end testing
 625
 626 =cut
 627
 628 sub duplicate_reading {
 629         my( $self, $r, @wits ) = @_;
 630         # Check that we are not doing anything unwise.
 631         throw( "Must specify one or more witnesses for the duplicated reading" )
 632                 unless @wits;
 633         unless( ref( $r ) eq 'Text::Tradition::Collation::Reading' ) {
 634                 $r = $self->reading( $r );
 635         }
 636         throw( "Cannot duplicate a meta-reading" )
 637                 if $r->is_meta;
 638         throw( "Cannot join all witnesses to the new reading" )
 639                 if scalar( @wits ) == scalar( $r->witnesses );
 640
 641         # Get all the reading attributes and duplicate them.
 642         my $rmeta = Text::Tradition::Collation::Reading->meta;
 643         my %args;
 644     foreach my $attr( $rmeta->get_all_attributes ) {
 645                 next if $attr->name =~ /^_/;
 646                 my $acc = $attr->get_read_method;
 647                 if( !$acc && $attr->has_applied_traits ) {
 648                         my $tr = $attr->applied_traits;
 649                         if( $tr->[0] =~ /::(Array|Hash)$/ ) {
 650                                 my $which = $1;
 651                                 my %methods = reverse %{$attr->handles};
 652                                 $acc = $methods{elements};
 653                                 $args{$attr->name} = $which eq 'Array'
 654                                         ? [ $r->$acc ] : { $r->$acc };
 655                         }
 656                 } else {
 657                         $args{$attr->name} = $r->$acc if $acc;
 658                 }
 659         }
 660         # By definition the new reading will no longer be common.
 661         $args{is_common} = 0;
 662         # The new reading also needs its own ID.
 663         $args{id} = $self->_generate_dup_id( $r->id );
 664
 665         # Try to make the new reading.
 666         my $newr = $self->add_reading( \%args );
 667         # The old reading is also no longer common.
 668         $r->is_common( 0 );
 669
 670         # For each of the witnesses, dissociate from the old reading and
 671         # associate with the new.
 672         foreach my $wit ( @wits ) {
 673                 my $prior = $self->prior_reading( $r, $wit );
 674                 my $next = $self->next_reading( $r, $wit );
 675                 $self->del_path( $prior, $r, $wit );
 676                 $self->add_path( $prior, $newr, $wit );
 677                 $self->del_path( $r, $next, $wit );
 678                 $self->add_path( $newr, $next, $wit );
 679         }
 680
 681         # If the graph is ranked, we need to look for relationships that are now
 682         # invalid (i.e. 'non-colocation' types that might now be colocated) and
 683         # remove them. If not, we can skip it.
 684         my $succ;
 685         my %rrk;
 686         my @deleted_relations;
 687         if( $self->end->has_rank ) {
 688                 # Find the point where we can stop checking
 689                 $succ = $self->common_successor( $r, $newr );
 690
 691                 # Hash the existing ranks
 692                 foreach my $rdg ( $self->readings ) {
 693                         $rrk{$rdg->id} = $rdg->rank;
 694                 }
 695                 # Calculate the new ranks
 696                 $self->calculate_ranks();
 697
 698                 # Check for invalid non-colocated relationships among changed-rank readings
 699                 # from where the ranks start changing up to $succ
 700                 my $lastrank = $succ->rank;
 701                 foreach my $rdg ( $self->readings ) {
 702                         next if $rdg->rank > $lastrank;
 703                         next if $rdg->rank == $rrk{$rdg->id};
 704                         my @noncolo = $rdg->related_readings( sub { !$_[0]->colocated } );
 705                         next unless @noncolo;
 706                         foreach my $nc ( @noncolo ) {
 707                                 unless( $self->relations->verify_or_delete( $rdg, $nc ) ) {
 708                                         push( @deleted_relations, [ $rdg->id, $nc->id ] );
 709                                 }
 710                         }
 711                 }
 712         }
 713         return ( $newr, @deleted_relations );
 714 }
 715
 716 sub _generate_dup_id {
 717         my( $self, $rid ) = @_;
 718         my $newid;
 719         my $i = 0;
 720         while( !$newid ) {
 721                 $newid = $rid."_$i";
 722                 if( $self->has_reading( $newid ) ) {
 723                         $newid = '';
 724                         $i++;
 725                 }
 726         }
 727         return $newid;
 728 }
 729
 730 ### Path logic
 731
 732 sub add_path {
 733         my $self = shift;
 734
 735         # We only need the IDs for adding paths to the graph, not the reading
 736         # objects themselves.
 737     my( $source, $target, $wit ) = $self->_stringify_args( @_ );
 738
 739         $self->_graphcalc_done(0);
 740         # Connect the readings
 741         unless( $self->sequence->has_edge( $source, $target ) ) {
 742             $self->sequence->add_edge( $source, $target );
 743             $self->relations->add_equivalence_edge( $source, $target );
 744         }
 745     # Note the witness in question
 746     $self->sequence->set_edge_attribute( $source, $target, $wit, 1 );
 747 }
 748
 749 sub del_path {
 750         my $self = shift;
 751         my @args;
 752         if( ref( $_[0] ) eq 'ARRAY' ) {
 753                 my $e = shift @_;
 754                 @args = ( @$e, @_ );
 755         } else {
 756                 @args = @_;
 757         }
 758
 759         # We only need the IDs for removing paths from the graph, not the reading
 760         # objects themselves.
 761     my( $source, $target, $wit ) = $self->_stringify_args( @args );
 762
 763         $self->_graphcalc_done(0);
 764         if( $self->sequence->has_edge_attribute( $source, $target, $wit ) ) {
 765                 $self->sequence->delete_edge_attribute( $source, $target, $wit );
 766         }
 767         unless( $self->sequence->has_edge_attributes( $source, $target ) ) {
 768                 $self->sequence->delete_edge( $source, $target );
 769                 $self->relations->delete_equivalence_edge( $source, $target );
 770         }
 771 }
 772
 773
 774 # Extra graph-alike utility
 775 sub has_path {
 776         my $self = shift;
 777     my( $source, $target, $wit ) = $self->_stringify_args( @_ );
 778         return undef unless $self->sequence->has_edge( $source, $target );
 779         return $self->sequence->has_edge_attribute( $source, $target, $wit );
 780 }
 781
 782 =head2 clear_witness( @sigil_list )
 783
 784 Clear the given witnesses out of the collation entirely, removing references
 785 to them in paths, and removing readings that belong only to them.  Should only
 786 be called via $tradition->del_witness.
 787
 788 =cut
 789
 790 sub clear_witness {
 791         my( $self, @sigils ) = @_;
 792
 793         $self->_graphcalc_done(0);
 794         # Clear the witness(es) out of the paths
 795         foreach my $e ( $self->paths ) {
 796                 foreach my $sig ( @sigils ) {
 797                         $self->del_path( $e, $sig );
 798                 }
 799         }
 800
 801         # Clear out the newly unused readings
 802         foreach my $r ( $self->readings ) {
 803                 unless( $self->reading_witnesses( $r ) ) {
 804                         $self->del_reading( $r );
 805                 }
 806         }
 807 }
 808
 809 sub add_relationship {
 810         my $self = shift;
 811     my( $source, $target, $opts ) = $self->_stringify_args( @_ );
 812     my( @vectors ) = $self->relations->add_relationship( $source, $target, $opts );
 813     foreach my $v ( @vectors ) {
 814         next unless $self->get_relationship( $v )->colocated;
 815         if( $self->reading( $v->[0] )->has_rank && $self->reading( $v->[1] )->has_rank
 816                 && $self->reading( $v->[0] )->rank ne $self->reading( $v->[1] )->rank ) {
 817                         $self->_graphcalc_done(0);
 818                         $self->_clear_cache;
 819                         last;
 820         }
 821     }
 822     return @vectors;
 823 }
 824
 825 around qw/ get_relationship del_relationship / => sub {
 826         my $orig = shift;
 827         my $self = shift;
 828         my @args = @_;
 829         if( @args == 1 && ref( $args[0] ) eq 'ARRAY' ) {
 830                 @args = @{$_[0]};
 831         }
 832         my @stringargs = $self->_stringify_args( @args );
 833         $self->$orig( @stringargs );
 834 };
 835
 836 =head2 reading_witnesses( $reading )
 837
 838 Return a list of sigils corresponding to the witnesses in which the reading appears.
 839
 840 =cut
 841
 842 sub reading_witnesses {
 843         my( $self, $reading ) = @_;
 844         # We need only check either the incoming or the outgoing edges; I have
 845         # arbitrarily chosen "incoming".  Thus, special-case the start node.
 846         if( $reading eq $self->start ) {
 847                 return map { $_->sigil } grep { $_->is_collated } $self->tradition->witnesses;
 848         }
 849         my %all_witnesses;
 850         foreach my $e ( $self->sequence->edges_to( $reading ) ) {
 851                 my $wits = $self->sequence->get_edge_attributes( @$e );
 852                 @all_witnesses{ keys %$wits } = 1;
 853         }
 854         my $acstr = $self->ac_label;
 855         foreach my $acwit ( grep { $_ =~ s/^(.*)\Q$acstr\E$/$1/ } keys %all_witnesses ) {
 856                 delete $all_witnesses{$acwit.$acstr} if exists $all_witnesses{$acwit};
 857         }
 858         return keys %all_witnesses;
 859 }
 860
 861 =head1 OUTPUT METHODS
 862
 863 =head2 as_svg( \%options )
 864
 865 Returns an SVG string that represents the graph, via as_dot and graphviz.
 866 See as_dot for a list of options.  Must have GraphViz (dot) installed to run.
 867
 868 =cut
 869
 870 sub as_svg {
 871     my( $self, $opts ) = @_;
 872     throw( "Need GraphViz installed to output SVG" )
 873         unless File::Which::which( 'dot' );
 874     my $want_subgraph = exists $opts->{'from'} || exists $opts->{'to'};
 875     $self->calculate_ranks()
 876         unless( $self->_graphcalc_done || $opts->{'nocalc'} || !$self->linear );
 877         my @cmd = qw/dot -Tsvg/;
 878         my( $svg, $err );
 879         my $dotfile = File::Temp->new();
 880         ## USE FOR DEBUGGING
 881         # $dotfile->unlink_on_destroy(0);
 882         binmode $dotfile, ':utf8';
 883         print $dotfile $self->as_dot( $opts );
 884         push( @cmd, $dotfile->filename );
 885         run( \@cmd, ">", binary(), \$svg );
 886         $svg = decode_utf8( $svg );
 887         return $svg;
 888 }
 889
 890
 891 =head2 as_dot( \%options )
 892
 893 Returns a string that is the collation graph expressed in dot
 894 (i.e. GraphViz) format.  Options include:
 895
 896 =over 4
 897
 898 =item * from
 899
 900 =item * to
 901
 902 =item * color_common
 903
 904 =back
 905
 906 =cut
 907
 908 sub as_dot {
 909     my( $self, $opts ) = @_;
 910     my $startrank = $opts->{'from'} if $opts;
 911     my $endrank = $opts->{'to'} if $opts;
 912     my $color_common = $opts->{'color_common'} if $opts;
 913     my $STRAIGHTENHACK = !$startrank && !$endrank && $self->end->rank
 914        && $self->end->rank > 100;
 915     $STRAIGHTENHACK = 1 if $opts->{'straight'}; # even for subgraphs or small graphs
 916
 917     # Check the arguments
 918     if( $startrank ) {
 919         return if $endrank && $startrank > $endrank;
 920         return if $startrank > $self->end->rank;
 921         }
 922         if( defined $endrank ) {
 923                 return if $endrank < 0;
 924                 $endrank = undef if $endrank == $self->end->rank;
 925         }
 926
 927     my $graph_name = $self->tradition->name;
 928     $graph_name =~ s/[^\w\s]//g;
 929     $graph_name = join( '_', split( /\s+/, $graph_name ) );
 930
 931     my %graph_attrs = (
 932         'rankdir' => 'LR',
 933         'bgcolor' => 'none',
 934         );
 935     my %node_attrs = (
 936         'fontsize' => 14,
 937         'fillcolor' => 'white',
 938         'style' => 'filled',
 939         'shape' => 'ellipse'
 940         );
 941     my %edge_attrs = (
 942         'arrowhead' => 'open',
 943         'color' => '#000000',
 944         'fontcolor' => '#000000',
 945         );
 946
 947     my $dot = sprintf( "digraph %s {\n", $graph_name );
 948     $dot .= "\tgraph " . _dot_attr_string( \%graph_attrs ) . ";\n";
 949     $dot .= "\tnode " . _dot_attr_string( \%node_attrs ) . ";\n";
 950
 951         # Output substitute start/end readings if necessary
 952         if( $startrank ) {
 953                 $dot .= "\t\"__SUBSTART__\" [ label=\"...\",id=\"__START__\" ];\n";
 954         }
 955         if( $endrank ) {
 956                 $dot .= "\t\"__SUBEND__\" [ label=\"...\",id=\"__END__\" ];\n";
 957         }
 958         if( $STRAIGHTENHACK ) {
 959                 ## HACK part 1
 960                 my $startlabel = $startrank ? '__SUBSTART__' : '__START__';
 961                 $dot .= "\tsubgraph { rank=same \"$startlabel\" \"#SILENT#\" }\n";
 962                 $dot .= "\t\"#SILENT#\" [ shape=diamond,color=white,penwidth=0,label=\"\" ];"
 963         }
 964         my %used;  # Keep track of the readings that actually appear in the graph
 965         # Sort the readings by rank if we have ranks; this speeds layout.
 966         my @all_readings = $self->end->has_rank
 967                 ? sort { $a->rank <=> $b->rank } $self->readings
 968                 : $self->readings;
 969         # TODO Refrain from outputting lacuna nodes - just grey out the edges.
 970     foreach my $reading ( @all_readings ) {
 971         # Only output readings within our rank range.
 972         next if $startrank && $reading->rank < $startrank;
 973         next if $endrank && $reading->rank > $endrank;
 974         $used{$reading->id} = 1;
 975         # Need not output nodes without separate labels
 976         next if $reading->id eq $reading->text;
 977         my $rattrs;
 978         my $label = $reading->text;
 979         $label .= '-' if $reading->join_next;
 980         $label = "-$label" if $reading->join_prior;
 981         $label =~ s/\"/\\\"/g;
 982                 $rattrs->{'label'} = $label;
 983                 $rattrs->{'id'} = $reading->id;
 984                 $rattrs->{'fillcolor'} = '#b3f36d' if $reading->is_common && $color_common;
 985         $dot .= sprintf( "\t\"%s\" %s;\n", $reading->id, _dot_attr_string( $rattrs ) );
 986     }
 987
 988         # Add the real edges. Need to weight one edge per rank jump, in a
 989         # continuous line.
 990         # my $weighted = $self->_add_edge_weights;
 991     my @edges = $self->paths;
 992         my( %substart, %subend );
 993     foreach my $edge ( @edges ) {
 994         # Do we need to output this edge?
 995         if( $used{$edge->[0]} && $used{$edge->[1]} ) {
 996                 my $label = $self->_path_display_label( $opts,
 997                         $self->path_witnesses( $edge ) );
 998                         my $variables = { %edge_attrs, 'label' => $label };
 999
1000                         # Account for the rank gap if necessary
1001                         my $rank0 = $self->reading( $edge->[0] )->rank
1002                                 if $self->reading( $edge->[0] )->has_rank;
1003                         my $rank1 = $self->reading( $edge->[1] )->rank
1004                                 if $self->reading( $edge->[1] )->has_rank;
1005                         if( defined $rank0 && defined $rank1 && $rank1 - $rank0 > 1 ) {
1006                                 $variables->{'minlen'} = $rank1 - $rank0;
1007                         }
1008
1009                         # Add the calculated edge weights
1010                         # if( exists $weighted->{$edge->[0]}
1011                         #       && $weighted->{$edge->[0]} eq $edge->[1] ) {
1012                         #       # $variables->{'color'} = 'red';
1013                         #       $variables->{'weight'} = 3.0;
1014                         # }
1015
1016                         # EXPERIMENTAL: make edge width reflect no. of witnesses
1017                         my $extrawidth = scalar( $self->path_witnesses( $edge ) ) * 0.2;
1018                         $variables->{'penwidth'} = $extrawidth + 0.8; # gives 1 for a single wit
1019
1020                         my $varopts = _dot_attr_string( $variables );
1021                         $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
1022                                 $edge->[0], $edge->[1], $varopts );
1023         } elsif( $used{$edge->[0]} ) {
1024                 $subend{$edge->[0]} = $edge->[1];
1025         } elsif( $used{$edge->[1]} ) {
1026                 $substart{$edge->[1]} = $edge->[0];
1027         }
1028     }
1029
1030     # If we are asked to, add relationship links
1031     if( exists $opts->{show_relations} ) {
1032         my $filter = $opts->{show_relations}; # can be 'transposition' or 'all'
1033         if( $filter eq 'transposition' ) {
1034                 $filter =~ qr/^transposition$/;
1035         }
1036         my %typecolors;
1037         my @types = sort( map { $_->name } $self->relations->types );
1038         if( exists $opts->{graphcolors} ) {
1039                 foreach my $tdx ( 0 .. $#types ) {
1040                         $typecolors{$types[$tdx]} = $opts->{graphcolors}->[$tdx];
1041                 }
1042         } else {
1043                 map { $typecolors{$_} = '#FFA14F' } @types;
1044         }
1045         foreach my $redge ( $self->relationships ) {
1046                 if( $used{$redge->[0]} && $used{$redge->[1]} ) {
1047                                 my $rel = $self->get_relationship( $redge );
1048                                 next unless $filter eq 'all' || $rel->type =~ /$filter/;
1049                                 my $variables = {
1050                                         arrowhead => 'none',
1051                                         color => $typecolors{$rel->type},
1052                                         constraint => 'false',
1053                                         penwidth => '3',
1054                                 };
1055                                 unless( exists $opts->{graphcolors} ) {
1056                                         $variables->{label} = uc( substr( $rel->type, 0, 4 ) ),
1057                                 }
1058                                 $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
1059                                         $redge->[0], $redge->[1], _dot_attr_string( $variables ) );
1060                 }
1061         }
1062     }
1063
1064     # Add substitute start and end edges if necessary
1065     foreach my $node ( keys %substart ) {
1066         my $witstr = $self->_path_display_label( $opts,
1067                 $self->path_witnesses( $substart{$node}, $node ) );
1068         my $variables = { %edge_attrs, 'label' => $witstr };
1069         my $nrdg = $self->reading( $node );
1070         if( $nrdg->has_rank && $nrdg->rank > $startrank ) {
1071                 # Substart is actually one lower than $startrank
1072                 $variables->{'minlen'} = $nrdg->rank - ( $startrank - 1 );
1073         }
1074         my $varopts = _dot_attr_string( $variables );
1075         $dot .= "\t\"__SUBSTART__\" -> \"$node\" $varopts;\n";
1076         }
1077     foreach my $node ( keys %subend ) {
1078         my $witstr = $self->_path_display_label( $opts,
1079                 $self->path_witnesses( $node, $subend{$node} ) );
1080         my $variables = { %edge_attrs, 'label' => $witstr };
1081         my $varopts = _dot_attr_string( $variables );
1082         $dot .= "\t\"$node\" -> \"__SUBEND__\" $varopts;\n";
1083         }
1084         # HACK part 2
1085         if( $STRAIGHTENHACK ) {
1086                 my $endlabel = $endrank ? '__SUBEND__' : '__END__';
1087                 $dot .= "\t\"$endlabel\" -> \"#SILENT#\" [ color=white,penwidth=0 ];\n";
1088         }
1089
1090     $dot .= "}\n";
1091     return $dot;
1092 }
1093
1094 sub _dot_attr_string {
1095         my( $hash ) = @_;
1096         my @attrs;
1097         foreach my $k ( sort keys %$hash ) {
1098                 my $v = $hash->{$k};
1099                 push( @attrs, $k.'="'.$v.'"' );
1100         }
1101         return( '[ ' . join( ', ', @attrs ) . ' ]' );
1102 }
1103
1104 sub _add_edge_weights {
1105         my $self = shift;
1106         # Walk the graph from START to END, choosing the successor node with
1107         # the largest number of witness paths each time.
1108         my $weighted = {};
1109         my $curr = $self->start->id;
1110         my $ranked = $self->end->has_rank;
1111         while( $curr ne $self->end->id ) {
1112                 my $rank = $ranked ? $self->reading( $curr )->rank : 0;
1113                 my @succ = sort { $self->path_witnesses( $curr, $a )
1114                                                         <=> $self->path_witnesses( $curr, $b ) }
1115                         $self->sequence->successors( $curr );
1116                 my $next = pop @succ;
1117                 my $nextrank = $ranked ? $self->reading( $next )->rank : 0;
1118                 # Try to avoid lacunae in the weighted path.
1119                 while( @succ &&
1120                            ( $self->reading( $next )->is_lacuna ||
1121                                  $nextrank - $rank > 1 ) ){
1122                         $next = pop @succ;
1123                 }
1124                 $weighted->{$curr} = $next;
1125                 $curr = $next;
1126         }
1127         return $weighted;
1128 }
1129
1130 =head2 path_witnesses( $edge )
1131
1132 Returns the list of sigils whose witnesses are associated with the given edge.
1133 The edge can be passed as either an array or an arrayref of ( $source, $target ).
1134
1135 =cut
1136
1137 sub path_witnesses {
1138         my( $self, @edge ) = @_;
1139         # If edge is an arrayref, cope.
1140         if( @edge == 1 && ref( $edge[0] ) eq 'ARRAY' ) {
1141                 my $e = shift @edge;
1142                 @edge = @$e;
1143         }
1144         my @wits = keys %{$self->sequence->get_edge_attributes( @edge )};
1145         return @wits;
1146 }
1147
1148 # Helper function. Make a display label for the given witnesses, showing a.c.
1149 # witnesses only where the main witness is not also in the list.
1150 sub _path_display_label {
1151         my $self = shift;
1152         my $opts = shift;
1153         my %wits;
1154         map { $wits{$_} = 1 } @_;
1155
1156         # If an a.c. wit is listed, remove it if the main wit is also listed.
1157         # Otherwise keep it for explicit listing.
1158         my $aclabel = $self->ac_label;
1159         my @disp_ac;
1160         foreach my $w ( sort keys %wits ) {
1161                 if( $w =~ /^(.*)\Q$aclabel\E$/ ) {
1162                         if( exists $wits{$1} ) {
1163                                 delete $wits{$w};
1164                         } else {
1165                                 push( @disp_ac, $w );
1166                         }
1167                 }
1168         }
1169
1170         if( $opts->{'explicit_wits'} ) {
1171                 return join( ', ', sort keys %wits );
1172         } else {
1173                 # See if we are in a majority situation.
1174                 my $maj = scalar( $self->tradition->witnesses ) * 0.6;
1175                 $maj = $maj > 5 ? $maj : 5;
1176                 if( scalar keys %wits > $maj ) {
1177                         unshift( @disp_ac, 'majority' );
1178                         return join( ', ', @disp_ac );
1179                 } else {
1180                         return join( ', ', sort keys %wits );
1181                 }
1182         }
1183 }
1184
1185 =head2 readings_at_rank( $rank )
1186
1187 Returns a list of readings at a given rank, taken from the alignment table.
1188
1189 =cut
1190
1191 sub readings_at_rank {
1192         my( $self, $rank ) = @_;
1193         my $table = $self->alignment_table;
1194         # Table rank is real rank - 1.
1195         my @elements = map { $_->{'tokens'}->[$rank-1] } @{$table->{'alignment'}};
1196         my %readings;
1197         foreach my $e ( @elements ) {
1198                 next unless ref( $e ) eq 'HASH';
1199                 next unless exists $e->{'t'};
1200                 $readings{$e->{'t'}->id} = $e->{'t'};
1201         }
1202         return values %readings;
1203 }
1204
1205 =head2 as_graphml
1206
1207 Returns a GraphML representation of the collation.  The GraphML will contain
1208 two graphs. The first expresses the attributes of the readings and the witness
1209 paths that link them; the second expresses the relationships that link the
1210 readings.  This is the native transfer format for a tradition.
1211
1212 =begin testing
1213
1214 use Text::Tradition;
1215 use TryCatch;
1216
1217 my $READINGS = 311;
1218 my $PATHS = 361;
1219
1220 my $datafile = 't/data/florilegium_tei_ps.xml';
1221 my $tradition = Text::Tradition->new( 'input' => 'TEI',
1222                                       'name' => 'test0',
1223                                       'file' => $datafile,
1224                                       'linear' => 1 );
1225
1226 ok( $tradition, "Got a tradition object" );
1227 is( scalar $tradition->witnesses, 13, "Found all witnesses" );
1228 ok( $tradition->collation, "Tradition has a collation" );
1229
1230 my $c = $tradition->collation;
1231 is( scalar $c->readings, $READINGS, "Collation has all readings" );
1232 is( scalar $c->paths, $PATHS, "Collation has all paths" );
1233 is( scalar $c->relationships, 0, "Collation has all relationships" );
1234
1235 # Add a few relationships
1236 $c->add_relationship( 'w123', 'w125', { 'type' => 'collated' } );
1237 $c->add_relationship( 'w193', 'w196', { 'type' => 'collated' } );
1238 $c->add_relationship( 'w257', 'w262', { 'type' => 'transposition' } );
1239
1240 # Now write it to GraphML and parse it again.
1241
1242 my $graphml = $c->as_graphml;
1243 my $st = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml );
1244 is( scalar $st->collation->readings, $READINGS, "Reparsed collation has all readings" );
1245 is( scalar $st->collation->paths, $PATHS, "Reparsed collation has all paths" );
1246 is( scalar $st->collation->relationships, 3, "Reparsed collation has new relationships" );
1247
1248 # Now add a stemma, write to GraphML, and look at the output.
1249 SKIP: {
1250         skip "Analysis module not present", 3 unless $tradition->can( 'add_stemma' );
1251         my $stemma = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
1252         is( ref( $stemma ), 'Text::Tradition::Stemma', "Parsed dotfile into stemma" );
1253         is( $tradition->stemmata, 1, "Tradition now has the stemma" );
1254         $graphml = $c->as_graphml;
1255         like( $graphml, qr/digraph/, "Digraph declaration exists in GraphML" );
1256 }
1257
1258 =end testing
1259
1260 =cut
1261
1262 ## TODO MOVE this to Tradition.pm and modularize it better
1263 sub as_graphml {
1264     my( $self, $options ) = @_;
1265         $self->calculate_ranks unless $self->_graphcalc_done;
1266
1267         my $start = $options->{'from'}
1268                 ? $self->reading( $options->{'from'} ) : $self->start;
1269         my $end = $options->{'to'}
1270                 ? $self->reading( $options->{'to'} ) : $self->end;
1271         if( $start->has_rank && $end->has_rank && $end->rank < $start->rank ) {
1272                 throw( 'Start node must be before end node' );
1273         }
1274         # The readings need to be ranked for this to work.
1275         $start = $self->start unless $start->has_rank;
1276         $end = $self->end unless $end->has_rank;
1277         my $rankoffset = 0;
1278         unless( $start eq $self->start ) {
1279                 $rankoffset = $start->rank - 1;
1280         }
1281         my %use_readings;
1282
1283     # Some namespaces
1284     my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns';
1285     my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance';
1286     my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' .
1287         'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
1288
1289     # Create the document and root node
1290     require XML::LibXML;
1291     my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
1292     my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
1293     $graphml->setDocumentElement( $root );
1294     $root->setNamespace( $xsi_ns, 'xsi', 0 );
1295     $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
1296
1297     # List of attribute types to save on our objects and their corresponding
1298     # GraphML types
1299     my %save_types = (
1300         'Str' => 'string',
1301         'Int' => 'int',
1302         'Bool' => 'boolean',
1303         'ReadingID' => 'string',
1304         'RelationshipType' => 'string',
1305         'RelationshipScope' => 'string',
1306     );
1307
1308     # Add the data keys for the graph. Include an extra key 'version' for the
1309     # GraphML output version.
1310     my %graph_data_keys;
1311     my $gdi = 0;
1312     my %graph_attributes = ( 'version' => 'string' );
1313         # Graph attributes include those of Tradition and those of Collation.
1314         my %gattr_from;
1315         # TODO Use meta introspection method from duplicate_reading to do this
1316         # instead of naming custom keys.
1317         my $tmeta = $self->tradition->meta;
1318         my $cmeta = $self->meta;
1319         map { $gattr_from{$_->name} = 'Tradition' } $tmeta->get_all_attributes;
1320         map { $gattr_from{$_->name} = 'Collation' } $cmeta->get_all_attributes;
1321         foreach my $attr ( ( $tmeta->get_all_attributes, $cmeta->get_all_attributes ) ) {
1322                 next if $attr->name =~ /^_/;
1323                 next unless $save_types{$attr->type_constraint->name};
1324                 $graph_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1325         }
1326     # Extra custom keys for complex objects that should be saved in some form.
1327     # The subroutine should return a string, or undef/empty.
1328     if( $tmeta->has_method('stemmata') ) {
1329                 $graph_attributes{'stemmata'} = sub {
1330                         my @stemstrs;
1331                         map { push( @stemstrs, $_->editable( {linesep => ''} ) ) }
1332                                 $self->tradition->stemmata;
1333                         join( "\n", @stemstrs );
1334                 };
1335         }
1336
1337         if( $tmeta->has_method('user') ) {
1338                 $graph_attributes{'user'} = sub {
1339                         $self->tradition->user ? $self->tradition->user->id : undef
1340                 };
1341         }
1342
1343     foreach my $datum ( sort keys %graph_attributes ) {
1344         $graph_data_keys{$datum} = 'dg'.$gdi++;
1345         my $key = $root->addNewChild( $graphml_ns, 'key' );
1346         my $dtype = ref( $graph_attributes{$datum} ) ? 'string'
1347                 : $graph_attributes{$datum};
1348         $key->setAttribute( 'attr.name', $datum );
1349         $key->setAttribute( 'attr.type', $dtype );
1350         $key->setAttribute( 'for', 'graph' );
1351         $key->setAttribute( 'id', $graph_data_keys{$datum} );
1352     }
1353
1354     # Add the data keys for reading nodes
1355     my %reading_attributes;
1356     my $rmeta = Text::Tradition::Collation::Reading->meta;
1357     foreach my $attr( $rmeta->get_all_attributes ) {
1358                 next if $attr->name =~ /^_/;
1359                 next unless $save_types{$attr->type_constraint->name};
1360                 $reading_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1361         }
1362         if( $self->start->does('Text::Tradition::Morphology' ) ) {
1363                 # Extra custom key for the reading morphology
1364                 $reading_attributes{'lexemes'} = 'string';
1365         }
1366
1367     my %node_data_keys;
1368     my $ndi = 0;
1369     foreach my $datum ( sort keys %reading_attributes ) {
1370         $node_data_keys{$datum} = 'dn'.$ndi++;
1371         my $key = $root->addNewChild( $graphml_ns, 'key' );
1372         $key->setAttribute( 'attr.name', $datum );
1373         $key->setAttribute( 'attr.type', $reading_attributes{$datum} );
1374         $key->setAttribute( 'for', 'node' );
1375         $key->setAttribute( 'id', $node_data_keys{$datum} );
1376     }
1377
1378     # Add the data keys for edges, that is, paths and relationships. Path
1379     # data does not come from a Moose class so is here manually.
1380     my $edi = 0;
1381     my %edge_data_keys;
1382     my %edge_attributes = (
1383         witness => 'string',                    # ID/label for a path
1384         extra => 'boolean',                             # Path key
1385         );
1386     my @path_attributes = keys %edge_attributes; # track our manual additions
1387     my $pmeta = Text::Tradition::Collation::Relationship->meta;
1388     foreach my $attr( $pmeta->get_all_attributes ) {
1389                 next if $attr->name =~ /^_/;
1390                 next unless $save_types{$attr->type_constraint->name};
1391                 $edge_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1392         }
1393     foreach my $datum ( sort keys %edge_attributes ) {
1394         $edge_data_keys{$datum} = 'de'.$edi++;
1395         my $key = $root->addNewChild( $graphml_ns, 'key' );
1396         $key->setAttribute( 'attr.name', $datum );
1397         $key->setAttribute( 'attr.type', $edge_attributes{$datum} );
1398         $key->setAttribute( 'for', 'edge' );
1399         $key->setAttribute( 'id', $edge_data_keys{$datum} );
1400     }
1401
1402     # Add the collation graph itself. First, sanitize the name to a valid XML ID.
1403     my $xmlidname = $self->tradition->name;
1404     $xmlidname =~ s/(?!$xml10_namechar_rx)./_/g;
1405     if( $xmlidname !~ /^$xml10_namestartchar_rx/ ) {
1406         $xmlidname = '_'.$xmlidname;
1407     }
1408     my $sgraph = $root->addNewChild( $graphml_ns, 'graph' );
1409     $sgraph->setAttribute( 'edgedefault', 'directed' );
1410     $sgraph->setAttribute( 'id', $xmlidname );
1411     $sgraph->setAttribute( 'parse.edgeids', 'canonical' );
1412     $sgraph->setAttribute( 'parse.edges', 0 ); # fill in later
1413     $sgraph->setAttribute( 'parse.nodeids', 'canonical' );
1414     $sgraph->setAttribute( 'parse.nodes', 0 ); # fill in later
1415     $sgraph->setAttribute( 'parse.order', 'nodesfirst' );
1416
1417     # Tradition/collation attribute data
1418     foreach my $datum ( keys %graph_attributes ) {
1419         my $value;
1420         if( $datum eq 'version' ) {
1421                 $value = '3.2';
1422         } elsif( ref( $graph_attributes{$datum} ) ) {
1423                 my $sub = $graph_attributes{$datum};
1424                 $value = &$sub();
1425         } elsif( $gattr_from{$datum} eq 'Tradition' ) {
1426                 $value = $self->tradition->$datum;
1427         } else {
1428                 $value = $self->$datum;
1429         }
1430                 _add_graphml_data( $sgraph, $graph_data_keys{$datum}, $value );
1431         }
1432
1433     my $node_ctr = 0;
1434     my %node_hash;
1435     # Add our readings to the graph
1436     foreach my $n ( sort { $a->id cmp $b->id } $self->readings ) {
1437         next if $n->has_rank && $n ne $self->start && $n ne $self->end &&
1438                 ( $n->rank < $start->rank || $n->rank > $end->rank );
1439         $use_readings{$n->id} = 1;
1440         # Add to the main graph
1441         my $node_el = $sgraph->addNewChild( $graphml_ns, 'node' );
1442         my $node_xmlid = 'n' . $node_ctr++;
1443         $node_hash{ $n->id } = $node_xmlid;
1444         $node_el->setAttribute( 'id', $node_xmlid );
1445         foreach my $d ( keys %reading_attributes ) {
1446                 my $nval = $n->$d;
1447                 # Custom serialization
1448                 if( $d eq 'lexemes' ) {
1449                                 # If nval is a true value, we have lexemes so we need to
1450                                 # serialize them. Otherwise set nval to undef so that the
1451                                 # key is excluded from this reading.
1452                         $nval = $nval ? $n->_serialize_lexemes : undef;
1453                 } elsif( $d eq 'normal_form' && $n->normal_form eq $n->text ) {
1454                         $nval = undef;
1455                 }
1456                 if( $rankoffset && $d eq 'rank' && $n ne $self->start ) {
1457                         # Adjust the ranks within the subgraph.
1458                         $nval = $n eq $self->end ? $end->rank - $rankoffset + 1
1459                                 : $nval - $rankoffset;
1460                 }
1461                 _add_graphml_data( $node_el, $node_data_keys{$d}, $nval )
1462                         if defined $nval;
1463         }
1464     }
1465
1466     # Add the path edges to the sequence graph
1467     my $edge_ctr = 0;
1468     foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->sequence->edges() ) {
1469         # We add an edge in the graphml for every witness in $e.
1470         next unless( $use_readings{$e->[0]} || $use_readings{$e->[1]} );
1471         my @edge_wits = sort $self->path_witnesses( $e );
1472         $e->[0] = $self->start->id unless $use_readings{$e->[0]};
1473         $e->[1] = $self->end->id unless $use_readings{$e->[1]};
1474         # Skip any path from start to end; that witness is not in the subgraph.
1475         next if ( $e->[0] eq $self->start->id && $e->[1] eq $self->end->id );
1476         foreach my $wit ( @edge_wits ) {
1477                         my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
1478                                                                                 $node_hash{ $e->[0] },
1479                                                                                 $node_hash{ $e->[1] } );
1480                         my $edge_el = $sgraph->addNewChild( $graphml_ns, 'edge' );
1481                         $edge_el->setAttribute( 'source', $from );
1482                         $edge_el->setAttribute( 'target', $to );
1483                         $edge_el->setAttribute( 'id', $id );
1484
1485                         # It's a witness path, so add the witness
1486                         my $base = $wit;
1487                         my $key = $edge_data_keys{'witness'};
1488                         # Is this an ante-corr witness?
1489                         my $aclabel = $self->ac_label;
1490                         if( $wit =~ /^(.*)\Q$aclabel\E$/ ) {
1491                                 # Keep the base witness
1492                                 $base = $1;
1493                                 # ...and record that this is an 'extra' reading path
1494                                 _add_graphml_data( $edge_el, $edge_data_keys{'extra'}, $aclabel );
1495                         }
1496                         _add_graphml_data( $edge_el, $edge_data_keys{'witness'}, $base );
1497                 }
1498         }
1499
1500         # Report the actual number of nodes and edges that went in
1501         $sgraph->setAttribute( 'parse.edges', $edge_ctr );
1502         $sgraph->setAttribute( 'parse.nodes', $node_ctr );
1503
1504         # Add the relationship graph to the XML
1505         map { delete $edge_data_keys{$_} } @path_attributes;
1506         $self->relations->_as_graphml( $graphml_ns, $root, \%node_hash,
1507                 $node_data_keys{'id'}, \%edge_data_keys );
1508
1509     # Save and return the thing
1510     my $result = decode_utf8( $graphml->toString(1) );
1511     return $result;
1512 }
1513
1514 sub _add_graphml_data {
1515     my( $el, $key, $value ) = @_;
1516     return unless defined $value;
1517     my $data_el = $el->addNewChild( $el->namespaceURI, 'data' );
1518     $data_el->setAttribute( 'key', $key );
1519     $data_el->appendText( $value );
1520 }
1521
1522 =head2 as_csv
1523
1524 Returns a CSV alignment table representation of the collation graph, one
1525 row per witness (or witness uncorrected.)
1526
1527 =head2 as_tsv
1528
1529 Returns a tab-separated alignment table representation of the collation graph,
1530 one row per witness (or witness uncorrected.)
1531
1532 =begin testing
1533
1534 use Text::Tradition;
1535 use Text::CSV;
1536
1537 my $READINGS = 311;
1538 my $PATHS = 361;
1539 my $WITS = 13;
1540 my $WITAC = 4;
1541
1542 my $datafile = 't/data/florilegium_tei_ps.xml';
1543 my $tradition = Text::Tradition->new( 'input' => 'TEI',
1544                                       'name' => 'test0',
1545                                       'file' => $datafile,
1546                                       'linear' => 1 );
1547
1548 my $c = $tradition->collation;
1549 # Export the thing to CSV
1550 my $csvstr = $c->as_csv();
1551 # Count the columns
1552 my $csv = Text::CSV->new({ sep_char => ',', binary => 1 });
1553 my @lines = split(/\n/, $csvstr );
1554 ok( $csv->parse( $lines[0] ), "Successfully parsed first line of CSV" );
1555 is( scalar( $csv->fields ), $WITS + $WITAC, "CSV has correct number of witness columns" );
1556 my @q_ac = grep { $_ eq 'Q'.$c->ac_label } $csv->fields;
1557 ok( @q_ac, "Found a layered witness" );
1558
1559 my $t2 = Text::Tradition->new( input => 'Tabular',
1560                                                            name => 'test2',
1561                                                            string => $csvstr,
1562                                                            sep_char => ',' );
1563 is( scalar $t2->collation->readings, $READINGS, "Reparsed CSV collation has all readings" );
1564 is( scalar $t2->collation->paths, $PATHS, "Reparsed CSV collation has all paths" );
1565
1566 # Now do it with TSV
1567 my $tsvstr = $c->as_tsv();
1568 my $t3 = Text::Tradition->new( input => 'Tabular',
1569                                                            name => 'test3',
1570                                                            string => $tsvstr,
1571                                                            sep_char => "\t" );
1572 is( scalar $t3->collation->readings, $READINGS, "Reparsed TSV collation has all readings" );
1573 is( scalar $t3->collation->paths, $PATHS, "Reparsed TSV collation has all paths" );
1574
1575 my $table = $c->alignment_table;
1576 my $noaccsv = $c->as_csv({ noac => 1 });
1577 my @noaclines = split(/\n/, $noaccsv );
1578 ok( $csv->parse( $noaclines[0] ), "Successfully parsed first line of no-ac CSV" );
1579 is( scalar( $csv->fields ), $WITS, "CSV has correct number of witness columns" );
1580 is( $c->alignment_table, $table, "Request for CSV did not alter the alignment table" );
1581
1582 my $safecsv = $c->as_csv({ safe_ac => 1});
1583 my @safelines = split(/\n/, $safecsv );
1584 ok( $csv->parse( $safelines[0] ), "Successfully parsed first line of safe CSV" );
1585 is( scalar( $csv->fields ), $WITS + $WITAC, "CSV has correct number of witness columns" );
1586 @q_ac = grep { $_ eq 'Q__L' } $csv->fields;
1587 ok( @q_ac, "Found a sanitized layered witness" );
1588 is( $c->alignment_table, $table, "Request for CSV did not alter the alignment table" );
1589
1590 =end testing
1591
1592 =cut
1593
1594 sub _tabular {
1595     my( $self, $opts ) = @_;
1596     my $table = $self->alignment_table( $opts );
1597         my $csv_options = { binary => 1, quote_null => 0 };
1598         $csv_options->{'sep_char'} = $opts->{fieldsep};
1599         if( $opts->{fieldsep} eq "\t" ) {
1600                 # If it is really tab separated, nothing is an escape char.
1601                 $csv_options->{'quote_char'} = undef;
1602                 $csv_options->{'escape_char'} = '';
1603         }
1604     my $csv = Text::CSV->new( $csv_options );
1605     my @result;
1606     # Make the header row
1607     $csv->combine( map { $_->{'witness'} } @{$table->{'alignment'}} );
1608         push( @result, $csv->string );
1609     # Make the rest of the rows
1610     foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
1611         my @rowobjs = map { $_->{'tokens'}->[$idx] } @{$table->{'alignment'}};
1612         my @row = map { $_ ? $_->{'t'}->text : $_ } @rowobjs;
1613         $csv->combine( @row );
1614         push( @result, $csv->string );
1615     }
1616     return join( "\n", @result );
1617 }
1618
1619 sub as_csv {
1620         my $self = shift;
1621         my $opts = shift || {};
1622         $opts->{fieldsep} = ',';
1623         return $self->_tabular( $opts );
1624 }
1625
1626 sub as_tsv {
1627         my $self = shift;
1628         my $opts = shift || {};
1629         $opts->{fieldsep} = "\t";
1630         return $self->_tabular( $opts );
1631 }
1632
1633 =head2 alignment_table
1634
1635 Return a reference to an alignment table, in a slightly enhanced CollateX
1636 format which looks like this:
1637
1638  $table = { alignment => [ { witness => "SIGIL",
1639                              tokens => [ { t => "TEXT" }, ... ] },
1640                            { witness => "SIG2",
1641                              tokens => [ { t => "TEXT" }, ... ] },
1642                            ... ],
1643             length => TEXTLEN };
1644
1645 =cut
1646
1647 sub alignment_table {
1648     my( $self, $opts ) = @_;
1649     if( $self->has_cached_table ) {
1650                 return $self->cached_table
1651                         unless $opts->{noac} || $opts->{safe_ac};
1652     }
1653
1654     # Make sure we can do this
1655         throw( "Need a linear graph in order to make an alignment table" )
1656                 unless $self->linear;
1657     $self->calculate_ranks()
1658         unless $self->_graphcalc_done && $self->end->has_rank;
1659
1660     my $table = { 'alignment' => [], 'length' => $self->end->rank - 1 };
1661     my @all_pos = ( 1 .. $self->end->rank - 1 );
1662     foreach my $wit ( sort { $a->sigil cmp $b->sigil } $self->tradition->witnesses ) {
1663         # say STDERR "Making witness row(s) for " . $wit->sigil;
1664         my @wit_path = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
1665         my @row = _make_witness_row( \@wit_path, \@all_pos );
1666         my $witobj = { 'witness' => $wit->sigil, 'tokens' => \@row };
1667         $witobj->{'identifier'} = $wit->identifier if $wit->identifier;
1668         push( @{$table->{'alignment'}}, $witobj );
1669         if( $wit->is_layered && !$opts->{noac} ) {
1670                 my @wit_ac_path = $self->reading_sequence( $self->start, $self->end,
1671                         $wit->sigil.$self->ac_label );
1672             my @ac_row = _make_witness_row( \@wit_ac_path, \@all_pos );
1673             my $witlabel = $opts->{safe_ac}
1674                 ? $wit->sigil . '__L' : $wit->sigil.$self->ac_label;
1675             my $witacobj = { 'witness' => $witlabel,
1676                 'tokens' => \@ac_row };
1677             $witacobj->{'identifier'} = $wit->identifier if $wit->identifier;
1678                         push( @{$table->{'alignment'}}, $witacobj );
1679         }
1680     }
1681     unless( $opts->{noac} || $opts->{safe_ac} ) {
1682             $self->cached_table( $table );
1683         }
1684     return $table;
1685 }
1686
1687 sub _make_witness_row {
1688     my( $path, $positions ) = @_;
1689     my %char_hash;
1690     map { $char_hash{$_} = undef } @$positions;
1691     my $debug = 0;
1692     foreach my $rdg ( @$path ) {
1693         say STDERR "rank " . $rdg->rank if $debug;
1694         # say STDERR "No rank for " . $rdg->id unless defined $rdg->rank;
1695         $char_hash{$rdg->rank} = { 't' => $rdg };
1696     }
1697     my @row = map { $char_hash{$_} } @$positions;
1698     # Fill in lacuna markers for undef spots in the row
1699     my $last_el = shift @row;
1700     my @filled_row = ( $last_el );
1701     foreach my $el ( @row ) {
1702         # If we are using node reference, make the lacuna node appear many times
1703         # in the table.  If not, use the lacuna tag.
1704         if( $last_el && $last_el->{'t'}->is_lacuna && !defined $el ) {
1705             $el = $last_el;
1706         }
1707         push( @filled_row, $el );
1708         $last_el = $el;
1709     }
1710     return @filled_row;
1711 }
1712
1713
1714 =head1 NAVIGATION METHODS
1715
1716 =head2 reading_sequence( $first, $last, $sigil, $backup )
1717
1718 Returns the ordered list of readings, starting with $first and ending
1719 with $last, for the witness given in $sigil. If a $backup sigil is
1720 specified (e.g. when walking a layered witness), it will be used wherever
1721 no $sigil path exists.  If there is a base text reading, that will be
1722 used wherever no path exists for $sigil or $backup.
1723
1724 =cut
1725
1726 # TODO Think about returning some lazy-eval iterator.
1727 # TODO Get rid of backup; we should know from what witness is whether we need it.
1728
1729 sub reading_sequence {
1730     my( $self, $start, $end, $witness ) = @_;
1731
1732     $witness = $self->baselabel unless $witness;
1733     my @readings = ( $start );
1734     my %seen;
1735     my $n = $start;
1736     while( $n && $n->id ne $end->id ) {
1737         if( exists( $seen{$n->id} ) ) {
1738             throw( "Detected loop for $witness at " . $n->id );
1739         }
1740         $seen{$n->id} = 1;
1741
1742         my $next = $self->next_reading( $n, $witness );
1743         unless( $next ) {
1744             throw( "Did not find any path for $witness from reading " . $n->id );
1745         }
1746         push( @readings, $next );
1747         $n = $next;
1748     }
1749     # Check that the last reading is our end reading.
1750     my $last = $readings[$#readings];
1751     throw( "Last reading found from " . $start->text .
1752         " for witness $witness is not the end!" ) # TODO do we get this far?
1753         unless $last->id eq $end->id;
1754
1755     return @readings;
1756 }
1757
1758 =head2 next_reading( $reading, $sigil );
1759
1760 Returns the reading that follows the given reading along the given witness
1761 path.
1762
1763 =cut
1764
1765 sub next_reading {
1766     # Return the successor via the corresponding path.
1767     my $self = shift;
1768     my $answer = $self->_find_linked_reading( 'next', @_ );
1769         return undef unless $answer;
1770     return $self->reading( $answer );
1771 }
1772
1773 =head2 prior_reading( $reading, $sigil )
1774
1775 Returns the reading that precedes the given reading along the given witness
1776 path.
1777
1778 =cut
1779
1780 sub prior_reading {
1781     # Return the predecessor via the corresponding path.
1782     my $self = shift;
1783     my $answer = $self->_find_linked_reading( 'prior', @_ );
1784     return $self->reading( $answer );
1785 }
1786
1787 sub _find_linked_reading {
1788     my( $self, $direction, $node, $path ) = @_;
1789
1790     # Get a backup if we are dealing with a layered witness
1791     my $alt_path;
1792     my $aclabel = $self->ac_label;
1793     if( $path && $path =~ /^(.*)\Q$aclabel\E$/ ) {
1794         $alt_path = $1;
1795     }
1796
1797     my @linked_paths = $direction eq 'next'
1798         ? $self->sequence->edges_from( $node )
1799         : $self->sequence->edges_to( $node );
1800     return undef unless scalar( @linked_paths );
1801
1802     # We have to find the linked path that contains all of the
1803     # witnesses supplied in $path.
1804     my( @path_wits, @alt_path_wits );
1805     @path_wits = sort( $self->_witnesses_of_label( $path ) ) if $path;
1806     @alt_path_wits = sort( $self->_witnesses_of_label( $alt_path ) ) if $alt_path;
1807     my $base_le;
1808     my $alt_le;
1809     foreach my $le ( @linked_paths ) {
1810         if( $self->sequence->has_edge_attribute( @$le, $self->baselabel ) ) {
1811             $base_le = $le;
1812         }
1813                 my @le_wits = sort $self->path_witnesses( $le );
1814                 if( _is_within( \@path_wits, \@le_wits ) ) {
1815                         # This is the right path.
1816                         return $direction eq 'next' ? $le->[1] : $le->[0];
1817                 } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) {
1818                         $alt_le = $le;
1819                 }
1820     }
1821     # Got this far? Return the alternate path if it exists.
1822     return $direction eq 'next' ? $alt_le->[1] : $alt_le->[0]
1823         if $alt_le;
1824
1825     # Got this far? Return the base path if it exists.
1826     return $direction eq 'next' ? $base_le->[1] : $base_le->[0]
1827         if $base_le;
1828
1829     # Got this far? We have no appropriate path.
1830     warn "Could not find $direction node from " . $node->id
1831         . " along path $path";
1832     return undef;
1833 }
1834
1835 # Some set logic.
1836 sub _is_within {
1837     my( $set1, $set2 ) = @_;
1838     my $ret = @$set1; # will be 0, i.e. false, if set1 is empty
1839     foreach my $el ( @$set1 ) {
1840         $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
1841     }
1842     return $ret;
1843 }
1844
1845 # Return the string that joins together a list of witnesses for
1846 # display on a single path.
1847 sub _witnesses_of_label {
1848     my( $self, $label ) = @_;
1849     my $regex = $self->wit_list_separator;
1850     my @answer = split( /\Q$regex\E/, $label );
1851     return @answer;
1852 }
1853
1854 =head2 common_readings
1855
1856 Returns the list of common readings in the graph (i.e. those readings that are
1857 shared by all non-lacunose witnesses.)
1858
1859 =cut
1860
1861 sub common_readings {
1862         my $self = shift;
1863         my @common = grep { $_->is_common } $self->readings;
1864         return @common;
1865 }
1866
1867 =head2 path_text( $sigil, [, $start, $end ] )
1868
1869 Returns the text of a witness (plus its backup, if we are using a layer)
1870 as stored in the collation.  The text is returned as a string, where the
1871 individual readings are joined with spaces and the meta-readings (e.g.
1872 lacunae) are omitted.  Optional specification of $start and $end allows
1873 the generation of a subset of the witness text.
1874
1875 =cut
1876
1877 sub path_text {
1878         my( $self, $wit, $start, $end ) = @_;
1879         $start = $self->start unless $start;
1880         $end = $self->end unless $end;
1881         my @path = grep { !$_->is_meta } $self->reading_sequence( $start, $end, $wit );
1882         my $pathtext = '';
1883         my $last;
1884         foreach my $r ( @path ) {
1885                 unless ( $r->join_prior || !$last || $last->join_next ) {
1886                         $pathtext .= ' ';
1887                 }
1888                 $pathtext .= $r->text;
1889                 $last = $r;
1890         }
1891         return $pathtext;
1892 }
1893
1894 =head1 INITIALIZATION METHODS
1895
1896 These are mostly for use by parsers.
1897
1898 =head2 make_witness_path( $witness )
1899
1900 Link the array of readings contained in $witness->path (and in
1901 $witness->uncorrected_path if it exists) into collation paths.
1902 Clear out the arrays when finished.
1903
1904 =head2 make_witness_paths
1905
1906 Call make_witness_path for all witnesses in the tradition.
1907
1908 =cut
1909
1910 # For use when a collation is constructed from a base text and an apparatus.
1911 # We have the sequences of readings and just need to add path edges.
1912 # When we are done, clear out the witness path attributes, as they are no
1913 # longer needed.
1914 # TODO Find a way to replace the witness path attributes with encapsulated functions?
1915
1916 sub make_witness_paths {
1917     my( $self ) = @_;
1918     foreach my $wit ( $self->tradition->witnesses ) {
1919         # say STDERR "Making path for " . $wit->sigil;
1920         $self->make_witness_path( $wit );
1921     }
1922 }
1923
1924 sub make_witness_path {
1925     my( $self, $wit ) = @_;
1926     my @chain = @{$wit->path};
1927     my $sig = $wit->sigil;
1928     # Add start and end if necessary
1929     unshift( @chain, $self->start ) unless $chain[0] eq $self->start;
1930     push( @chain, $self->end ) unless $chain[-1] eq $self->end;
1931     foreach my $idx ( 0 .. $#chain-1 ) {
1932         $self->add_path( $chain[$idx], $chain[$idx+1], $sig );
1933     }
1934     if( $wit->is_layered ) {
1935         @chain = @{$wit->uncorrected_path};
1936                 unshift( @chain, $self->start ) unless $chain[0] eq $self->start;
1937                 push( @chain, $self->end ) unless $chain[-1] eq $self->end;
1938         foreach my $idx( 0 .. $#chain-1 ) {
1939             my $source = $chain[$idx];
1940             my $target = $chain[$idx+1];
1941             $self->add_path( $source, $target, $sig.$self->ac_label )
1942                 unless $self->has_path( $source, $target, $sig );
1943         }
1944     }
1945     $wit->clear_path;
1946     $wit->clear_uncorrected_path;
1947 }
1948
1949 =head2 calculate_ranks
1950
1951 Calculate the reading ranks (that is, their aligned positions relative
1952 to each other) for the graph.  This can only be called on linear collations.
1953
1954 =begin testing
1955
1956 use Text::Tradition;
1957
1958 my $cxfile = 't/data/Collatex-16.xml';
1959 my $t = Text::Tradition->new(
1960     'name'  => 'inline',
1961     'input' => 'CollateX',
1962     'file'  => $cxfile,
1963     );
1964 my $c = $t->collation;
1965
1966 # Make an svg
1967 my $table = $c->alignment_table;
1968 ok( $c->has_cached_table, "Alignment table was cached" );
1969 is( $c->alignment_table, $table, "Cached table returned upon second call" );
1970 $c->calculate_ranks;
1971 is( $c->alignment_table, $table, "Cached table retained with no rank change" );
1972 $c->add_relationship( 'n13', 'n23', { type => 'repetition' } );
1973 is( $c->alignment_table, $table, "Alignment table unchanged after non-colo relationship add" );
1974 $c->add_relationship( 'n24', 'n23', { type => 'spelling' } );
1975 isnt( $c->alignment_table, $table, "Alignment table changed after colo relationship add" );
1976
1977 =end testing
1978
1979 =cut
1980
1981 sub calculate_ranks {
1982     my $self = shift;
1983     # Save the existing ranks, in case we need to invalidate the cached SVG.
1984     throw( "Cannot calculate ranks on a non-linear graph" )
1985         unless $self->linear;
1986     my %existing_ranks;
1987     map { $existing_ranks{$_} = $_->rank } $self->readings;
1988
1989     # Do the rankings based on the relationship equivalence graph, starting
1990     # with the start node.
1991     my ( $node_ranks, $rank_nodes ) = $self->relations->equivalence_ranks();
1992
1993     # Transfer our rankings from the topological graph to the real one.
1994     foreach my $r ( $self->readings ) {
1995         if( defined $node_ranks->{$self->equivalence( $r->id )} ) {
1996             $r->rank( $node_ranks->{$self->equivalence( $r->id )} );
1997         } else {
1998                 # Die. Find the last rank we calculated.
1999                 my @all_defined = sort { ( $node_ranks->{$self->equivalence( $a->id )}||-1 )
2000                                  <=> ( $node_ranks->{$self->equivalence( $b->id )}||-1 ) }
2001                         $self->readings;
2002                 my $last = pop @all_defined;
2003             throw( "Ranks not calculated after $last - do you have a cycle in the graph?" );
2004         }
2005     }
2006     # Do we need to invalidate the cached data?
2007     if( $self->has_cached_table ) {
2008         foreach my $r ( $self->readings ) {
2009                 next if defined( $existing_ranks{$r} )
2010                         && $existing_ranks{$r} == $r->rank;
2011                 # Something has changed, so clear the cache
2012                 $self->_clear_cache;
2013                         # ...and recalculate the common readings.
2014                         $self->calculate_common_readings();
2015                 last;
2016         }
2017     }
2018         # The graph calculation information is now up to date.
2019         $self->_graphcalc_done(1);
2020 }
2021
2022 sub _clear_cache {
2023         my $self = shift;
2024         $self->wipe_table if $self->has_cached_table;
2025 }
2026
2027
2028 =head2 flatten_ranks
2029
2030 A convenience method for parsing collation data.  Searches the graph for readings
2031 with the same text at the same rank, and merges any that are found.
2032
2033 =cut
2034
2035 sub flatten_ranks {
2036     my ( $self, %args ) = shift;
2037     my %unique_rank_rdg;
2038     my $changed;
2039     foreach my $p ( $self->identical_readings( %args ) ) {
2040                 # say STDERR "Combining readings at same rank: @$p";
2041                 $changed = 1;
2042                 $self->merge_readings( @$p );
2043                 # TODO see if this now makes a common point.
2044     }
2045     # If we merged readings, the ranks are still fine but the alignment
2046     # table is wrong. Wipe it.
2047     $self->wipe_table() if $changed;
2048 }
2049
2050 =head2 identical_readings
2051 =head2 identical_readings( start => $startnode, end => $endnode )
2052 =head2 identical_readings( startrank => $startrank, endrank => $endrank )
2053
2054 Goes through the graph identifying all pairs of readings that appear to be
2055 identical, and therefore able to be merged into a single reading. Returns the
2056 relevant identical pairs. Can be restricted to run over only a part of the
2057 graph, specified either by node or by rank.
2058
2059 =cut
2060
2061 sub identical_readings {
2062         my ( $self, %args ) = @_;
2063     # Find where we should start and end.
2064     my $startrank = $args{startrank} || 0;
2065     if( $args{start} ) {
2066         throw( "Starting reading has no rank" ) unless $self->reading( $args{start} )
2067                 && $self->reading( $args{start} )->has_rank;
2068         $startrank = $self->reading( $args{start} )->rank;
2069     }
2070     my $endrank = $args{endrank} || $self->end->rank;
2071     if( $args{end} ) {
2072         throw( "Ending reading has no rank" ) unless $self->reading( $args{end} )
2073                 && $self->reading( $args{end} )->has_rank;
2074         $endrank = $self->reading( $args{end} )->rank;
2075     }
2076
2077     # Make sure the ranks are correct.
2078     unless( $self->_graphcalc_done ) {
2079         $self->calculate_ranks;
2080     }
2081     # Go through the readings looking for duplicates.
2082     my %unique_rank_rdg;
2083     my @pairs;
2084     foreach my $rdg ( $self->readings ) {
2085         next unless $rdg->has_rank;
2086         my $rk = $rdg->rank;
2087         next if $rk > $endrank || $rk < $startrank;
2088         my $key = $rk . "||" . $rdg->text;
2089         if( exists $unique_rank_rdg{$key} ) {
2090                 # Make sure they don't have different grammatical forms
2091                         my $ur = $unique_rank_rdg{$key};
2092                 if( $rdg->is_identical( $ur ) ) {
2093                                 push( @pairs, [ $ur, $rdg ] );
2094                         }
2095         } else {
2096             $unique_rank_rdg{$key} = $rdg;
2097         }
2098     }
2099
2100     return @pairs;
2101 }
2102
2103
2104 =head2 calculate_common_readings
2105
2106 Goes through the graph identifying the readings that appear in every witness
2107 (apart from those with lacunae at that spot.) Marks them as common and returns
2108 the list.
2109
2110 =begin testing
2111
2112 use Text::Tradition;
2113
2114 my $cxfile = 't/data/Collatex-16.xml';
2115 my $t = Text::Tradition->new(
2116     'name'  => 'inline',
2117     'input' => 'CollateX',
2118     'file'  => $cxfile,
2119     );
2120 my $c = $t->collation;
2121
2122 my @common = $c->calculate_common_readings();
2123 is( scalar @common, 8, "Found correct number of common readings" );
2124 my @marked = sort $c->common_readings();
2125 is( scalar @common, 8, "All common readings got marked as such" );
2126 my @expected = qw/ n1 n11 n16 n19 n20 n5 n6 n7 /;
2127 is_deeply( \@marked, \@expected, "Found correct list of common readings" );
2128
2129 =end testing
2130
2131 =cut
2132
2133 sub calculate_common_readings {
2134         my $self = shift;
2135         my @common;
2136         map { $_->is_common( 0 ) } $self->readings;
2137         # Implicitly calls calculate_ranks
2138         my $table = $self->alignment_table;
2139         foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
2140                 my @row = map { $_->{'tokens'}->[$idx]
2141                                                         ? $_->{'tokens'}->[$idx]->{'t'} : '' }
2142                                         @{$table->{'alignment'}};
2143                 my %hash;
2144                 foreach my $r ( @row ) {
2145                         if( $r ) {
2146                                 $hash{$r->id} = $r unless $r->is_meta;
2147                         } else {
2148                                 $hash{'UNDEF'} = $r;
2149                         }
2150                 }
2151                 if( keys %hash == 1 && !exists $hash{'UNDEF'} ) {
2152                         my( $r ) = values %hash;
2153                         $r->is_common( 1 );
2154                         push( @common, $r );
2155                 }
2156         }
2157         return @common;
2158 }
2159
2160 =head2 text_from_paths
2161
2162 Calculate the text array for all witnesses from the path, for later consistency
2163 checking.  Only to be used if there is no non-graph-based way to know the
2164 original texts.
2165
2166 =cut
2167
2168 sub text_from_paths {
2169         my $self = shift;
2170     foreach my $wit ( $self->tradition->witnesses ) {
2171         my @readings = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
2172         my @text;
2173         foreach my $r ( @readings ) {
2174                 next if $r->is_meta;
2175                 push( @text, $r->text );
2176         }
2177         $wit->text( \@text );
2178         if( $wit->is_layered ) {
2179                         my @ucrdgs = $self->reading_sequence( $self->start, $self->end,
2180                                                                                                   $wit->sigil.$self->ac_label );
2181                         my @uctext;
2182                         foreach my $r ( @ucrdgs ) {
2183                                 next if $r->is_meta;
2184                                 push( @uctext, $r->text );
2185                         }
2186                         $wit->layertext( \@uctext );
2187         }
2188     }
2189 }
2190
2191 =head1 UTILITY FUNCTIONS
2192
2193 =head2 common_predecessor( $reading_a, $reading_b )
2194
2195 Find the last reading that occurs in sequence before both the given readings.
2196 At the very least this should be $self->start.
2197
2198 =head2 common_successor( $reading_a, $reading_b )
2199
2200 Find the first reading that occurs in sequence after both the given readings.
2201 At the very least this should be $self->end.
2202
2203 =begin testing
2204
2205 use Text::Tradition;
2206
2207 my $cxfile = 't/data/Collatex-16.xml';
2208 my $t = Text::Tradition->new(
2209     'name'  => 'inline',
2210     'input' => 'CollateX',
2211     'file'  => $cxfile,
2212     );
2213 my $c = $t->collation;
2214
2215 is( $c->common_predecessor( 'n24', 'n23' )->id,
2216     'n20', "Found correct common predecessor" );
2217 is( $c->common_successor( 'n24', 'n23' )->id,
2218     '__END__', "Found correct common successor" );
2219
2220 is( $c->common_predecessor( 'n19', 'n17' )->id,
2221     'n16', "Found correct common predecessor for readings on same path" );
2222 is( $c->common_successor( 'n21', 'n10' )->id,
2223     '__END__', "Found correct common successor for readings on same path" );
2224
2225 =end testing
2226
2227 =cut
2228
2229 ## Return the closest reading that is a predecessor of both the given readings.
2230 sub common_predecessor {
2231         my $self = shift;
2232         my( $r1, $r2 ) = $self->_objectify_args( @_ );
2233         return $self->_common_in_path( $r1, $r2, 'predecessors' );
2234 }
2235
2236 sub common_successor {
2237         my $self = shift;
2238         my( $r1, $r2 ) = $self->_objectify_args( @_ );
2239         return $self->_common_in_path( $r1, $r2, 'successors' );
2240 }
2241
2242
2243 # TODO think about how to do this without ranks...
2244 sub _common_in_path {
2245         my( $self, $r1, $r2, $dir ) = @_;
2246         my $iter = $self->end->rank;
2247         my @candidates;
2248         my @last_r1 = ( $r1 );
2249         my @last_r2 = ( $r2 );
2250         # my %all_seen = ( $r1 => 'r1', $r2 => 'r2' );
2251         my %all_seen;
2252         # say STDERR "Finding common $dir for $r1, $r2";
2253         while( !@candidates ) {
2254                 last unless $iter--;  # Avoid looping infinitely
2255                 # Iterate separately down the graph from r1 and r2
2256                 my( @new_lc1, @new_lc2 );
2257                 foreach my $lc ( @last_r1 ) {
2258                         foreach my $p ( $lc->$dir ) {
2259                                 if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r1' ) {
2260                                         # say STDERR "Path candidate $p from $lc";
2261                                         push( @candidates, $p );
2262                                 } elsif( !$all_seen{$p->id} ) {
2263                                         $all_seen{$p->id} = 'r1';
2264                                         push( @new_lc1, $p );
2265                                 }
2266                         }
2267                 }
2268                 foreach my $lc ( @last_r2 ) {
2269                         foreach my $p ( $lc->$dir ) {
2270                                 if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r2' ) {
2271                                         # say STDERR "Path candidate $p from $lc";
2272                                         push( @candidates, $p );
2273                                 } elsif( !$all_seen{$p->id} ) {
2274                                         $all_seen{$p->id} = 'r2';
2275                                         push( @new_lc2, $p );
2276                                 }
2277                         }
2278                 }
2279                 @last_r1 = @new_lc1;
2280                 @last_r2 = @new_lc2;
2281         }
2282         my @answer = sort { $a->rank <=> $b->rank } @candidates;
2283         return $dir eq 'predecessors' ? pop( @answer ) : shift ( @answer );
2284 }
2285
2286 sub throw {
2287         Text::Tradition::Error->throw(
2288                 'ident' => 'Collation error',
2289                 'message' => $_[0],
2290                 );
2291 }
2292
2293 no Moose;
2294 __PACKAGE__->meta->make_immutable;
2295
2296 =head1 BUGS/TODO
2297
2298 =over
2299
2300 =item * Rework XML serialization in a more modular way
2301
2302 =back
2303
2304 =head1 LICENSE
2305
2306 This package is free software and is provided "as is" without express
2307 or implied warranty.  You can redistribute it and/or modify it under
2308 the same terms as Perl itself.
2309
2310 =head1 AUTHOR
2311
2312 Tara L Andrews E<lt>aurum@cpan.orgE<gt>