base/lib/Text/Tradition/Collation.pm

   1 package Text::Tradition::Collation;
   2
   3 use feature 'say';
   4 use Encode qw( decode_utf8 );
   5 use File::Temp;
   6 use File::Which;
   7 use Graph;
   8 use IPC::Run qw( run binary );
   9 use Text::CSV;
  10 use Text::Tradition::Collation::Data;
  11 use Text::Tradition::Collation::Reading;
  12 use Text::Tradition::Collation::RelationshipStore;
  13 use Text::Tradition::Error;
  14 use XML::Easy::Syntax qw( $xml10_namestartchar_rx $xml10_namechar_rx );
  15 use XML::LibXML;
  16 use XML::LibXML::XPathContext;
  17 use Moose;
  18
  19 has _data => (
  20         isa      => 'Text::Tradition::Collation::Data',
  21         is       => 'ro',
  22         required => 1,
  23         handles  => [ qw(
  24                 sequence
  25                 paths
  26                 _set_relations
  27                 relations
  28                 _set_start
  29                 _set_end
  30                 ac_label
  31                 has_cached_table
  32                 relationships
  33                 related_readings
  34                 get_relationship
  35                 del_relationship
  36                 equivalence
  37                 equivalence_graph
  38                 readings
  39                 reading
  40                 _add_reading
  41                 del_reading
  42                 has_reading
  43                 wit_list_separator
  44                 baselabel
  45                 linear
  46                 wordsep
  47                 start
  48                 end
  49                 cached_table
  50                 _graphcalc_done
  51                 has_cached_svg
  52                 wipe_table
  53         )]
  54 );
  55
  56 has 'tradition' => (
  57     is => 'ro',
  58     isa => 'Text::Tradition',
  59     writer => '_set_tradition',
  60     weak_ref => 1,
  61     );
  62
  63 =head1 NAME
  64
  65 Text::Tradition::Collation - a software model for a text collation
  66
  67 =head1 SYNOPSIS
  68
  69   use Text::Tradition;
  70   my $t = Text::Tradition->new(
  71     'name' => 'this is a text',
  72     'input' => 'TEI',
  73     'file' => '/path/to/tei_parallel_seg_file.xml' );
  74
  75   my $c = $t->collation;
  76   my @readings = $c->readings;
  77   my @paths = $c->paths;
  78   my @relationships = $c->relationships;
  79
  80   my $svg_variant_graph = $t->collation->as_svg();
  81
  82 =head1 DESCRIPTION
  83
  84 Text::Tradition is a library for representation and analysis of collated
  85 texts, particularly medieval ones.  The Collation is the central feature of
  86 a Tradition, where the text, its sequence of readings, and its relationships
  87 between readings are actually kept.
  88
  89 =head1 CONSTRUCTOR
  90
  91 =head2 new
  92
  93 The constructor.  Takes a hash or hashref of the following arguments:
  94
  95 =over
  96
  97 =item * tradition - The Text::Tradition object to which the collation
  98 belongs. Required.
  99
 100 =item * linear - Whether the collation should be linear; that is, whether
 101 transposed readings should be treated as two linked readings rather than one,
 102 and therefore whether the collation graph is acyclic.  Defaults to true.
 103
 104 =item * baselabel - The default label for the path taken by a base text
 105 (if any). Defaults to 'base text'.
 106
 107 =item * wit_list_separator - The string to join a list of witnesses for
 108 purposes of making labels in display graphs.  Defaults to ', '.
 109
 110 =item * ac_label - The extra label to tack onto a witness sigil when
 111 representing another layer of path for the given witness - that is, when
 112 a text has more than one possible reading due to scribal corrections or
 113 the like.  Defaults to ' (a.c.)'.
 114
 115 =item * wordsep - The string used to separate words in the original text.
 116 Defaults to ' '.
 117
 118 =back
 119
 120 =head1 ACCESSORS
 121
 122 =head2 tradition
 123
 124 =head2 linear
 125
 126 =head2 wit_list_separator
 127
 128 =head2 baselabel
 129
 130 =head2 ac_label
 131
 132 =head2 wordsep
 133
 134 Simple accessors for collation attributes.
 135
 136 =head2 start
 137
 138 The meta-reading at the start of every witness path.
 139
 140 =head2 end
 141
 142 The meta-reading at the end of every witness path.
 143
 144 =head2 readings
 145
 146 Returns all Reading objects in the graph.
 147
 148 =head2 reading( $id )
 149
 150 Returns the Reading object corresponding to the given ID.
 151
 152 =head2 add_reading( $reading_args )
 153
 154 Adds a new reading object to the collation.
 155 See L<Text::Tradition::Collation::Reading> for the available arguments.
 156
 157 =head2 del_reading( $object_or_id )
 158
 159 Removes the given reading from the collation, implicitly removing its
 160 paths and relationships.
 161
 162 =head2 has_reading( $id )
 163
 164 Predicate to see whether a given reading ID is in the graph.
 165
 166 =head2 reading_witnesses( $object_or_id )
 167
 168 Returns a list of sigils whose witnesses contain the reading.
 169
 170 =head2 paths
 171
 172 Returns all reading paths within the document - that is, all edges in the
 173 collation graph.  Each path is an arrayref of [ $source, $target ] reading IDs.
 174
 175 =head2 add_path( $source, $target, $sigil )
 176
 177 Links the given readings in the collation in sequence, under the given witness
 178 sigil.  The readings may be specified by object or ID.
 179
 180 =head2 del_path( $source, $target, $sigil )
 181
 182 Links the given readings in the collation in sequence, under the given witness
 183 sigil.  The readings may be specified by object or ID.
 184
 185 =head2 has_path( $source, $target );
 186
 187 Returns true if the two readings are linked in sequence in any witness.
 188 The readings may be specified by object or ID.
 189
 190 =head2 relationships
 191
 192 Returns all Relationship objects in the collation.
 193
 194 =head2 add_relationship( $reading, $other_reading, $options )
 195
 196 Adds a new relationship of the type given in $options between the two readings,
 197 which may be specified by object or ID.  Returns a value of ( $status, @vectors)
 198 where $status is true on success, and @vectors is a list of relationship edges
 199 that were ultimately added.
 200 See L<Text::Tradition::Collation::Relationship> for the available options.
 201
 202 =cut
 203
 204 sub BUILDARGS {
 205         my ( $class, @args ) = @_;
 206         my %args = @args == 1 ? %{ $args[0] } : @args;
 207         # TODO determine these from the Moose::Meta object
 208         my @delegate_attrs = qw(sequence relations readings wit_list_separator baselabel
 209                 linear wordsep start end cached_table _graphcalc_done);
 210         my %data_args;
 211         for my $attr (@delegate_attrs) {
 212                 $data_args{$attr} = delete $args{$attr} if exists $args{$attr};
 213         }
 214         $args{_data} = Text::Tradition::Collation::Data->new(%data_args);
 215         return \%args;
 216 }
 217
 218 sub BUILD {
 219     my $self = shift;
 220     $self->_set_relations( Text::Tradition::Collation::RelationshipStore->new( 'collation' => $self ) );
 221     $self->_set_start( $self->add_reading(
 222         { 'collation' => $self, 'is_start' => 1, 'init' => 1 } ) );
 223     $self->_set_end( $self->add_reading(
 224         { 'collation' => $self, 'is_end' => 1, 'init' => 1 } ) );
 225 }
 226
 227 sub register_relationship_type {
 228         my $self = shift;
 229         my %args = @_ == 1 ? %{$_[0]} : @_;
 230         if( $self->relations->has_type( $args{name} ) ) {
 231                 throw( 'Relationship type ' . $args{name} . ' already registered' );
 232         }
 233         $self->relations->add_type( %args );
 234 }
 235
 236 sub get_relationship_type {
 237         my( $self, $name ) = @_;
 238                 return $self->relations->has_type( $name )
 239                         ? $self->relations->type( $name ) : undef;
 240 }
 241
 242 ### Reading construct/destruct functions
 243
 244 sub add_reading {
 245         my( $self, $reading ) = @_;
 246         unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
 247                 my %args = %$reading;
 248                 if( $args{'init'} ) {
 249                         # If we are initializing an empty collation, don't assume that we
 250                         # have set a tradition.
 251                         delete $args{'init'};
 252                 } elsif( $self->tradition->can('language') && $self->tradition->has_language
 253                         && !exists $args{'language'} ) {
 254                         $args{'language'} = $self->tradition->language;
 255                 }
 256                 $reading = Text::Tradition::Collation::Reading->new(
 257                         'collation' => $self,
 258                         %args );
 259         }
 260         # First check to see if a reading with this ID exists.
 261         if( $self->reading( $reading->id ) ) {
 262                 throw( "Collation already has a reading with id " . $reading->id );
 263         }
 264         $self->_graphcalc_done(0);
 265         $self->_add_reading( $reading->id => $reading );
 266         # Once the reading has been added, put it in both graphs.
 267         $self->sequence->add_vertex( $reading->id );
 268         $self->relations->add_reading( $reading->id );
 269         return $reading;
 270 };
 271
 272 around del_reading => sub {
 273         my $orig = shift;
 274         my $self = shift;
 275         my $arg = shift;
 276
 277         if( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
 278                 $arg = $arg->id;
 279         }
 280         # Remove the reading from the graphs.
 281         $self->_graphcalc_done(0);
 282         $self->_clear_cache; # Explicitly clear caches to GC the reading
 283         $self->sequence->delete_vertex( $arg );
 284         $self->relations->delete_reading( $arg );
 285
 286         # Carry on.
 287         $self->$orig( $arg );
 288 };
 289
 290 =head2 merge_readings( $main, $second, $concatenate, $with_str )
 291
 292 Merges the $second reading into the $main one. If $concatenate is true, then
 293 the merged node will carry the text of both readings, concatenated with either
 294 $with_str (if specified) or a sensible default (the empty string if the
 295 appropriate 'join_*' flag is set on either reading, or else $self->wordsep.)
 296
 297 The first two arguments may be either readings or reading IDs.
 298
 299 =begin testing
 300
 301 use Text::Tradition;
 302 use TryCatch;
 303
 304 my $cxfile = 't/data/Collatex-16.xml';
 305 my $t = Text::Tradition->new(
 306     'name'  => 'inline',
 307     'input' => 'CollateX',
 308     'file'  => $cxfile,
 309     );
 310 my $c = $t->collation;
 311
 312 my $rno = scalar $c->readings;
 313 # Split n21 ('unto') for testing purposes
 314 my $new_r = $c->add_reading( { 'id' => 'n21p0', 'text' => 'un', 'join_next' => 1 } );
 315 my $old_r = $c->reading( 'n21' );
 316 $old_r->alter_text( 'to' );
 317 $c->del_path( 'n20', 'n21', 'A' );
 318 $c->add_path( 'n20', 'n21p0', 'A' );
 319 $c->add_path( 'n21p0', 'n21', 'A' );
 320 $c->add_relationship( 'n21', 'n22', { type => 'collated', scope => 'local' } );
 321 $c->flatten_ranks();
 322 ok( $c->reading( 'n21p0' ), "New reading exists" );
 323 is( scalar $c->readings, $rno, "Reading add offset by flatten_ranks" );
 324
 325 # Combine n3 and n4 ( with his )
 326 $c->merge_readings( 'n3', 'n4', 1 );
 327 ok( !$c->reading('n4'), "Reading n4 is gone" );
 328 is( $c->reading('n3')->text, 'with his', "Reading n3 has both words" );
 329
 330 # Collapse n9 and n10 ( rood / root )
 331 $c->merge_readings( 'n9', 'n10' );
 332 ok( !$c->reading('n10'), "Reading n10 is gone" );
 333 is( $c->reading('n9')->text, 'rood', "Reading n9 has an unchanged word" );
 334
 335 # Try to combine n21 and n21p0. This should break.
 336 my $remaining = $c->reading('n21');
 337 $remaining ||= $c->reading('n22');  # one of these should still exist
 338 try {
 339         $c->merge_readings( 'n21p0', $remaining, 1 );
 340         ok( 0, "Bad reading merge changed the graph" );
 341 } catch( Text::Tradition::Error $e ) {
 342         like( $e->message, qr/neither concatenated nor collated/, "Expected exception from bad concatenation" );
 343 } catch {
 344         ok( 0, "Unexpected error on bad reading merge: $@" );
 345 }
 346
 347 try {
 348         $c->calculate_ranks();
 349         ok( 1, "Graph is still evidently whole" );
 350 } catch( Text::Tradition::Error $e ) {
 351         ok( 0, "Caught a rank exception: " . $e->message );
 352 }
 353
 354 =end testing
 355
 356 =cut
 357
 358 sub merge_readings {
 359         my $self = shift;
 360
 361         # Sanity check
 362         my( $kept_obj, $del_obj, $combine, $combine_char ) = $self->_objectify_args( @_ );
 363         my $mergemeta = $kept_obj->is_meta;
 364         throw( "Cannot merge meta and non-meta reading" )
 365                 unless ( $mergemeta && $del_obj->is_meta )
 366                         || ( !$mergemeta && !$del_obj->is_meta );
 367         if( $mergemeta ) {
 368                 throw( "Cannot merge with start or end node" )
 369                         if( $kept_obj eq $self->start || $kept_obj eq $self->end
 370                                 || $del_obj eq $self->start || $del_obj eq $self->end );
 371                 throw( "Cannot combine text of meta readings" ) if $combine;
 372         }
 373         # We can only merge readings in a linear graph if:
 374         # - they are contiguous with only one edge between them, OR
 375         # - they are at equivalent ranks in the graph.
 376         if( $self->linear ) {
 377                 my @delpred = $del_obj->predecessors;
 378                 my @keptsuc = $kept_obj->successors;
 379                 unless ( @delpred == 1 && $delpred[0] eq $kept_obj
 380                         && @keptsuc == 1 && $keptsuc[0] eq $del_obj ) {
 381                         my( $is_ok, $msg ) = $self->relations->relationship_valid(
 382                                 $kept_obj, $del_obj, 'collated' );
 383                         unless( $is_ok ) {
 384                                 throw( "Readings $kept_obj and $del_obj can be neither concatenated nor collated" );
 385                         }
 386                 }
 387         }
 388
 389         # We only need the IDs for adding paths to the graph, not the reading
 390         # objects themselves.
 391         my $kept = $kept_obj->id;
 392         my $deleted = $del_obj->id;
 393         $self->_graphcalc_done(0);
 394
 395     # The kept reading should inherit the paths and the relationships
 396     # of the deleted reading.
 397         foreach my $path ( $self->sequence->edges_at( $deleted ) ) {
 398                 my @vector = ( $kept );
 399                 push( @vector, $path->[1] ) if $path->[0] eq $deleted;
 400                 unshift( @vector, $path->[0] ) if $path->[1] eq $deleted;
 401                 next if $vector[0] eq $vector[1]; # Don't add a self loop
 402                 my %wits = %{$self->sequence->get_edge_attributes( @$path )};
 403                 $self->sequence->add_edge( @vector );
 404                 my $fwits = $self->sequence->get_edge_attributes( @vector );
 405                 @wits{keys %$fwits} = values %$fwits;
 406                 $self->sequence->set_edge_attributes( @vector, \%wits );
 407         }
 408         $self->relations->merge_readings( $kept, $deleted, $combine );
 409
 410         # Do the deletion deed.
 411         if( $combine ) {
 412                 # Combine the text of the readings
 413                 my $joinstr = $combine_char;
 414                 unless( defined $joinstr ) {
 415                         $joinstr = '' if $kept_obj->join_next || $del_obj->join_prior;
 416                         $joinstr = $self->wordsep unless defined $joinstr;
 417                 }
 418                 $kept_obj->_combine( $del_obj, $joinstr );
 419         }
 420         $self->del_reading( $deleted );
 421 }
 422
 423 =head2 merge_related( @relationship_types )
 424
 425 Merge all readings linked with the relationship types given. If any of the selected type(s) is not a colocation, the graph will no longer be linear. The majority/plurality reading in each case will be the one kept.
 426
 427 WARNING: This operation cannot be undone.
 428
 429 =cut
 430
 431 =begin testing
 432
 433 use Test::Warn;
 434 use Text::Tradition;
 435 use TryCatch;
 436
 437 my $t;
 438 warnings_exist {
 439         $t = Text::Tradition->new( 'input' => 'Self', 'file' => 't/data/legendfrag.xml' );
 440 } [qr/Cannot set relationship on a meta reading/],
 441         "Got expected relationship drop warning on parse";
 442
 443 my $c = $t->collation;
 444
 445 my %rdg_ids;
 446 map { $rdg_ids{$_} = 1 } $c->readings;
 447 $c->merge_related( 'orthographic' );
 448 is( scalar( $c->readings ), keys( %rdg_ids ) - 8,
 449         "Successfully collapsed orthographic variation" );
 450 map { $rdg_ids{$_} = undef } qw/ r13.3 r11.4 r8.5 r8.2 r7.7 r7.5 r7.4 r7.1 /;
 451 foreach my $rid ( keys %rdg_ids ) {
 452         my $exp = $rdg_ids{$rid};
 453         is( !$c->reading( $rid ), !$exp, "Reading $rid correctly " .
 454                 ( $exp ? "retained" : "removed" ) );
 455 }
 456 ok( $c->linear, "Graph is still linear" );
 457 try {
 458         $c->calculate_ranks; # This should succeed
 459         ok( 1, "Can still calculate ranks on the new graph" );
 460 } catch {
 461         ok( 0, "Rank calculation on merged graph failed: $@" );
 462 }
 463
 464 # Now add some transpositions
 465 $c->add_relationship( 'r8.4', 'r10.4', { type => 'transposition' } );
 466 $c->merge_related( 'transposition' );
 467 is( scalar( $c->readings ), keys( %rdg_ids ) - 9,
 468         "Transposed relationship is merged away" );
 469 ok( !$c->reading('r8.4'), "Correct transposed reading removed" );
 470 ok( !$c->linear, "Graph is no longer linear" );
 471 try {
 472         $c->calculate_ranks; # This should fail
 473         ok( 0, "Rank calculation happened on nonlinear graph?!" );
 474 } catch ( Text::Tradition::Error $e ) {
 475         is( $e->message, 'Cannot calculate ranks on a non-linear graph',
 476                 "Rank calculation on merged graph threw an error" );
 477 }
 478
 479 =end testing
 480
 481 =cut
 482
 483 # TODO: there should be a way to display merged without affecting the underlying data!
 484
 485 sub merge_related {
 486         my $self = shift;
 487         my %reltypehash;
 488         map { $reltypehash{$_} = 1 } @_;
 489
 490         # Set up the filter for finding related readings
 491         my $filter = sub {
 492                 exists $reltypehash{$_[0]->type};
 493         };
 494
 495         # Go through all readings looking for related ones
 496         foreach my $r ( $self->readings ) {
 497                 next unless $self->reading( "$r" ); # might have been deleted meanwhile
 498                 my @related = $self->related_readings( $r, $filter );
 499                 if( @related ) {
 500                         push( @related, $r );
 501                         @related = sort {
 502                                         scalar $b->witnesses <=> scalar $a->witnesses
 503                                 } @related;
 504                         my $keep = shift @related;
 505                         foreach my $delr ( @related ) {
 506                                 $self->linear( 0 )
 507                                         unless( $self->get_relationship( $keep, $delr )->colocated );
 508                                 $self->merge_readings( $keep, $delr );
 509                         }
 510                 }
 511         }
 512 }
 513
 514 =head2 compress_readings
 515
 516 Where possible in the graph, compresses plain sequences of readings into a
 517 single reading. The sequences must consist of readings with no
 518 relationships to other readings, with only a single witness path between
 519 them and no other witness paths from either that would skip the other. The
 520 readings must also not be marked as nonsense or bad grammar.
 521
 522 WARNING: This operation cannot be undone.
 523
 524 =cut
 525
 526 sub compress_readings {
 527         my $self = shift;
 528         # Anywhere in the graph that there is a reading that joins only to a single
 529         # successor, and neither of these have any relationships, just join the two
 530         # readings.
 531         foreach my $rdg ( sort { $a->rank <=> $b->rank } $self->readings ) {
 532                 # Now look for readings that can be joined to their successors.
 533                 next unless $rdg->is_combinable;
 534                 my %seen;
 535                 while( $self->sequence->successors( $rdg ) == 1 ) {
 536                         my( $next ) = $self->reading( $self->sequence->successors( $rdg ) );
 537                         throw( "Infinite loop" ) if $seen{$next->id};
 538                         $seen{$next->id} = 1;
 539                         last if $self->sequence->predecessors( $next ) > 1;
 540                         last unless $next->is_combinable;
 541                         say "Joining readings $rdg and $next";
 542                         $self->merge_readings( $rdg, $next, 1 );
 543                 }
 544         }
 545         # Make sure we haven't screwed anything up
 546         foreach my $wit ( $self->tradition->witnesses ) {
 547                 my $pathtext = $self->path_text( $wit->sigil );
 548                 my $origtext = join( ' ', @{$wit->text} );
 549                 throw( "Text differs for witness " . $wit->sigil )
 550                         unless $pathtext eq $origtext;
 551                 if( $wit->is_layered ) {
 552                         $pathtext = $self->path_text( $wit->sigil.$self->ac_label );
 553                         $origtext = join( ' ', @{$wit->layertext} );
 554                         throw( "Ante-corr text differs for witness " . $wit->sigil )
 555                                 unless $pathtext eq $origtext;
 556                 }
 557         }
 558
 559         $self->relations->rebuild_equivalence();
 560         $self->calculate_ranks();
 561 }
 562
 563 # Helper function for manipulating the graph.
 564 sub _stringify_args {
 565         my( $self, $first, $second, @args ) = @_;
 566     $first = $first->id
 567         if ref( $first ) eq 'Text::Tradition::Collation::Reading';
 568     $second = $second->id
 569         if ref( $second ) eq 'Text::Tradition::Collation::Reading';
 570     return( $first, $second, @args );
 571 }
 572
 573 # Helper function for manipulating the graph.
 574 sub _objectify_args {
 575         my( $self, $first, $second, $arg ) = @_;
 576     $first = $self->reading( $first )
 577         unless ref( $first ) eq 'Text::Tradition::Collation::Reading';
 578     $second = $self->reading( $second )
 579         unless ref( $second ) eq 'Text::Tradition::Collation::Reading';
 580     return( $first, $second, $arg );
 581 }
 582
 583 =head2 duplicate_reading( $reading, @witlist )
 584
 585 Split the given reading into two, so that the new reading is in the path for
 586 the witnesses given in @witlist. If the result is that certain non-colocated
 587 relationships (e.g. transpositions) are no longer valid, these will be removed.
 588 Returns the newly-created reading.
 589
 590 =begin testing
 591
 592 use Test::More::UTF8;
 593 use Text::Tradition;
 594 use TryCatch;
 595
 596 my $st = Text::Tradition->new( 'input' => 'Self', 'file' => 't/data/collatecorr.xml' );
 597 is( ref( $st ), 'Text::Tradition', "Got a tradition from test file" );
 598 ok( $st->has_witness('Ba96'), "Tradition has the affected witness" );
 599
 600 my $sc = $st->collation;
 601 my $numr = 17;
 602 ok( $sc->reading('n131'), "Tradition has the affected reading" );
 603 is( scalar( $sc->readings ), $numr, "There are $numr readings in the graph" );
 604 is( $sc->end->rank, 14, "There are fourteen ranks in the graph" );
 605
 606 # Detach the erroneously collated reading
 607 my( $newr, @del_rdgs ) = $sc->duplicate_reading( 'n131', 'Ba96' );
 608 ok( $newr, "New reading was created" );
 609 ok( $sc->reading('n131_0'), "Detached the bad collation with a new reading" );
 610 is( scalar( $sc->readings ), $numr + 1, "A reading was added to the graph" );
 611 is( $sc->end->rank, 10, "There are now only ten ranks in the graph" );
 612 my $csucc = $sc->common_successor( 'n131', 'n131_0' );
 613 is( $csucc->id, 'n136', "Found correct common successor to duped reading" );
 614
 615 # Check that the bad transposition is gone
 616 is( scalar @del_rdgs, 1, "Deleted reading was returned by API call" );
 617 is( $sc->get_relationship( 'n130', 'n135' ), undef, "Bad transposition relationship is gone" );
 618
 619 # The collation should not be fixed
 620 my @pairs = $sc->identical_readings();
 621 is( scalar @pairs, 0, "Not re-collated yet" );
 622 # Fix the collation
 623 ok( $sc->merge_readings( 'n124', 'n131_0' ), "Collated the readings correctly" );
 624 @pairs = $sc->identical_readings( start => 'n124', end => $csucc->id );
 625 is( scalar @pairs, 3, "Found three more identical readings" );
 626 is( $sc->end->rank, 11, "The ranks shifted appropriately" );
 627 $sc->flatten_ranks();
 628 is( scalar( $sc->readings ), $numr - 3, "Now we are collated correctly" );
 629
 630 # Check that we can't "duplicate" a reading with no wits or with all wits
 631 try {
 632         my( $badr, @del_rdgs ) = $sc->duplicate_reading( 'n124' );
 633         ok( 0, "Reading duplication without witnesses throws an error" );
 634 } catch( Text::Tradition::Error $e ) {
 635         like( $e->message, qr/Must specify one or more witnesses/,
 636                 "Reading duplication without witnesses throws the expected error" );
 637 } catch {
 638         ok( 0, "Reading duplication without witnesses threw the wrong error" );
 639 }
 640
 641 try {
 642         my( $badr, @del_rdgs ) = $sc->duplicate_reading( 'n124', 'Ba96', 'Mü11475' );
 643         ok( 0, "Reading duplication with all witnesses throws an error" );
 644 } catch( Text::Tradition::Error $e ) {
 645         like( $e->message, qr/Cannot join all witnesses/,
 646                 "Reading duplication with all witnesses throws the expected error" );
 647 } catch {
 648         ok( 0, "Reading duplication with all witnesses threw the wrong error" );
 649 }
 650
 651 try {
 652         $sc->calculate_ranks();
 653         ok( 1, "Graph is still evidently whole" );
 654 } catch( Text::Tradition::Error $e ) {
 655         ok( 0, "Caught a rank exception: " . $e->message );
 656 }
 657
 658 =end testing
 659
 660 =cut
 661
 662 sub duplicate_reading {
 663         my( $self, $r, @wits ) = @_;
 664         # Check that we are not doing anything unwise.
 665         throw( "Must specify one or more witnesses for the duplicated reading" )
 666                 unless @wits;
 667         unless( ref( $r ) eq 'Text::Tradition::Collation::Reading' ) {
 668                 $r = $self->reading( $r );
 669         }
 670         throw( "Cannot duplicate a meta-reading" )
 671                 if $r->is_meta;
 672         throw( "Cannot join all witnesses to the new reading" )
 673                 if scalar( @wits ) == scalar( $r->witnesses );
 674
 675         # Get all the reading attributes and duplicate them.
 676         my $rmeta = Text::Tradition::Collation::Reading->meta;
 677         my %args;
 678     foreach my $attr( $rmeta->get_all_attributes ) {
 679                 next if $attr->name =~ /^_/;
 680                 my $acc = $attr->get_read_method;
 681                 if( !$acc && $attr->has_applied_traits ) {
 682                         my $tr = $attr->applied_traits;
 683                         if( $tr->[0] =~ /::(Array|Hash)$/ ) {
 684                                 my $which = $1;
 685                                 my %methods = reverse %{$attr->handles};
 686                                 $acc = $methods{elements};
 687                                 $args{$attr->name} = $which eq 'Array'
 688                                         ? [ $r->$acc ] : { $r->$acc };
 689                         }
 690                 } else {
 691                         $args{$attr->name} = $r->$acc if $acc;
 692                 }
 693         }
 694         # By definition the new reading will no longer be common.
 695         $args{is_common} = 0;
 696         # The new reading also needs its own ID.
 697         $args{id} = $self->_generate_dup_id( $r->id );
 698
 699         # Try to make the new reading.
 700         my $newr = $self->add_reading( \%args );
 701         # The old reading is also no longer common.
 702         $r->is_common( 0 );
 703
 704         # For each of the witnesses, dissociate from the old reading and
 705         # associate with the new.
 706         foreach my $wit ( @wits ) {
 707                 my $prior = $self->prior_reading( $r, $wit );
 708                 my $next = $self->next_reading( $r, $wit );
 709                 $self->del_path( $prior, $r, $wit );
 710                 $self->add_path( $prior, $newr, $wit );
 711                 $self->del_path( $r, $next, $wit );
 712                 $self->add_path( $newr, $next, $wit );
 713         }
 714
 715         # If the graph is ranked, we need to look for relationships that are now
 716         # invalid (i.e. 'non-colocation' types that might now be colocated) and
 717         # remove them. If not, we can skip it.
 718         my $succ;
 719         my %rrk;
 720         my @deleted_relations;
 721         if( $self->end->has_rank ) {
 722                 # Find the point where we can stop checking
 723                 $succ = $self->common_successor( $r, $newr );
 724
 725                 # Hash the existing ranks
 726                 foreach my $rdg ( $self->readings ) {
 727                         $rrk{$rdg->id} = $rdg->rank;
 728                 }
 729                 # Calculate the new ranks
 730                 $self->calculate_ranks();
 731
 732                 # Check for invalid non-colocated relationships among changed-rank readings
 733                 # from where the ranks start changing up to $succ
 734                 my $lastrank = $succ->rank;
 735                 foreach my $rdg ( $self->readings ) {
 736                         next if $rdg->rank > $lastrank;
 737                         next if $rdg->rank == $rrk{$rdg->id};
 738                         my @noncolo = $rdg->related_readings( sub { !$_[0]->colocated } );
 739                         next unless @noncolo;
 740                         foreach my $nc ( @noncolo ) {
 741                                 unless( $self->relations->verify_or_delete( $rdg, $nc ) ) {
 742                                         push( @deleted_relations, [ $rdg->id, $nc->id ] );
 743                                 }
 744                         }
 745                 }
 746         }
 747         return ( $newr, @deleted_relations );
 748 }
 749
 750 sub _generate_dup_id {
 751         my( $self, $rid ) = @_;
 752         my $newid;
 753         my $i = 0;
 754         while( !$newid ) {
 755                 $newid = $rid."_$i";
 756                 if( $self->has_reading( $newid ) ) {
 757                         $newid = '';
 758                         $i++;
 759                 }
 760         }
 761         return $newid;
 762 }
 763
 764 ### Path logic
 765
 766 sub add_path {
 767         my $self = shift;
 768
 769         # We only need the IDs for adding paths to the graph, not the reading
 770         # objects themselves.
 771     my( $source, $target, $wit ) = $self->_stringify_args( @_ );
 772
 773         $self->_graphcalc_done(0);
 774         # Connect the readings
 775         unless( $self->sequence->has_edge( $source, $target ) ) {
 776             $self->sequence->add_edge( $source, $target );
 777             $self->relations->add_equivalence_edge( $source, $target );
 778         }
 779     # Note the witness in question
 780     $self->sequence->set_edge_attribute( $source, $target, $wit, 1 );
 781 }
 782
 783 sub del_path {
 784         my $self = shift;
 785         my @args;
 786         if( ref( $_[0] ) eq 'ARRAY' ) {
 787                 my $e = shift @_;
 788                 @args = ( @$e, @_ );
 789         } else {
 790                 @args = @_;
 791         }
 792
 793         # We only need the IDs for removing paths from the graph, not the reading
 794         # objects themselves.
 795     my( $source, $target, $wit ) = $self->_stringify_args( @args );
 796
 797         $self->_graphcalc_done(0);
 798         if( $self->sequence->has_edge_attribute( $source, $target, $wit ) ) {
 799                 $self->sequence->delete_edge_attribute( $source, $target, $wit );
 800         }
 801         unless( $self->sequence->has_edge_attributes( $source, $target ) ) {
 802                 $self->sequence->delete_edge( $source, $target );
 803                 $self->relations->delete_equivalence_edge( $source, $target );
 804         }
 805 }
 806
 807
 808 # Extra graph-alike utility
 809 sub has_path {
 810         my $self = shift;
 811     my( $source, $target, $wit ) = $self->_stringify_args( @_ );
 812         return undef unless $self->sequence->has_edge( $source, $target );
 813         return $self->sequence->has_edge_attribute( $source, $target, $wit );
 814 }
 815
 816 =head2 clear_witness( @sigil_list )
 817
 818 Clear the given witnesses out of the collation entirely, removing references
 819 to them in paths, and removing readings that belong only to them.  Should only
 820 be called via $tradition->del_witness.
 821
 822 =cut
 823
 824 sub clear_witness {
 825         my( $self, @sigils ) = @_;
 826
 827         $self->_graphcalc_done(0);
 828         # Clear the witness(es) out of the paths
 829         foreach my $e ( $self->paths ) {
 830                 foreach my $sig ( @sigils ) {
 831                         $self->del_path( $e, $sig );
 832                 }
 833         }
 834
 835         # Clear out the newly unused readings
 836         foreach my $r ( $self->readings ) {
 837                 unless( $self->reading_witnesses( $r ) ) {
 838                         $self->del_reading( $r );
 839                 }
 840         }
 841 }
 842
 843 sub add_relationship {
 844         my $self = shift;
 845     my( $source, $target, $opts ) = $self->_stringify_args( @_ );
 846     my( @vectors ) = $self->relations->add_relationship( $source, $target, $opts );
 847     foreach my $v ( @vectors ) {
 848         next unless $self->get_relationship( $v )->colocated;
 849         if( $self->reading( $v->[0] )->has_rank && $self->reading( $v->[1] )->has_rank
 850                 && $self->reading( $v->[0] )->rank ne $self->reading( $v->[1] )->rank ) {
 851                         $self->_graphcalc_done(0);
 852                         $self->_clear_cache;
 853                         last;
 854         }
 855     }
 856     return @vectors;
 857 }
 858
 859 around qw/ get_relationship del_relationship / => sub {
 860         my $orig = shift;
 861         my $self = shift;
 862         my @args = @_;
 863         if( @args == 1 && ref( $args[0] ) eq 'ARRAY' ) {
 864                 @args = @{$_[0]};
 865         }
 866         my @stringargs = $self->_stringify_args( @args );
 867         $self->$orig( @stringargs );
 868 };
 869
 870 =head2 reading_witnesses( $reading )
 871
 872 Return a list of sigils corresponding to the witnesses in which the reading appears.
 873
 874 =cut
 875
 876 sub reading_witnesses {
 877         my( $self, $reading ) = @_;
 878         # We need only check either the incoming or the outgoing edges; I have
 879         # arbitrarily chosen "incoming".  Thus, special-case the start node.
 880         if( $reading eq $self->start ) {
 881                 return map { $_->sigil } grep { $_->is_collated } $self->tradition->witnesses;
 882         }
 883         my %all_witnesses;
 884         foreach my $e ( $self->sequence->edges_to( $reading ) ) {
 885                 my $wits = $self->sequence->get_edge_attributes( @$e );
 886                 @all_witnesses{ keys %$wits } = 1;
 887         }
 888         my $acstr = $self->ac_label;
 889         foreach my $acwit ( grep { $_ =~ s/^(.*)\Q$acstr\E$/$1/ } keys %all_witnesses ) {
 890                 delete $all_witnesses{$acwit.$acstr} if exists $all_witnesses{$acwit};
 891         }
 892         return keys %all_witnesses;
 893 }
 894
 895 =head1 OUTPUT METHODS
 896
 897 =head2 as_svg( \%options )
 898
 899 Returns an SVG string that represents the graph, via as_dot and graphviz.
 900 See as_dot for a list of options.  Must have GraphViz (dot) installed to run.
 901
 902 =cut
 903
 904 sub as_svg {
 905     my( $self, $opts ) = @_;
 906     throw( "Need GraphViz installed to output SVG" )
 907         unless File::Which::which( 'dot' );
 908     my $want_subgraph = exists $opts->{'from'} || exists $opts->{'to'};
 909     $self->calculate_ranks()
 910         unless( $self->_graphcalc_done || $opts->{'nocalc'} || !$self->linear );
 911         my @cmd = qw/dot -Tsvg/;
 912         my( $svg, $err );
 913         my $dotfile = File::Temp->new();
 914         ## USE FOR DEBUGGING
 915         # $dotfile->unlink_on_destroy(0);
 916         binmode $dotfile, ':utf8';
 917         print $dotfile $self->as_dot( $opts );
 918         push( @cmd, $dotfile->filename );
 919         run( \@cmd, ">", binary(), \$svg );
 920         $svg = decode_utf8( $svg );
 921         return $svg;
 922 }
 923
 924
 925 =head2 as_dot( \%options )
 926
 927 Returns a string that is the collation graph expressed in dot
 928 (i.e. GraphViz) format.  Options include:
 929
 930 =over 4
 931
 932 =item * from
 933
 934 =item * to
 935
 936 =item * color_common
 937
 938 =back
 939
 940 =cut
 941
 942 sub as_dot {
 943     my( $self, $opts ) = @_;
 944     my $startrank = $opts->{'from'} if $opts;
 945     my $endrank = $opts->{'to'} if $opts;
 946     my $color_common = $opts->{'color_common'} if $opts;
 947     my $STRAIGHTENHACK = !$startrank && !$endrank && $self->end->rank
 948        && $self->end->rank > 100;
 949     $STRAIGHTENHACK = 1 if $opts->{'straight'}; # even for subgraphs or small graphs
 950
 951     # Check the arguments
 952     if( $startrank ) {
 953         return if $endrank && $startrank > $endrank;
 954         return if $startrank > $self->end->rank;
 955         }
 956         if( defined $endrank ) {
 957                 return if $endrank < 0;
 958                 $endrank = undef if $endrank == $self->end->rank;
 959         }
 960
 961     my $graph_name = $self->tradition->name;
 962     $graph_name =~ s/[^\w\s]//g;
 963     $graph_name = join( '_', split( /\s+/, $graph_name ) );
 964
 965     my %graph_attrs = (
 966         'rankdir' => 'LR',
 967         'bgcolor' => 'none',
 968         );
 969     my %node_attrs = (
 970         'fontsize' => 14,
 971         'fillcolor' => 'white',
 972         'style' => 'filled',
 973         'shape' => 'ellipse'
 974         );
 975     my %edge_attrs = (
 976         'arrowhead' => 'open',
 977         'color' => '#000000',
 978         'fontcolor' => '#000000',
 979         );
 980
 981     my $dot = sprintf( "digraph %s {\n", $graph_name );
 982     $dot .= "\tgraph " . _dot_attr_string( \%graph_attrs ) . ";\n";
 983     $dot .= "\tnode " . _dot_attr_string( \%node_attrs ) . ";\n";
 984
 985         # Output substitute start/end readings if necessary
 986         if( $startrank ) {
 987                 $dot .= "\t\"__SUBSTART__\" [ label=\"...\",id=\"__START__\" ];\n";
 988         }
 989         if( $endrank ) {
 990                 $dot .= "\t\"__SUBEND__\" [ label=\"...\",id=\"__END__\" ];\n";
 991         }
 992         if( $STRAIGHTENHACK ) {
 993                 ## HACK part 1
 994                 my $startlabel = $startrank ? '__SUBSTART__' : '__START__';
 995                 $dot .= "\tsubgraph { rank=same \"$startlabel\" \"#SILENT#\" }\n";
 996                 $dot .= "\t\"#SILENT#\" [ shape=diamond,color=white,penwidth=0,label=\"\" ];"
 997         }
 998         my %used;  # Keep track of the readings that actually appear in the graph
 999         # Sort the readings by rank if we have ranks; this speeds layout.
1000         my @all_readings = $self->end->has_rank
1001                 ? sort { $a->rank <=> $b->rank } $self->readings
1002                 : $self->readings;
1003         # TODO Refrain from outputting lacuna nodes - just grey out the edges.
1004     foreach my $reading ( @all_readings ) {
1005         # Only output readings within our rank range.
1006         next if $startrank && $reading->rank < $startrank;
1007         next if $endrank && $reading->rank > $endrank;
1008         $used{$reading->id} = 1;
1009         # Need not output nodes without separate labels
1010         next if $reading->id eq $reading->text;
1011         my $rattrs;
1012         my $label = $reading->text;
1013         $label .= '-' if $reading->join_next;
1014         $label = "-$label" if $reading->join_prior;
1015         $label =~ s/\"/\\\"/g;
1016                 $rattrs->{'label'} = $label;
1017                 $rattrs->{'id'} = $reading->id;
1018                 $rattrs->{'fillcolor'} = '#b3f36d' if $reading->is_common && $color_common;
1019         $dot .= sprintf( "\t\"%s\" %s;\n", $reading->id, _dot_attr_string( $rattrs ) );
1020     }
1021
1022         # Add the real edges. Need to weight one edge per rank jump, in a
1023         # continuous line.
1024         # my $weighted = $self->_add_edge_weights;
1025     my @edges = $self->paths;
1026         my( %substart, %subend );
1027     foreach my $edge ( @edges ) {
1028         # Do we need to output this edge?
1029         if( $used{$edge->[0]} && $used{$edge->[1]} ) {
1030                 my $label = $self->_path_display_label( $opts,
1031                         $self->path_witnesses( $edge ) );
1032                         my $variables = { %edge_attrs, 'label' => $label };
1033
1034                         # Account for the rank gap if necessary
1035                         my $rank0 = $self->reading( $edge->[0] )->rank
1036                                 if $self->reading( $edge->[0] )->has_rank;
1037                         my $rank1 = $self->reading( $edge->[1] )->rank
1038                                 if $self->reading( $edge->[1] )->has_rank;
1039                         if( defined $rank0 && defined $rank1 && $rank1 - $rank0 > 1 ) {
1040                                 $variables->{'minlen'} = $rank1 - $rank0;
1041                         }
1042
1043                         # Add the calculated edge weights
1044                         # if( exists $weighted->{$edge->[0]}
1045                         #       && $weighted->{$edge->[0]} eq $edge->[1] ) {
1046                         #       # $variables->{'color'} = 'red';
1047                         #       $variables->{'weight'} = 3.0;
1048                         # }
1049
1050                         # EXPERIMENTAL: make edge width reflect no. of witnesses
1051                         my $extrawidth = scalar( $self->path_witnesses( $edge ) ) * 0.2;
1052                         $variables->{'penwidth'} = $extrawidth + 0.8; # gives 1 for a single wit
1053
1054                         my $varopts = _dot_attr_string( $variables );
1055                         $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
1056                                 $edge->[0], $edge->[1], $varopts );
1057         } elsif( $used{$edge->[0]} ) {
1058                 $subend{$edge->[0]} = $edge->[1];
1059         } elsif( $used{$edge->[1]} ) {
1060                 $substart{$edge->[1]} = $edge->[0];
1061         }
1062     }
1063
1064     # If we are asked to, add relationship links
1065     if( exists $opts->{show_relations} ) {
1066         my $filter = $opts->{show_relations}; # can be 'transposition' or 'all'
1067         if( $filter eq 'transposition' ) {
1068                 $filter =~ qr/^transposition$/;
1069         }
1070         my %typecolors;
1071         my @types = sort( map { $_->name } $self->relations->types );
1072         if( exists $opts->{graphcolors} ) {
1073                 foreach my $tdx ( 0 .. $#types ) {
1074                         $typecolors{$types[$tdx]} = $opts->{graphcolors}->[$tdx];
1075                 }
1076         } else {
1077                 map { $typecolors{$_} = '#FFA14F' } @types;
1078         }
1079         foreach my $redge ( $self->relationships ) {
1080                 if( $used{$redge->[0]} && $used{$redge->[1]} ) {
1081                                 my $rel = $self->get_relationship( $redge );
1082                                 next unless $filter eq 'all' || $rel->type =~ /$filter/;
1083                                 my $variables = {
1084                                         arrowhead => 'none',
1085                                         color => $typecolors{$rel->type},
1086                                         constraint => 'false',
1087                                         penwidth => '3',
1088                                 };
1089                                 unless( exists $opts->{graphcolors} ) {
1090                                         $variables->{label} = uc( substr( $rel->type, 0, 4 ) ),
1091                                 }
1092                                 $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
1093                                         $redge->[0], $redge->[1], _dot_attr_string( $variables ) );
1094                 }
1095         }
1096     }
1097
1098     # Add substitute start and end edges if necessary
1099     foreach my $node ( keys %substart ) {
1100         my $witstr = $self->_path_display_label( $opts,
1101                 $self->path_witnesses( $substart{$node}, $node ) );
1102         my $variables = { %edge_attrs, 'label' => $witstr };
1103         my $nrdg = $self->reading( $node );
1104         if( $nrdg->has_rank && $nrdg->rank > $startrank ) {
1105                 # Substart is actually one lower than $startrank
1106                 $variables->{'minlen'} = $nrdg->rank - ( $startrank - 1 );
1107         }
1108         my $varopts = _dot_attr_string( $variables );
1109         $dot .= "\t\"__SUBSTART__\" -> \"$node\" $varopts;\n";
1110         }
1111     foreach my $node ( keys %subend ) {
1112         my $witstr = $self->_path_display_label( $opts,
1113                 $self->path_witnesses( $node, $subend{$node} ) );
1114         my $variables = { %edge_attrs, 'label' => $witstr };
1115         my $varopts = _dot_attr_string( $variables );
1116         $dot .= "\t\"$node\" -> \"__SUBEND__\" $varopts;\n";
1117         }
1118         # HACK part 2
1119         if( $STRAIGHTENHACK ) {
1120                 my $endlabel = $endrank ? '__SUBEND__' : '__END__';
1121                 $dot .= "\t\"$endlabel\" -> \"#SILENT#\" [ color=white,penwidth=0 ];\n";
1122         }
1123
1124     $dot .= "}\n";
1125     return $dot;
1126 }
1127
1128 sub _dot_attr_string {
1129         my( $hash ) = @_;
1130         my @attrs;
1131         foreach my $k ( sort keys %$hash ) {
1132                 my $v = $hash->{$k};
1133                 push( @attrs, $k.'="'.$v.'"' );
1134         }
1135         return( '[ ' . join( ', ', @attrs ) . ' ]' );
1136 }
1137
1138 sub _add_edge_weights {
1139         my $self = shift;
1140         # Walk the graph from START to END, choosing the successor node with
1141         # the largest number of witness paths each time.
1142         my $weighted = {};
1143         my $curr = $self->start->id;
1144         my $ranked = $self->end->has_rank;
1145         while( $curr ne $self->end->id ) {
1146                 my $rank = $ranked ? $self->reading( $curr )->rank : 0;
1147                 my @succ = sort { $self->path_witnesses( $curr, $a )
1148                                                         <=> $self->path_witnesses( $curr, $b ) }
1149                         $self->sequence->successors( $curr );
1150                 my $next = pop @succ;
1151                 my $nextrank = $ranked ? $self->reading( $next )->rank : 0;
1152                 # Try to avoid lacunae in the weighted path.
1153                 while( @succ &&
1154                            ( $self->reading( $next )->is_lacuna ||
1155                                  $nextrank - $rank > 1 ) ){
1156                         $next = pop @succ;
1157                 }
1158                 $weighted->{$curr} = $next;
1159                 $curr = $next;
1160         }
1161         return $weighted;
1162 }
1163
1164 =head2 path_witnesses( $edge )
1165
1166 Returns the list of sigils whose witnesses are associated with the given edge.
1167 The edge can be passed as either an array or an arrayref of ( $source, $target ).
1168
1169 =cut
1170
1171 sub path_witnesses {
1172         my( $self, @edge ) = @_;
1173         # If edge is an arrayref, cope.
1174         if( @edge == 1 && ref( $edge[0] ) eq 'ARRAY' ) {
1175                 my $e = shift @edge;
1176                 @edge = @$e;
1177         }
1178         my @wits = keys %{$self->sequence->get_edge_attributes( @edge )};
1179         return @wits;
1180 }
1181
1182 # Helper function. Make a display label for the given witnesses, showing a.c.
1183 # witnesses only where the main witness is not also in the list.
1184 sub _path_display_label {
1185         my $self = shift;
1186         my $opts = shift;
1187         my %wits;
1188         map { $wits{$_} = 1 } @_;
1189
1190         # If an a.c. wit is listed, remove it if the main wit is also listed.
1191         # Otherwise keep it for explicit listing.
1192         my $aclabel = $self->ac_label;
1193         my @disp_ac;
1194         foreach my $w ( sort keys %wits ) {
1195                 if( $w =~ /^(.*)\Q$aclabel\E$/ ) {
1196                         if( exists $wits{$1} ) {
1197                                 delete $wits{$w};
1198                         } else {
1199                                 push( @disp_ac, $w );
1200                         }
1201                 }
1202         }
1203
1204         if( $opts->{'explicit_wits'} ) {
1205                 return join( ', ', sort keys %wits );
1206         } else {
1207                 # See if we are in a majority situation.
1208                 my $maj = scalar( $self->tradition->witnesses ) * 0.6;
1209                 $maj = $maj > 5 ? $maj : 5;
1210                 if( scalar keys %wits > $maj ) {
1211                         unshift( @disp_ac, 'majority' );
1212                         return join( ', ', @disp_ac );
1213                 } else {
1214                         return join( ', ', sort keys %wits );
1215                 }
1216         }
1217 }
1218
1219 =head2 readings_at_rank( $rank )
1220
1221 Returns a list of readings at a given rank, taken from the alignment table.
1222
1223 =cut
1224
1225 sub readings_at_rank {
1226         my( $self, $rank ) = @_;
1227         my $table = $self->alignment_table;
1228         # Table rank is real rank - 1.
1229         my @elements = map { $_->{'tokens'}->[$rank-1] } @{$table->{'alignment'}};
1230         my %readings;
1231         foreach my $e ( @elements ) {
1232                 next unless ref( $e ) eq 'HASH';
1233                 next unless exists $e->{'t'};
1234                 $readings{$e->{'t'}->id} = $e->{'t'};
1235         }
1236         return values %readings;
1237 }
1238
1239 =head2 as_graphml
1240
1241 Returns a GraphML representation of the collation.  The GraphML will contain
1242 two graphs. The first expresses the attributes of the readings and the witness
1243 paths that link them; the second expresses the relationships that link the
1244 readings.  This is the native transfer format for a tradition.
1245
1246 =begin testing
1247
1248 use Text::Tradition;
1249 use TryCatch;
1250
1251 my $READINGS = 311;
1252 my $PATHS = 361;
1253
1254 my $datafile = 't/data/florilegium_tei_ps.xml';
1255 my $tradition = Text::Tradition->new( 'input' => 'TEI',
1256                                       'name' => 'test0',
1257                                       'file' => $datafile,
1258                                       'linear' => 1 );
1259
1260 ok( $tradition, "Got a tradition object" );
1261 is( scalar $tradition->witnesses, 13, "Found all witnesses" );
1262 ok( $tradition->collation, "Tradition has a collation" );
1263
1264 my $c = $tradition->collation;
1265 is( scalar $c->readings, $READINGS, "Collation has all readings" );
1266 is( scalar $c->paths, $PATHS, "Collation has all paths" );
1267 is( scalar $c->relationships, 0, "Collation has all relationships" );
1268
1269 # Add a few relationships
1270 $c->add_relationship( 'w123', 'w125', { 'type' => 'collated' } );
1271 $c->add_relationship( 'w193', 'w196', { 'type' => 'collated' } );
1272 $c->add_relationship( 'w257', 'w262', { 'type' => 'transposition' } );
1273
1274 # Now write it to GraphML and parse it again.
1275
1276 my $graphml = $c->as_graphml;
1277 my $st = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml );
1278 is( scalar $st->collation->readings, $READINGS, "Reparsed collation has all readings" );
1279 is( scalar $st->collation->paths, $PATHS, "Reparsed collation has all paths" );
1280 is( scalar $st->collation->relationships, 3, "Reparsed collation has new relationships" );
1281
1282 # Now add a stemma, write to GraphML, and look at the output.
1283 SKIP: {
1284         skip "Analysis module not present", 3 unless $tradition->can( 'add_stemma' );
1285         my $stemma = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
1286         is( ref( $stemma ), 'Text::Tradition::Stemma', "Parsed dotfile into stemma" );
1287         is( $tradition->stemmata, 1, "Tradition now has the stemma" );
1288         $graphml = $c->as_graphml;
1289         like( $graphml, qr/digraph/, "Digraph declaration exists in GraphML" );
1290 }
1291
1292 =end testing
1293
1294 =cut
1295
1296 ## TODO MOVE this to Tradition.pm and modularize it better
1297 sub as_graphml {
1298     my( $self, $options ) = @_;
1299         $self->calculate_ranks unless $self->_graphcalc_done;
1300
1301         my $start = $options->{'from'}
1302                 ? $self->reading( $options->{'from'} ) : $self->start;
1303         my $end = $options->{'to'}
1304                 ? $self->reading( $options->{'to'} ) : $self->end;
1305         if( $start->has_rank && $end->has_rank && $end->rank < $start->rank ) {
1306                 throw( 'Start node must be before end node' );
1307         }
1308         # The readings need to be ranked for this to work.
1309         $start = $self->start unless $start->has_rank;
1310         $end = $self->end unless $end->has_rank;
1311         my $rankoffset = 0;
1312         unless( $start eq $self->start ) {
1313                 $rankoffset = $start->rank - 1;
1314         }
1315         my %use_readings;
1316
1317     # Some namespaces
1318     my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns';
1319     my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance';
1320     my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' .
1321         'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
1322
1323     # Create the document and root node
1324     require XML::LibXML;
1325     my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
1326     my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
1327     $graphml->setDocumentElement( $root );
1328     $root->setNamespace( $xsi_ns, 'xsi', 0 );
1329     $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
1330
1331     # List of attribute types to save on our objects and their corresponding
1332     # GraphML types
1333     my %save_types = (
1334         'Str' => 'string',
1335         'Int' => 'int',
1336         'Bool' => 'boolean',
1337         'ReadingID' => 'string',
1338         'RelationshipType' => 'string',
1339         'RelationshipScope' => 'string',
1340     );
1341
1342     # Add the data keys for the graph. Include an extra key 'version' for the
1343     # GraphML output version.
1344     my %graph_data_keys;
1345     my $gdi = 0;
1346     my %graph_attributes = ( 'version' => 'string' );
1347         # Graph attributes include those of Tradition and those of Collation.
1348         my %gattr_from;
1349         # TODO Use meta introspection method from duplicate_reading to do this
1350         # instead of naming custom keys.
1351         my $tmeta = $self->tradition->meta;
1352         my $cmeta = $self->meta;
1353         map { $gattr_from{$_->name} = 'Tradition' } $tmeta->get_all_attributes;
1354         map { $gattr_from{$_->name} = 'Collation' } $cmeta->get_all_attributes;
1355         foreach my $attr ( ( $tmeta->get_all_attributes, $cmeta->get_all_attributes ) ) {
1356                 next if $attr->name =~ /^_/;
1357                 next unless $save_types{$attr->type_constraint->name};
1358                 $graph_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1359         }
1360     # Extra custom keys for complex objects that should be saved in some form.
1361     # The subroutine should return a string, or undef/empty.
1362     if( $tmeta->has_method('stemmata') ) {
1363                 $graph_attributes{'stemmata'} = sub {
1364                         my @stemstrs;
1365                         map { push( @stemstrs, $_->editable( {linesep => ''} ) ) }
1366                                 $self->tradition->stemmata;
1367                         join( "\n", @stemstrs );
1368                 };
1369         }
1370
1371         if( $tmeta->has_method('user') ) {
1372                 $graph_attributes{'user'} = sub {
1373                         $self->tradition->user ? $self->tradition->user->id : undef
1374                 };
1375         }
1376
1377     foreach my $datum ( sort keys %graph_attributes ) {
1378         $graph_data_keys{$datum} = 'dg'.$gdi++;
1379         my $key = $root->addNewChild( $graphml_ns, 'key' );
1380         my $dtype = ref( $graph_attributes{$datum} ) ? 'string'
1381                 : $graph_attributes{$datum};
1382         $key->setAttribute( 'attr.name', $datum );
1383         $key->setAttribute( 'attr.type', $dtype );
1384         $key->setAttribute( 'for', 'graph' );
1385         $key->setAttribute( 'id', $graph_data_keys{$datum} );
1386     }
1387
1388     # Add the data keys for reading nodes
1389     my %reading_attributes;
1390     my $rmeta = Text::Tradition::Collation::Reading->meta;
1391     foreach my $attr( $rmeta->get_all_attributes ) {
1392                 next if $attr->name =~ /^_/;
1393                 next unless $save_types{$attr->type_constraint->name};
1394                 $reading_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1395         }
1396         if( $self->start->does('Text::Tradition::Morphology' ) ) {
1397                 # Extra custom key for the reading morphology
1398                 $reading_attributes{'lexemes'} = 'string';
1399         }
1400
1401     my %node_data_keys;
1402     my $ndi = 0;
1403     foreach my $datum ( sort keys %reading_attributes ) {
1404         $node_data_keys{$datum} = 'dn'.$ndi++;
1405         my $key = $root->addNewChild( $graphml_ns, 'key' );
1406         $key->setAttribute( 'attr.name', $datum );
1407         $key->setAttribute( 'attr.type', $reading_attributes{$datum} );
1408         $key->setAttribute( 'for', 'node' );
1409         $key->setAttribute( 'id', $node_data_keys{$datum} );
1410     }
1411
1412     # Add the data keys for edges, that is, paths and relationships. Path
1413     # data does not come from a Moose class so is here manually.
1414     my $edi = 0;
1415     my %edge_data_keys;
1416     my %edge_attributes = (
1417         witness => 'string',                    # ID/label for a path
1418         extra => 'boolean',                             # Path key
1419         );
1420     my @path_attributes = keys %edge_attributes; # track our manual additions
1421     my $pmeta = Text::Tradition::Collation::Relationship->meta;
1422     foreach my $attr( $pmeta->get_all_attributes ) {
1423                 next if $attr->name =~ /^_/;
1424                 next unless $save_types{$attr->type_constraint->name};
1425                 $edge_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1426         }
1427     foreach my $datum ( sort keys %edge_attributes ) {
1428         $edge_data_keys{$datum} = 'de'.$edi++;
1429         my $key = $root->addNewChild( $graphml_ns, 'key' );
1430         $key->setAttribute( 'attr.name', $datum );
1431         $key->setAttribute( 'attr.type', $edge_attributes{$datum} );
1432         $key->setAttribute( 'for', 'edge' );
1433         $key->setAttribute( 'id', $edge_data_keys{$datum} );
1434     }
1435
1436     # Add the collation graph itself. First, sanitize the name to a valid XML ID.
1437     my $xmlidname = $self->tradition->name;
1438     $xmlidname =~ s/(?!$xml10_namechar_rx)./_/g;
1439     if( $xmlidname !~ /^$xml10_namestartchar_rx/ ) {
1440         $xmlidname = '_'.$xmlidname;
1441     }
1442     my $sgraph = $root->addNewChild( $graphml_ns, 'graph' );
1443     $sgraph->setAttribute( 'edgedefault', 'directed' );
1444     $sgraph->setAttribute( 'id', $xmlidname );
1445     $sgraph->setAttribute( 'parse.edgeids', 'canonical' );
1446     $sgraph->setAttribute( 'parse.edges', 0 ); # fill in later
1447     $sgraph->setAttribute( 'parse.nodeids', 'canonical' );
1448     $sgraph->setAttribute( 'parse.nodes', 0 ); # fill in later
1449     $sgraph->setAttribute( 'parse.order', 'nodesfirst' );
1450
1451     # Tradition/collation attribute data
1452     foreach my $datum ( keys %graph_attributes ) {
1453         my $value;
1454         if( $datum eq 'version' ) {
1455                 $value = '3.2';
1456         } elsif( ref( $graph_attributes{$datum} ) ) {
1457                 my $sub = $graph_attributes{$datum};
1458                 $value = &$sub();
1459         } elsif( $gattr_from{$datum} eq 'Tradition' ) {
1460                 $value = $self->tradition->$datum;
1461         } else {
1462                 $value = $self->$datum;
1463         }
1464                 _add_graphml_data( $sgraph, $graph_data_keys{$datum}, $value );
1465         }
1466
1467     my $node_ctr = 0;
1468     my %node_hash;
1469     # Add our readings to the graph
1470     foreach my $n ( sort { $a->id cmp $b->id } $self->readings ) {
1471         next if $n->has_rank && $n ne $self->start && $n ne $self->end &&
1472                 ( $n->rank < $start->rank || $n->rank > $end->rank );
1473         $use_readings{$n->id} = 1;
1474         # Add to the main graph
1475         my $node_el = $sgraph->addNewChild( $graphml_ns, 'node' );
1476         my $node_xmlid = 'n' . $node_ctr++;
1477         $node_hash{ $n->id } = $node_xmlid;
1478         $node_el->setAttribute( 'id', $node_xmlid );
1479         foreach my $d ( keys %reading_attributes ) {
1480                 my $nval = $n->$d;
1481                 # Custom serialization
1482                 if( $d eq 'lexemes' ) {
1483                                 # If nval is a true value, we have lexemes so we need to
1484                                 # serialize them. Otherwise set nval to undef so that the
1485                                 # key is excluded from this reading.
1486                         $nval = $nval ? $n->_serialize_lexemes : undef;
1487                 } elsif( $d eq 'normal_form' && $n->normal_form eq $n->text ) {
1488                         $nval = undef;
1489                 }
1490                 if( $rankoffset && $d eq 'rank' && $n ne $self->start ) {
1491                         # Adjust the ranks within the subgraph.
1492                         $nval = $n eq $self->end ? $end->rank - $rankoffset + 1
1493                                 : $nval - $rankoffset;
1494                 }
1495                 _add_graphml_data( $node_el, $node_data_keys{$d}, $nval )
1496                         if defined $nval;
1497         }
1498     }
1499
1500     # Add the path edges to the sequence graph
1501     my $edge_ctr = 0;
1502     foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->sequence->edges() ) {
1503         # We add an edge in the graphml for every witness in $e.
1504         next unless( $use_readings{$e->[0]} || $use_readings{$e->[1]} );
1505         my @edge_wits = sort $self->path_witnesses( $e );
1506         $e->[0] = $self->start->id unless $use_readings{$e->[0]};
1507         $e->[1] = $self->end->id unless $use_readings{$e->[1]};
1508         # Skip any path from start to end; that witness is not in the subgraph.
1509         next if ( $e->[0] eq $self->start->id && $e->[1] eq $self->end->id );
1510         foreach my $wit ( @edge_wits ) {
1511                         my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
1512                                                                                 $node_hash{ $e->[0] },
1513                                                                                 $node_hash{ $e->[1] } );
1514                         my $edge_el = $sgraph->addNewChild( $graphml_ns, 'edge' );
1515                         $edge_el->setAttribute( 'source', $from );
1516                         $edge_el->setAttribute( 'target', $to );
1517                         $edge_el->setAttribute( 'id', $id );
1518
1519                         # It's a witness path, so add the witness
1520                         my $base = $wit;
1521                         my $key = $edge_data_keys{'witness'};
1522                         # Is this an ante-corr witness?
1523                         my $aclabel = $self->ac_label;
1524                         if( $wit =~ /^(.*)\Q$aclabel\E$/ ) {
1525                                 # Keep the base witness
1526                                 $base = $1;
1527                                 # ...and record that this is an 'extra' reading path
1528                                 _add_graphml_data( $edge_el, $edge_data_keys{'extra'}, $aclabel );
1529                         }
1530                         _add_graphml_data( $edge_el, $edge_data_keys{'witness'}, $base );
1531                 }
1532         }
1533
1534         # Report the actual number of nodes and edges that went in
1535         $sgraph->setAttribute( 'parse.edges', $edge_ctr );
1536         $sgraph->setAttribute( 'parse.nodes', $node_ctr );
1537
1538         # Add the relationship graph to the XML
1539         map { delete $edge_data_keys{$_} } @path_attributes;
1540         $self->relations->_as_graphml( $graphml_ns, $root, \%node_hash,
1541                 $node_data_keys{'id'}, \%edge_data_keys );
1542
1543     # Save and return the thing
1544     my $result = decode_utf8( $graphml->toString(1) );
1545     return $result;
1546 }
1547
1548 sub _add_graphml_data {
1549     my( $el, $key, $value ) = @_;
1550     return unless defined $value;
1551     my $data_el = $el->addNewChild( $el->namespaceURI, 'data' );
1552     $data_el->setAttribute( 'key', $key );
1553     $data_el->appendText( $value );
1554 }
1555
1556 =head2 as_csv
1557
1558 Returns a CSV alignment table representation of the collation graph, one
1559 row per witness (or witness uncorrected.)
1560
1561 =head2 as_tsv
1562
1563 Returns a tab-separated alignment table representation of the collation graph,
1564 one row per witness (or witness uncorrected.)
1565
1566 =begin testing
1567
1568 use Text::Tradition;
1569 use Text::CSV;
1570
1571 my $READINGS = 311;
1572 my $PATHS = 361;
1573 my $WITS = 13;
1574 my $WITAC = 4;
1575
1576 my $datafile = 't/data/florilegium_tei_ps.xml';
1577 my $tradition = Text::Tradition->new( 'input' => 'TEI',
1578                                       'name' => 'test0',
1579                                       'file' => $datafile,
1580                                       'linear' => 1 );
1581
1582 my $c = $tradition->collation;
1583 # Export the thing to CSV
1584 my $csvstr = $c->as_csv();
1585 # Count the columns
1586 my $csv = Text::CSV->new({ sep_char => ',', binary => 1 });
1587 my @lines = split(/\n/, $csvstr );
1588 ok( $csv->parse( $lines[0] ), "Successfully parsed first line of CSV" );
1589 is( scalar( $csv->fields ), $WITS + $WITAC, "CSV has correct number of witness columns" );
1590 my @q_ac = grep { $_ eq 'Q'.$c->ac_label } $csv->fields;
1591 ok( @q_ac, "Found a layered witness" );
1592
1593 my $t2 = Text::Tradition->new( input => 'Tabular',
1594                                                            name => 'test2',
1595                                                            string => $csvstr,
1596                                                            sep_char => ',' );
1597 is( scalar $t2->collation->readings, $READINGS, "Reparsed CSV collation has all readings" );
1598 is( scalar $t2->collation->paths, $PATHS, "Reparsed CSV collation has all paths" );
1599
1600 # Now do it with TSV
1601 my $tsvstr = $c->as_tsv();
1602 my $t3 = Text::Tradition->new( input => 'Tabular',
1603                                                            name => 'test3',
1604                                                            string => $tsvstr,
1605                                                            sep_char => "\t" );
1606 is( scalar $t3->collation->readings, $READINGS, "Reparsed TSV collation has all readings" );
1607 is( scalar $t3->collation->paths, $PATHS, "Reparsed TSV collation has all paths" );
1608
1609 my $table = $c->alignment_table;
1610 my $noaccsv = $c->as_csv({ noac => 1 });
1611 my @noaclines = split(/\n/, $noaccsv );
1612 ok( $csv->parse( $noaclines[0] ), "Successfully parsed first line of no-ac CSV" );
1613 is( scalar( $csv->fields ), $WITS, "CSV has correct number of witness columns" );
1614 is( $c->alignment_table, $table, "Request for CSV did not alter the alignment table" );
1615
1616 my $safecsv = $c->as_csv({ safe_ac => 1});
1617 my @safelines = split(/\n/, $safecsv );
1618 ok( $csv->parse( $safelines[0] ), "Successfully parsed first line of safe CSV" );
1619 is( scalar( $csv->fields ), $WITS + $WITAC, "CSV has correct number of witness columns" );
1620 @q_ac = grep { $_ eq 'Q__L' } $csv->fields;
1621 ok( @q_ac, "Found a sanitized layered witness" );
1622 is( $c->alignment_table, $table, "Request for CSV did not alter the alignment table" );
1623
1624 =end testing
1625
1626 =cut
1627
1628 sub _tabular {
1629     my( $self, $opts ) = @_;
1630     my $table = $self->alignment_table( $opts );
1631         my $csv_options = { binary => 1, quote_null => 0 };
1632         $csv_options->{'sep_char'} = $opts->{fieldsep};
1633         if( $opts->{fieldsep} eq "\t" ) {
1634                 # If it is really tab separated, nothing is an escape char.
1635                 $csv_options->{'quote_char'} = undef;
1636                 $csv_options->{'escape_char'} = '';
1637         }
1638     my $csv = Text::CSV->new( $csv_options );
1639     my @result;
1640     # Make the header row
1641     $csv->combine( map { $_->{'witness'} } @{$table->{'alignment'}} );
1642         push( @result, $csv->string );
1643     # Make the rest of the rows
1644     foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
1645         my @rowobjs = map { $_->{'tokens'}->[$idx] } @{$table->{'alignment'}};
1646         my @row = map { $_ ? $_->{'t'}->text : $_ } @rowobjs;
1647         $csv->combine( @row );
1648         push( @result, $csv->string );
1649     }
1650     return join( "\n", @result );
1651 }
1652
1653 sub as_csv {
1654         my $self = shift;
1655         my $opts = shift || {};
1656         $opts->{fieldsep} = ',';
1657         return $self->_tabular( $opts );
1658 }
1659
1660 sub as_tsv {
1661         my $self = shift;
1662         my $opts = shift || {};
1663         $opts->{fieldsep} = "\t";
1664         return $self->_tabular( $opts );
1665 }
1666
1667 =head2 alignment_table
1668
1669 Return a reference to an alignment table, in a slightly enhanced CollateX
1670 format which looks like this:
1671
1672  $table = { alignment => [ { witness => "SIGIL",
1673                              tokens => [ { t => "TEXT" }, ... ] },
1674                            { witness => "SIG2",
1675                              tokens => [ { t => "TEXT" }, ... ] },
1676                            ... ],
1677             length => TEXTLEN };
1678
1679 =cut
1680
1681 sub alignment_table {
1682     my( $self, $opts ) = @_;
1683     if( $self->has_cached_table ) {
1684                 return $self->cached_table
1685                         unless $opts->{noac} || $opts->{safe_ac};
1686     }
1687
1688     # Make sure we can do this
1689         throw( "Need a linear graph in order to make an alignment table" )
1690                 unless $self->linear;
1691     $self->calculate_ranks()
1692         unless $self->_graphcalc_done && $self->end->has_rank;
1693
1694     my $table = { 'alignment' => [], 'length' => $self->end->rank - 1 };
1695     my @all_pos = ( 1 .. $self->end->rank - 1 );
1696     foreach my $wit ( sort { $a->sigil cmp $b->sigil } $self->tradition->witnesses ) {
1697         # say STDERR "Making witness row(s) for " . $wit->sigil;
1698         my @wit_path = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
1699         my @row = _make_witness_row( \@wit_path, \@all_pos );
1700         my $witobj = { 'witness' => $wit->sigil, 'tokens' => \@row };
1701         $witobj->{'identifier'} = $wit->identifier if $wit->identifier;
1702         push( @{$table->{'alignment'}}, $witobj );
1703         if( $wit->is_layered && !$opts->{noac} ) {
1704                 my @wit_ac_path = $self->reading_sequence( $self->start, $self->end,
1705                         $wit->sigil.$self->ac_label );
1706             my @ac_row = _make_witness_row( \@wit_ac_path, \@all_pos );
1707             my $witlabel = $opts->{safe_ac}
1708                 ? $wit->sigil . '__L' : $wit->sigil.$self->ac_label;
1709             my $witacobj = { 'witness' => $witlabel,
1710                 'tokens' => \@ac_row };
1711             $witacobj->{'identifier'} = $wit->identifier if $wit->identifier;
1712                         push( @{$table->{'alignment'}}, $witacobj );
1713         }
1714     }
1715     unless( $opts->{noac} || $opts->{safe_ac} ) {
1716             $self->cached_table( $table );
1717         }
1718     return $table;
1719 }
1720
1721 sub _make_witness_row {
1722     my( $path, $positions ) = @_;
1723     my %char_hash;
1724     map { $char_hash{$_} = undef } @$positions;
1725     my $debug = 0;
1726     foreach my $rdg ( @$path ) {
1727         say STDERR "rank " . $rdg->rank if $debug;
1728         # say STDERR "No rank for " . $rdg->id unless defined $rdg->rank;
1729         $char_hash{$rdg->rank} = { 't' => $rdg };
1730     }
1731     my @row = map { $char_hash{$_} } @$positions;
1732     # Fill in lacuna markers for undef spots in the row
1733     my $last_el = shift @row;
1734     my @filled_row = ( $last_el );
1735     foreach my $el ( @row ) {
1736         # If we are using node reference, make the lacuna node appear many times
1737         # in the table.  If not, use the lacuna tag.
1738         if( $last_el && $last_el->{'t'}->is_lacuna && !defined $el ) {
1739             $el = $last_el;
1740         }
1741         push( @filled_row, $el );
1742         $last_el = $el;
1743     }
1744     return @filled_row;
1745 }
1746
1747
1748 =head1 NAVIGATION METHODS
1749
1750 =head2 reading_sequence( $first, $last, $sigil, $backup )
1751
1752 Returns the ordered list of readings, starting with $first and ending
1753 with $last, for the witness given in $sigil. If a $backup sigil is
1754 specified (e.g. when walking a layered witness), it will be used wherever
1755 no $sigil path exists.  If there is a base text reading, that will be
1756 used wherever no path exists for $sigil or $backup.
1757
1758 =cut
1759
1760 # TODO Think about returning some lazy-eval iterator.
1761 # TODO Get rid of backup; we should know from what witness is whether we need it.
1762
1763 sub reading_sequence {
1764     my( $self, $start, $end, $witness ) = @_;
1765
1766     $witness = $self->baselabel unless $witness;
1767     my @readings = ( $start );
1768     my %seen;
1769     my $n = $start;
1770     while( $n && $n->id ne $end->id ) {
1771         if( exists( $seen{$n->id} ) ) {
1772             throw( "Detected loop for $witness at " . $n->id );
1773         }
1774         $seen{$n->id} = 1;
1775
1776         my $next = $self->next_reading( $n, $witness );
1777         unless( $next ) {
1778             throw( "Did not find any path for $witness from reading " . $n->id );
1779         }
1780         push( @readings, $next );
1781         $n = $next;
1782     }
1783     # Check that the last reading is our end reading.
1784     my $last = $readings[$#readings];
1785     throw( "Last reading found from " . $start->text .
1786         " for witness $witness is not the end!" ) # TODO do we get this far?
1787         unless $last->id eq $end->id;
1788
1789     return @readings;
1790 }
1791
1792 =head2 next_reading( $reading, $sigil );
1793
1794 Returns the reading that follows the given reading along the given witness
1795 path.
1796
1797 =cut
1798
1799 sub next_reading {
1800     # Return the successor via the corresponding path.
1801     my $self = shift;
1802     my $answer = $self->_find_linked_reading( 'next', @_ );
1803         return undef unless $answer;
1804     return $self->reading( $answer );
1805 }
1806
1807 =head2 prior_reading( $reading, $sigil )
1808
1809 Returns the reading that precedes the given reading along the given witness
1810 path.
1811
1812 =cut
1813
1814 sub prior_reading {
1815     # Return the predecessor via the corresponding path.
1816     my $self = shift;
1817     my $answer = $self->_find_linked_reading( 'prior', @_ );
1818     return $self->reading( $answer );
1819 }
1820
1821 sub _find_linked_reading {
1822     my( $self, $direction, $node, $path ) = @_;
1823
1824     # Get a backup if we are dealing with a layered witness
1825     my $alt_path;
1826     my $aclabel = $self->ac_label;
1827     if( $path && $path =~ /^(.*)\Q$aclabel\E$/ ) {
1828         $alt_path = $1;
1829     }
1830
1831     my @linked_paths = $direction eq 'next'
1832         ? $self->sequence->edges_from( $node )
1833         : $self->sequence->edges_to( $node );
1834     return undef unless scalar( @linked_paths );
1835
1836     # We have to find the linked path that contains all of the
1837     # witnesses supplied in $path.
1838     my( @path_wits, @alt_path_wits );
1839     @path_wits = sort( $self->_witnesses_of_label( $path ) ) if $path;
1840     @alt_path_wits = sort( $self->_witnesses_of_label( $alt_path ) ) if $alt_path;
1841     my $base_le;
1842     my $alt_le;
1843     foreach my $le ( @linked_paths ) {
1844         if( $self->sequence->has_edge_attribute( @$le, $self->baselabel ) ) {
1845             $base_le = $le;
1846         }
1847                 my @le_wits = sort $self->path_witnesses( $le );
1848                 if( _is_within( \@path_wits, \@le_wits ) ) {
1849                         # This is the right path.
1850                         return $direction eq 'next' ? $le->[1] : $le->[0];
1851                 } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) {
1852                         $alt_le = $le;
1853                 }
1854     }
1855     # Got this far? Return the alternate path if it exists.
1856     return $direction eq 'next' ? $alt_le->[1] : $alt_le->[0]
1857         if $alt_le;
1858
1859     # Got this far? Return the base path if it exists.
1860     return $direction eq 'next' ? $base_le->[1] : $base_le->[0]
1861         if $base_le;
1862
1863     # Got this far? We have no appropriate path.
1864     warn "Could not find $direction node from " . $node->id
1865         . " along path $path";
1866     return undef;
1867 }
1868
1869 # Some set logic.
1870 sub _is_within {
1871     my( $set1, $set2 ) = @_;
1872     my $ret = @$set1; # will be 0, i.e. false, if set1 is empty
1873     foreach my $el ( @$set1 ) {
1874         $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
1875     }
1876     return $ret;
1877 }
1878
1879 # Return the string that joins together a list of witnesses for
1880 # display on a single path.
1881 sub _witnesses_of_label {
1882     my( $self, $label ) = @_;
1883     my $regex = $self->wit_list_separator;
1884     my @answer = split( /\Q$regex\E/, $label );
1885     return @answer;
1886 }
1887
1888 =head2 common_readings
1889
1890 Returns the list of common readings in the graph (i.e. those readings that are
1891 shared by all non-lacunose witnesses.)
1892
1893 =cut
1894
1895 sub common_readings {
1896         my $self = shift;
1897         my @common = grep { $_->is_common } $self->readings;
1898         return @common;
1899 }
1900
1901 =head2 path_text( $sigil, [, $start, $end ] )
1902
1903 Returns the text of a witness (plus its backup, if we are using a layer)
1904 as stored in the collation.  The text is returned as a string, where the
1905 individual readings are joined with spaces and the meta-readings (e.g.
1906 lacunae) are omitted.  Optional specification of $start and $end allows
1907 the generation of a subset of the witness text.
1908
1909 =cut
1910
1911 sub path_text {
1912         my( $self, $wit, $start, $end ) = @_;
1913         $start = $self->start unless $start;
1914         $end = $self->end unless $end;
1915         my @path = grep { !$_->is_meta } $self->reading_sequence( $start, $end, $wit );
1916         my $pathtext = '';
1917         my $last;
1918         foreach my $r ( @path ) {
1919                 unless ( $r->join_prior || !$last || $last->join_next ) {
1920                         $pathtext .= ' ';
1921                 }
1922                 $pathtext .= $r->text;
1923                 $last = $r;
1924         }
1925         return $pathtext;
1926 }
1927
1928 =head1 INITIALIZATION METHODS
1929
1930 These are mostly for use by parsers.
1931
1932 =head2 make_witness_path( $witness )
1933
1934 Link the array of readings contained in $witness->path (and in
1935 $witness->uncorrected_path if it exists) into collation paths.
1936 Clear out the arrays when finished.
1937
1938 =head2 make_witness_paths
1939
1940 Call make_witness_path for all witnesses in the tradition.
1941
1942 =cut
1943
1944 # For use when a collation is constructed from a base text and an apparatus.
1945 # We have the sequences of readings and just need to add path edges.
1946 # When we are done, clear out the witness path attributes, as they are no
1947 # longer needed.
1948 # TODO Find a way to replace the witness path attributes with encapsulated functions?
1949
1950 sub make_witness_paths {
1951     my( $self ) = @_;
1952     foreach my $wit ( $self->tradition->witnesses ) {
1953         # say STDERR "Making path for " . $wit->sigil;
1954         $self->make_witness_path( $wit );
1955     }
1956 }
1957
1958 sub make_witness_path {
1959     my( $self, $wit ) = @_;
1960     my @chain = @{$wit->path};
1961     my $sig = $wit->sigil;
1962     # Add start and end if necessary
1963     unshift( @chain, $self->start ) unless $chain[0] eq $self->start;
1964     push( @chain, $self->end ) unless $chain[-1] eq $self->end;
1965     foreach my $idx ( 0 .. $#chain-1 ) {
1966         $self->add_path( $chain[$idx], $chain[$idx+1], $sig );
1967     }
1968     if( $wit->is_layered ) {
1969         @chain = @{$wit->uncorrected_path};
1970                 unshift( @chain, $self->start ) unless $chain[0] eq $self->start;
1971                 push( @chain, $self->end ) unless $chain[-1] eq $self->end;
1972         foreach my $idx( 0 .. $#chain-1 ) {
1973             my $source = $chain[$idx];
1974             my $target = $chain[$idx+1];
1975             $self->add_path( $source, $target, $sig.$self->ac_label )
1976                 unless $self->has_path( $source, $target, $sig );
1977         }
1978     }
1979     $wit->clear_path;
1980     $wit->clear_uncorrected_path;
1981 }
1982
1983 =head2 calculate_ranks
1984
1985 Calculate the reading ranks (that is, their aligned positions relative
1986 to each other) for the graph.  This can only be called on linear collations.
1987
1988 =begin testing
1989
1990 use Text::Tradition;
1991
1992 my $cxfile = 't/data/Collatex-16.xml';
1993 my $t = Text::Tradition->new(
1994     'name'  => 'inline',
1995     'input' => 'CollateX',
1996     'file'  => $cxfile,
1997     );
1998 my $c = $t->collation;
1999
2000 # Make an svg
2001 my $table = $c->alignment_table;
2002 ok( $c->has_cached_table, "Alignment table was cached" );
2003 is( $c->alignment_table, $table, "Cached table returned upon second call" );
2004 $c->calculate_ranks;
2005 is( $c->alignment_table, $table, "Cached table retained with no rank change" );
2006 $c->add_relationship( 'n13', 'n23', { type => 'repetition' } );
2007 is( $c->alignment_table, $table, "Alignment table unchanged after non-colo relationship add" );
2008 $c->add_relationship( 'n24', 'n23', { type => 'spelling' } );
2009 isnt( $c->alignment_table, $table, "Alignment table changed after colo relationship add" );
2010
2011 =end testing
2012
2013 =cut
2014
2015 sub calculate_ranks {
2016     my $self = shift;
2017     # Save the existing ranks, in case we need to invalidate the cached SVG.
2018     throw( "Cannot calculate ranks on a non-linear graph" )
2019         unless $self->linear;
2020     my %existing_ranks;
2021     map { $existing_ranks{$_} = $_->rank } $self->readings;
2022
2023     # Do the rankings based on the relationship equivalence graph, starting
2024     # with the start node.
2025     my ( $node_ranks, $rank_nodes ) = $self->relations->equivalence_ranks();
2026
2027     # Transfer our rankings from the topological graph to the real one.
2028     foreach my $r ( $self->readings ) {
2029         if( defined $node_ranks->{$self->equivalence( $r->id )} ) {
2030             $r->rank( $node_ranks->{$self->equivalence( $r->id )} );
2031         } else {
2032                 # Die. Find the last rank we calculated.
2033                 my @all_defined = sort { ( $node_ranks->{$self->equivalence( $a->id )}||-1 )
2034                                  <=> ( $node_ranks->{$self->equivalence( $b->id )}||-1 ) }
2035                         $self->readings;
2036                 my $last = pop @all_defined;
2037             throw( "Ranks not calculated after $last - do you have a cycle in the graph?" );
2038         }
2039     }
2040     # Do we need to invalidate the cached data?
2041     if( $self->has_cached_table ) {
2042         foreach my $r ( $self->readings ) {
2043                 next if defined( $existing_ranks{$r} )
2044                         && $existing_ranks{$r} == $r->rank;
2045                 # Something has changed, so clear the cache
2046                 $self->_clear_cache;
2047                         # ...and recalculate the common readings.
2048                         $self->calculate_common_readings();
2049                 last;
2050         }
2051     }
2052         # The graph calculation information is now up to date.
2053         $self->_graphcalc_done(1);
2054 }
2055
2056 sub _clear_cache {
2057         my $self = shift;
2058         $self->wipe_table if $self->has_cached_table;
2059 }
2060
2061
2062 =head2 flatten_ranks
2063
2064 A convenience method for parsing collation data.  Searches the graph for readings
2065 with the same text at the same rank, and merges any that are found.
2066
2067 =cut
2068
2069 sub flatten_ranks {
2070     my ( $self, %args ) = shift;
2071     my %unique_rank_rdg;
2072     my $changed;
2073     foreach my $p ( $self->identical_readings( %args ) ) {
2074                 # say STDERR "Combining readings at same rank: @$p";
2075                 $changed = 1;
2076                 $self->merge_readings( @$p );
2077                 # TODO see if this now makes a common point.
2078     }
2079     # If we merged readings, the ranks are still fine but the alignment
2080     # table is wrong. Wipe it.
2081     $self->wipe_table() if $changed;
2082 }
2083
2084 =head2 identical_readings
2085 =head2 identical_readings( start => $startnode, end => $endnode )
2086 =head2 identical_readings( startrank => $startrank, endrank => $endrank )
2087
2088 Goes through the graph identifying all pairs of readings that appear to be
2089 identical, and therefore able to be merged into a single reading. Returns the
2090 relevant identical pairs. Can be restricted to run over only a part of the
2091 graph, specified either by node or by rank.
2092
2093 =cut
2094
2095 sub identical_readings {
2096         my ( $self, %args ) = @_;
2097     # Find where we should start and end.
2098     my $startrank = $args{startrank} || 0;
2099     if( $args{start} ) {
2100         throw( "Starting reading has no rank" ) unless $self->reading( $args{start} )
2101                 && $self->reading( $args{start} )->has_rank;
2102         $startrank = $self->reading( $args{start} )->rank;
2103     }
2104     my $endrank = $args{endrank} || $self->end->rank;
2105     if( $args{end} ) {
2106         throw( "Ending reading has no rank" ) unless $self->reading( $args{end} )
2107                 && $self->reading( $args{end} )->has_rank;
2108         $endrank = $self->reading( $args{end} )->rank;
2109     }
2110
2111     # Make sure the ranks are correct.
2112     unless( $self->_graphcalc_done ) {
2113         $self->calculate_ranks;
2114     }
2115     # Go through the readings looking for duplicates.
2116     my %unique_rank_rdg;
2117     my @pairs;
2118     foreach my $rdg ( $self->readings ) {
2119         next unless $rdg->has_rank;
2120         my $rk = $rdg->rank;
2121         next if $rk > $endrank || $rk < $startrank;
2122         my $key = $rk . "||" . $rdg->text;
2123         if( exists $unique_rank_rdg{$key} ) {
2124                 # Make sure they don't have different grammatical forms
2125                         my $ur = $unique_rank_rdg{$key};
2126                 if( $rdg->is_identical( $ur ) ) {
2127                                 push( @pairs, [ $ur, $rdg ] );
2128                         }
2129         } else {
2130             $unique_rank_rdg{$key} = $rdg;
2131         }
2132     }
2133
2134     return @pairs;
2135 }
2136
2137
2138 =head2 calculate_common_readings
2139
2140 Goes through the graph identifying the readings that appear in every witness
2141 (apart from those with lacunae at that spot.) Marks them as common and returns
2142 the list.
2143
2144 =begin testing
2145
2146 use Text::Tradition;
2147
2148 my $cxfile = 't/data/Collatex-16.xml';
2149 my $t = Text::Tradition->new(
2150     'name'  => 'inline',
2151     'input' => 'CollateX',
2152     'file'  => $cxfile,
2153     );
2154 my $c = $t->collation;
2155
2156 my @common = $c->calculate_common_readings();
2157 is( scalar @common, 8, "Found correct number of common readings" );
2158 my @marked = sort $c->common_readings();
2159 is( scalar @common, 8, "All common readings got marked as such" );
2160 my @expected = qw/ n1 n11 n16 n19 n20 n5 n6 n7 /;
2161 is_deeply( \@marked, \@expected, "Found correct list of common readings" );
2162
2163 =end testing
2164
2165 =cut
2166
2167 sub calculate_common_readings {
2168         my $self = shift;
2169         my @common;
2170         map { $_->is_common( 0 ) } $self->readings;
2171         # Implicitly calls calculate_ranks
2172         my $table = $self->alignment_table;
2173         foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
2174                 my @row = map { $_->{'tokens'}->[$idx]
2175                                                         ? $_->{'tokens'}->[$idx]->{'t'} : '' }
2176                                         @{$table->{'alignment'}};
2177                 my %hash;
2178                 foreach my $r ( @row ) {
2179                         if( $r ) {
2180                                 $hash{$r->id} = $r unless $r->is_meta;
2181                         } else {
2182                                 $hash{'UNDEF'} = $r;
2183                         }
2184                 }
2185                 if( keys %hash == 1 && !exists $hash{'UNDEF'} ) {
2186                         my( $r ) = values %hash;
2187                         $r->is_common( 1 );
2188                         push( @common, $r );
2189                 }
2190         }
2191         return @common;
2192 }
2193
2194 =head2 text_from_paths
2195
2196 Calculate the text array for all witnesses from the path, for later consistency
2197 checking.  Only to be used if there is no non-graph-based way to know the
2198 original texts.
2199
2200 =cut
2201
2202 sub text_from_paths {
2203         my $self = shift;
2204     foreach my $wit ( $self->tradition->witnesses ) {
2205         my @readings = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
2206         my @text;
2207         foreach my $r ( @readings ) {
2208                 next if $r->is_meta;
2209                 push( @text, $r->text );
2210         }
2211         $wit->text( \@text );
2212         if( $wit->is_layered ) {
2213                         my @ucrdgs = $self->reading_sequence( $self->start, $self->end,
2214                                                                                                   $wit->sigil.$self->ac_label );
2215                         my @uctext;
2216                         foreach my $r ( @ucrdgs ) {
2217                                 next if $r->is_meta;
2218                                 push( @uctext, $r->text );
2219                         }
2220                         $wit->layertext( \@uctext );
2221         }
2222     }
2223 }
2224
2225 =head1 UTILITY FUNCTIONS
2226
2227 =head2 common_predecessor( $reading_a, $reading_b )
2228
2229 Find the last reading that occurs in sequence before both the given readings.
2230 At the very least this should be $self->start.
2231
2232 =head2 common_successor( $reading_a, $reading_b )
2233
2234 Find the first reading that occurs in sequence after both the given readings.
2235 At the very least this should be $self->end.
2236
2237 =begin testing
2238
2239 use Text::Tradition;
2240
2241 my $cxfile = 't/data/Collatex-16.xml';
2242 my $t = Text::Tradition->new(
2243     'name'  => 'inline',
2244     'input' => 'CollateX',
2245     'file'  => $cxfile,
2246     );
2247 my $c = $t->collation;
2248
2249 is( $c->common_predecessor( 'n24', 'n23' )->id,
2250     'n20', "Found correct common predecessor" );
2251 is( $c->common_successor( 'n24', 'n23' )->id,
2252     '__END__', "Found correct common successor" );
2253
2254 is( $c->common_predecessor( 'n19', 'n17' )->id,
2255     'n16', "Found correct common predecessor for readings on same path" );
2256 is( $c->common_successor( 'n21', 'n10' )->id,
2257     '__END__', "Found correct common successor for readings on same path" );
2258
2259 =end testing
2260
2261 =cut
2262
2263 ## Return the closest reading that is a predecessor of both the given readings.
2264 sub common_predecessor {
2265         my $self = shift;
2266         my( $r1, $r2 ) = $self->_objectify_args( @_ );
2267         return $self->_common_in_path( $r1, $r2, 'predecessors' );
2268 }
2269
2270 sub common_successor {
2271         my $self = shift;
2272         my( $r1, $r2 ) = $self->_objectify_args( @_ );
2273         return $self->_common_in_path( $r1, $r2, 'successors' );
2274 }
2275
2276
2277 # TODO think about how to do this without ranks...
2278 sub _common_in_path {
2279         my( $self, $r1, $r2, $dir ) = @_;
2280         my $iter = $self->end->rank;
2281         my @candidates;
2282         my @last_r1 = ( $r1 );
2283         my @last_r2 = ( $r2 );
2284         # my %all_seen = ( $r1 => 'r1', $r2 => 'r2' );
2285         my %all_seen;
2286         # say STDERR "Finding common $dir for $r1, $r2";
2287         while( !@candidates ) {
2288                 last unless $iter--;  # Avoid looping infinitely
2289                 # Iterate separately down the graph from r1 and r2
2290                 my( @new_lc1, @new_lc2 );
2291                 foreach my $lc ( @last_r1 ) {
2292                         foreach my $p ( $lc->$dir ) {
2293                                 if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r1' ) {
2294                                         # say STDERR "Path candidate $p from $lc";
2295                                         push( @candidates, $p );
2296                                 } elsif( !$all_seen{$p->id} ) {
2297                                         $all_seen{$p->id} = 'r1';
2298                                         push( @new_lc1, $p );
2299                                 }
2300                         }
2301                 }
2302                 foreach my $lc ( @last_r2 ) {
2303                         foreach my $p ( $lc->$dir ) {
2304                                 if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r2' ) {
2305                                         # say STDERR "Path candidate $p from $lc";
2306                                         push( @candidates, $p );
2307                                 } elsif( !$all_seen{$p->id} ) {
2308                                         $all_seen{$p->id} = 'r2';
2309                                         push( @new_lc2, $p );
2310                                 }
2311                         }
2312                 }
2313                 @last_r1 = @new_lc1;
2314                 @last_r2 = @new_lc2;
2315         }
2316         my @answer = sort { $a->rank <=> $b->rank } @candidates;
2317         return $dir eq 'predecessors' ? pop( @answer ) : shift ( @answer );
2318 }
2319
2320 sub throw {
2321         Text::Tradition::Error->throw(
2322                 'ident' => 'Collation error',
2323                 'message' => $_[0],
2324                 );
2325 }
2326
2327 no Moose;
2328 __PACKAGE__->meta->make_immutable;
2329
2330 =head1 BUGS/TODO
2331
2332 =over
2333
2334 =item * Rework XML serialization in a more modular way
2335
2336 =back
2337
2338 =head1 LICENSE
2339
2340 This package is free software and is provided "as is" without express
2341 or implied warranty.  You can redistribute it and/or modify it under
2342 the same terms as Perl itself.
2343
2344 =head1 AUTHOR
2345
2346 Tara L Andrews E<lt>aurum@cpan.orgE<gt>