base/lib/Text/Tradition/Collation.pm

   1 package Text::Tradition::Collation;
   2
   3 use feature 'say';
   4 use Encode qw( decode_utf8 );
   5 use File::Temp;
   6 use File::Which;
   7 use Graph;
   8 use IPC::Run qw( run binary );
   9 use Text::CSV;
  10 use Text::Tradition::Collation::Data;
  11 use Text::Tradition::Collation::Reading;
  12 use Text::Tradition::Collation::RelationshipStore;
  13 use Text::Tradition::Error;
  14 use XML::Easy::Syntax qw( $xml10_namestartchar_rx $xml10_namechar_rx );
  15 use XML::LibXML;
  16 use XML::LibXML::XPathContext;
  17 use Moose;
  18
  19 has _data => (
  20         isa      => 'Text::Tradition::Collation::Data',
  21         is       => 'ro',
  22         required => 1,
  23         handles  => [ qw(
  24                 sequence
  25                 paths
  26                 _set_relations
  27                 relations
  28                 _set_start
  29                 _set_end
  30                 ac_label
  31                 has_cached_table
  32                 relationships
  33                 related_readings
  34                 get_relationship
  35                 del_relationship
  36                 equivalence
  37                 equivalence_graph
  38                 readings
  39                 reading
  40                 _add_reading
  41                 del_reading
  42                 has_reading
  43                 wit_list_separator
  44                 baselabel
  45                 linear
  46                 wordsep
  47                 start
  48                 end
  49                 cached_table
  50                 _graphcalc_done
  51                 has_cached_svg
  52                 wipe_table
  53         )]
  54 );
  55
  56 has 'tradition' => (
  57     is => 'ro',
  58     isa => 'Text::Tradition',
  59     writer => '_set_tradition',
  60     weak_ref => 1,
  61     );
  62
  63 =head1 NAME
  64
  65 Text::Tradition::Collation - a software model for a text collation
  66
  67 =head1 SYNOPSIS
  68
  69   use Text::Tradition;
  70   my $t = Text::Tradition->new(
  71     'name' => 'this is a text',
  72     'input' => 'TEI',
  73     'file' => '/path/to/tei_parallel_seg_file.xml' );
  74
  75   my $c = $t->collation;
  76   my @readings = $c->readings;
  77   my @paths = $c->paths;
  78   my @relationships = $c->relationships;
  79
  80   my $svg_variant_graph = $t->collation->as_svg();
  81
  82 =head1 DESCRIPTION
  83
  84 Text::Tradition is a library for representation and analysis of collated
  85 texts, particularly medieval ones.  The Collation is the central feature of
  86 a Tradition, where the text, its sequence of readings, and its relationships
  87 between readings are actually kept.
  88
  89 =head1 CONSTRUCTOR
  90
  91 =head2 new
  92
  93 The constructor.  Takes a hash or hashref of the following arguments:
  94
  95 =over
  96
  97 =item * tradition - The Text::Tradition object to which the collation
  98 belongs. Required.
  99
 100 =item * linear - Whether the collation should be linear; that is, whether
 101 transposed readings should be treated as two linked readings rather than one,
 102 and therefore whether the collation graph is acyclic.  Defaults to true.
 103
 104 =item * baselabel - The default label for the path taken by a base text
 105 (if any). Defaults to 'base text'.
 106
 107 =item * wit_list_separator - The string to join a list of witnesses for
 108 purposes of making labels in display graphs.  Defaults to ', '.
 109
 110 =item * ac_label - The extra label to tack onto a witness sigil when
 111 representing another layer of path for the given witness - that is, when
 112 a text has more than one possible reading due to scribal corrections or
 113 the like.  Defaults to ' (a.c.)'.
 114
 115 =item * wordsep - The string used to separate words in the original text.
 116 Defaults to ' '.
 117
 118 =back
 119
 120 =head1 ACCESSORS
 121
 122 =head2 tradition
 123
 124 =head2 linear
 125
 126 =head2 wit_list_separator
 127
 128 =head2 baselabel
 129
 130 =head2 ac_label
 131
 132 =head2 wordsep
 133
 134 Simple accessors for collation attributes.
 135
 136 =head2 start
 137
 138 The meta-reading at the start of every witness path.
 139
 140 =head2 end
 141
 142 The meta-reading at the end of every witness path.
 143
 144 =head2 readings
 145
 146 Returns all Reading objects in the graph.
 147
 148 =head2 reading( $id )
 149
 150 Returns the Reading object corresponding to the given ID.
 151
 152 =head2 add_reading( $reading_args )
 153
 154 Adds a new reading object to the collation.
 155 See L<Text::Tradition::Collation::Reading> for the available arguments.
 156
 157 =head2 del_reading( $object_or_id )
 158
 159 Removes the given reading from the collation, implicitly removing its
 160 paths and relationships.
 161
 162 =head2 has_reading( $id )
 163
 164 Predicate to see whether a given reading ID is in the graph.
 165
 166 =head2 reading_witnesses( $object_or_id )
 167
 168 Returns a list of sigils whose witnesses contain the reading.
 169
 170 =head2 paths
 171
 172 Returns all reading paths within the document - that is, all edges in the
 173 collation graph.  Each path is an arrayref of [ $source, $target ] reading IDs.
 174
 175 =head2 add_path( $source, $target, $sigil )
 176
 177 Links the given readings in the collation in sequence, under the given witness
 178 sigil.  The readings may be specified by object or ID.
 179
 180 =head2 del_path( $source, $target, $sigil )
 181
 182 Links the given readings in the collation in sequence, under the given witness
 183 sigil.  The readings may be specified by object or ID.
 184
 185 =head2 has_path( $source, $target );
 186
 187 Returns true if the two readings are linked in sequence in any witness.
 188 The readings may be specified by object or ID.
 189
 190 =head2 relationships
 191
 192 Returns all Relationship objects in the collation.
 193
 194 =head2 add_relationship( $reading, $other_reading, $options )
 195
 196 Adds a new relationship of the type given in $options between the two readings,
 197 which may be specified by object or ID.  Returns a value of ( $status, @vectors)
 198 where $status is true on success, and @vectors is a list of relationship edges
 199 that were ultimately added.
 200 See L<Text::Tradition::Collation::Relationship> for the available options.
 201
 202 =cut
 203
 204 sub BUILDARGS {
 205         my ( $class, @args ) = @_;
 206         my %args = @args == 1 ? %{ $args[0] } : @args;
 207         # TODO determine these from the Moose::Meta object
 208         my @delegate_attrs = qw(sequence relations readings wit_list_separator baselabel
 209                 linear wordsep start end cached_table _graphcalc_done);
 210         my %data_args;
 211         for my $attr (@delegate_attrs) {
 212                 $data_args{$attr} = delete $args{$attr} if exists $args{$attr};
 213         }
 214         $args{_data} = Text::Tradition::Collation::Data->new(%data_args);
 215         return \%args;
 216 }
 217
 218 sub BUILD {
 219     my $self = shift;
 220     $self->_set_relations( Text::Tradition::Collation::RelationshipStore->new( 'collation' => $self ) );
 221     $self->_set_start( $self->add_reading(
 222         { 'collation' => $self, 'is_start' => 1, 'init' => 1 } ) );
 223     $self->_set_end( $self->add_reading(
 224         { 'collation' => $self, 'is_end' => 1, 'init' => 1 } ) );
 225 }
 226
 227 sub register_relationship_type {
 228         my $self = shift;
 229         my %args = @_ == 1 ? %{$_[0]} : @_;
 230         if( $self->relations->has_type( $args{name} ) ) {
 231                 throw( 'Relationship type ' . $args{name} . ' already registered' );
 232         }
 233         $self->relations->add_type( %args );
 234 }
 235
 236 ### Reading construct/destruct functions
 237
 238 sub add_reading {
 239         my( $self, $reading ) = @_;
 240         unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
 241                 my %args = %$reading;
 242                 if( $args{'init'} ) {
 243                         # If we are initializing an empty collation, don't assume that we
 244                         # have set a tradition.
 245                         delete $args{'init'};
 246                 } elsif( $self->tradition->can('language') && $self->tradition->has_language
 247                         && !exists $args{'language'} ) {
 248                         $args{'language'} = $self->tradition->language;
 249                 }
 250                 $reading = Text::Tradition::Collation::Reading->new(
 251                         'collation' => $self,
 252                         %args );
 253         }
 254         # First check to see if a reading with this ID exists.
 255         if( $self->reading( $reading->id ) ) {
 256                 throw( "Collation already has a reading with id " . $reading->id );
 257         }
 258         $self->_graphcalc_done(0);
 259         $self->_add_reading( $reading->id => $reading );
 260         # Once the reading has been added, put it in both graphs.
 261         $self->sequence->add_vertex( $reading->id );
 262         $self->relations->add_reading( $reading->id );
 263         return $reading;
 264 };
 265
 266 around del_reading => sub {
 267         my $orig = shift;
 268         my $self = shift;
 269         my $arg = shift;
 270
 271         if( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
 272                 $arg = $arg->id;
 273         }
 274         # Remove the reading from the graphs.
 275         $self->_graphcalc_done(0);
 276         $self->_clear_cache; # Explicitly clear caches to GC the reading
 277         $self->sequence->delete_vertex( $arg );
 278         $self->relations->delete_reading( $arg );
 279
 280         # Carry on.
 281         $self->$orig( $arg );
 282 };
 283
 284 =head2 merge_readings( $main, $second, $concatenate, $with_str )
 285
 286 Merges the $second reading into the $main one. If $concatenate is true, then
 287 the merged node will carry the text of both readings, concatenated with either
 288 $with_str (if specified) or a sensible default (the empty string if the
 289 appropriate 'join_*' flag is set on either reading, or else $self->wordsep.)
 290
 291 The first two arguments may be either readings or reading IDs.
 292
 293 =begin testing
 294
 295 use Text::Tradition;
 296
 297 my $cxfile = 't/data/Collatex-16.xml';
 298 my $t = Text::Tradition->new(
 299     'name'  => 'inline',
 300     'input' => 'CollateX',
 301     'file'  => $cxfile,
 302     );
 303 my $c = $t->collation;
 304
 305 my $rno = scalar $c->readings;
 306 # Split n21 ('unto') for testing purposes
 307 my $new_r = $c->add_reading( { 'id' => 'n21p0', 'text' => 'un', 'join_next' => 1 } );
 308 my $old_r = $c->reading( 'n21' );
 309 $old_r->alter_text( 'to' );
 310 $c->del_path( 'n20', 'n21', 'A' );
 311 $c->add_path( 'n20', 'n21p0', 'A' );
 312 $c->add_path( 'n21p0', 'n21', 'A' );
 313 $c->add_relationship( 'n21', 'n22', { type => 'collated', scope => 'local' } );
 314 $c->flatten_ranks();
 315 ok( $c->reading( 'n21p0' ), "New reading exists" );
 316 is( scalar $c->readings, $rno, "Reading add offset by flatten_ranks" );
 317
 318 # Combine n3 and n4 ( with his )
 319 $c->merge_readings( 'n3', 'n4', 1 );
 320 ok( !$c->reading('n4'), "Reading n4 is gone" );
 321 is( $c->reading('n3')->text, 'with his', "Reading n3 has both words" );
 322
 323 # Collapse n9 and n10 ( rood / root )
 324 $c->merge_readings( 'n9', 'n10' );
 325 ok( !$c->reading('n10'), "Reading n10 is gone" );
 326 is( $c->reading('n9')->text, 'rood', "Reading n9 has an unchanged word" );
 327
 328 # Combine n21 and n21p0
 329 my $remaining = $c->reading('n21');
 330 $remaining ||= $c->reading('n22');  # one of these should still exist
 331 $c->merge_readings( 'n21p0', $remaining, 1 );
 332 ok( !$c->reading('n21'), "Reading $remaining is gone" );
 333 is( $c->reading('n21p0')->text, 'unto', "Reading n21p0 merged correctly" );
 334
 335 =end testing
 336
 337 =cut
 338
 339 sub merge_readings {
 340         my $self = shift;
 341
 342         # Sanity check
 343         my( $kept_obj, $del_obj, $combine, $combine_char ) = $self->_objectify_args( @_ );
 344         my $mergemeta = $kept_obj->is_meta;
 345         throw( "Cannot merge meta and non-meta reading" )
 346                 unless ( $mergemeta && $del_obj->is_meta )
 347                         || ( !$mergemeta && !$del_obj->is_meta );
 348         if( $mergemeta ) {
 349                 throw( "Cannot merge with start or end node" )
 350                         if( $kept_obj eq $self->start || $kept_obj eq $self->end
 351                                 || $del_obj eq $self->start || $del_obj eq $self->end );
 352                 throw( "Cannot combine text of meta readings" ) if $combine;
 353         }
 354         # We only need the IDs for adding paths to the graph, not the reading
 355         # objects themselves.
 356         my $kept = $kept_obj->id;
 357         my $deleted = $del_obj->id;
 358         $self->_graphcalc_done(0);
 359
 360     # The kept reading should inherit the paths and the relationships
 361     # of the deleted reading.
 362         foreach my $path ( $self->sequence->edges_at( $deleted ) ) {
 363                 my @vector = ( $kept );
 364                 push( @vector, $path->[1] ) if $path->[0] eq $deleted;
 365                 unshift( @vector, $path->[0] ) if $path->[1] eq $deleted;
 366                 next if $vector[0] eq $vector[1]; # Don't add a self loop
 367                 my %wits = %{$self->sequence->get_edge_attributes( @$path )};
 368                 $self->sequence->add_edge( @vector );
 369                 my $fwits = $self->sequence->get_edge_attributes( @vector );
 370                 @wits{keys %$fwits} = values %$fwits;
 371                 $self->sequence->set_edge_attributes( @vector, \%wits );
 372         }
 373         $self->relations->merge_readings( $kept, $deleted, $combine );
 374
 375         # Do the deletion deed.
 376         if( $combine ) {
 377                 # Combine the text of the readings
 378                 my $joinstr = $combine_char;
 379                 unless( defined $joinstr ) {
 380                         $joinstr = '' if $kept_obj->join_next || $del_obj->join_prior;
 381                         $joinstr = $self->wordsep unless defined $joinstr;
 382                 }
 383                 $kept_obj->_combine( $del_obj, $joinstr );
 384         }
 385         $self->del_reading( $deleted );
 386 }
 387
 388 =head2 merge_related( @relationship_types )
 389
 390 Merge all readings linked with the relationship types given. If any of the selected type(s) is not a colocation, the graph will no longer be linear. The majority/plurality reading in each case will be the one kept.
 391
 392 WARNING: This operation cannot be undone.
 393
 394 =cut
 395
 396 =begin testing
 397
 398 use Text::Tradition;
 399 use TryCatch;
 400
 401 my $t = Text::Tradition->new(
 402     'name'  => 'inline',
 403     'input' => 'Self',
 404     'file'  => 't/data/legendfrag.xml',
 405     );
 406 my $c = $t->collation;
 407
 408 my %rdg_ids;
 409 map { $rdg_ids{$_} = 1 } $c->readings;
 410 $c->merge_related( 'orthographic' );
 411 is( scalar( $c->readings ), keys( %rdg_ids ) - 8,
 412         "Successfully collapsed orthographic variation" );
 413 map { $rdg_ids{$_} = undef } qw/ r13.3 r11.4 r8.5 r8.2 r7.7 r7.5 r7.4 r7.1 /;
 414 foreach my $rid ( keys %rdg_ids ) {
 415         my $exp = $rdg_ids{$rid};
 416         is( !$c->reading( $rid ), !$exp, "Reading $rid correctly " .
 417                 ( $exp ? "retained" : "removed" ) );
 418 }
 419 ok( $c->linear, "Graph is still linear" );
 420 try {
 421         $c->calculate_ranks; # This should succeed
 422         ok( 1, "Can still calculate ranks on the new graph" );
 423 } catch {
 424         ok( 0, "Rank calculation on merged graph failed: $@" );
 425 }
 426
 427 # Now add some transpositions
 428 $c->add_relationship( 'r8.4', 'r10.4', { type => 'transposition' } );
 429 $c->merge_related( 'transposition' );
 430 is( scalar( $c->readings ), keys( %rdg_ids ) - 9,
 431         "Transposed relationship is merged away" );
 432 ok( !$c->reading('r8.4'), "Correct transposed reading removed" );
 433 ok( !$c->linear, "Graph is no longer linear" );
 434 try {
 435         $c->calculate_ranks; # This should fail
 436         ok( 0, "Rank calculation happened on nonlinear graph?!" );
 437 } catch ( Text::Tradition::Error $e ) {
 438         is( $e->message, 'Cannot calculate ranks on a non-linear graph',
 439                 "Rank calculation on merged graph threw an error" );
 440 }
 441
 442
 443
 444 =end testing
 445
 446 =cut
 447
 448 # TODO: there should be a way to display merged without affecting the underlying data!
 449
 450 sub merge_related {
 451         my $self = shift;
 452         my %reltypehash;
 453         map { $reltypehash{$_} = 1 } @_;
 454
 455         # Set up the filter for finding related readings
 456         my $filter = sub {
 457                 exists $reltypehash{$_[0]->type};
 458         };
 459
 460         my $linear = 1;
 461         # Go through all readings looking for related ones
 462         foreach my $r ( $self->readings ) {
 463                 next unless $self->reading( "$r" ); # might have been deleted meanwhile
 464                 my @related = $self->related_readings( $r, $filter );
 465                 if( @related ) {
 466                         push( @related, $r );
 467                         @related = sort {
 468                                         scalar $b->witnesses <=> scalar $a->witnesses
 469                                 } @related;
 470                         my $keep = shift @related;
 471                         foreach my $delr ( @related ) {
 472                                 $linear = undef
 473                                         unless( $self->get_relationship( $keep, $delr )->colocated );
 474                                 $self->merge_readings( $keep, $delr );
 475                         }
 476                 }
 477         }
 478         $self->linear( $linear );
 479 }
 480
 481 =head2 compress_readings
 482
 483 Where possible in the graph, compresses plain sequences of readings into a
 484 single reading. The sequences must consist of readings with no
 485 relationships to other readings, with only a single witness path between
 486 them and no other witness paths from either that would skip the other. The
 487 readings must also not be marked as nonsense or bad grammar.
 488
 489 WARNING: This operation cannot be undone.
 490
 491 =cut
 492
 493 sub compress_readings {
 494         my $self = shift;
 495         # Anywhere in the graph that there is a reading that joins only to a single
 496         # successor, and neither of these have any relationships, just join the two
 497         # readings.
 498         foreach my $rdg ( sort { $a->rank <=> $b->rank } $self->readings ) {
 499                 # Now look for readings that can be joined to their successors.
 500                 next unless $rdg->is_combinable;
 501                 my %seen;
 502                 while( $self->sequence->successors( $rdg ) == 1 ) {
 503                         my( $next ) = $self->reading( $self->sequence->successors( $rdg ) );
 504                         throw( "Infinite loop" ) if $seen{$next->id};
 505                         $seen{$next->id} = 1;
 506                         last if $self->sequence->predecessors( $next ) > 1;
 507                         last unless $next->is_combinable;
 508                         say "Joining readings $rdg and $next";
 509                         $self->merge_readings( $rdg, $next, 1 );
 510                 }
 511         }
 512         # Make sure we haven't screwed anything up
 513         foreach my $wit ( $self->tradition->witnesses ) {
 514                 my $pathtext = $self->path_text( $wit->sigil );
 515                 my $origtext = join( ' ', @{$wit->text} );
 516                 throw( "Text differs for witness " . $wit->sigil )
 517                         unless $pathtext eq $origtext;
 518                 if( $wit->is_layered ) {
 519                         $pathtext = $self->path_text( $wit->sigil.$self->ac_label );
 520                         $origtext = join( ' ', @{$wit->layertext} );
 521                         throw( "Ante-corr text differs for witness " . $wit->sigil )
 522                                 unless $pathtext eq $origtext;
 523                 }
 524         }
 525
 526         $self->relations->rebuild_equivalence();
 527         $self->calculate_ranks();
 528 }
 529
 530 # Helper function for manipulating the graph.
 531 sub _stringify_args {
 532         my( $self, $first, $second, @args ) = @_;
 533     $first = $first->id
 534         if ref( $first ) eq 'Text::Tradition::Collation::Reading';
 535     $second = $second->id
 536         if ref( $second ) eq 'Text::Tradition::Collation::Reading';
 537     return( $first, $second, @args );
 538 }
 539
 540 # Helper function for manipulating the graph.
 541 sub _objectify_args {
 542         my( $self, $first, $second, $arg ) = @_;
 543     $first = $self->reading( $first )
 544         unless ref( $first ) eq 'Text::Tradition::Collation::Reading';
 545     $second = $self->reading( $second )
 546         unless ref( $second ) eq 'Text::Tradition::Collation::Reading';
 547     return( $first, $second, $arg );
 548 }
 549
 550 =head2 duplicate_reading( $reading, @witlist )
 551
 552 Split the given reading into two, so that the new reading is in the path for
 553 the witnesses given in @witlist. If the result is that certain non-colocated
 554 relationships (e.g. transpositions) are no longer valid, these will be removed.
 555 Returns the newly-created reading.
 556
 557 =begin testing
 558
 559 use Test::More::UTF8;
 560 use Text::Tradition;
 561 use TryCatch;
 562
 563 my $st = Text::Tradition->new( 'input' => 'Self', 'file' => 't/data/collatecorr.xml' );
 564 is( ref( $st ), 'Text::Tradition', "Got a tradition from test file" );
 565 ok( $st->has_witness('Ba96'), "Tradition has the affected witness" );
 566
 567 my $sc = $st->collation;
 568 my $numr = 17;
 569 ok( $sc->reading('n131'), "Tradition has the affected reading" );
 570 is( scalar( $sc->readings ), $numr, "There are $numr readings in the graph" );
 571 is( $sc->end->rank, 14, "There are fourteen ranks in the graph" );
 572
 573 # Detach the erroneously collated reading
 574 my( $newr, @del_rdgs ) = $sc->duplicate_reading( 'n131', 'Ba96' );
 575 ok( $newr, "New reading was created" );
 576 ok( $sc->reading('n131_0'), "Detached the bad collation with a new reading" );
 577 is( scalar( $sc->readings ), $numr + 1, "A reading was added to the graph" );
 578 is( $sc->end->rank, 10, "There are now only ten ranks in the graph" );
 579 my $csucc = $sc->common_successor( 'n131', 'n131_0' );
 580 is( $csucc->id, 'n136', "Found correct common successor to duped reading" );
 581
 582 # Check that the bad transposition is gone
 583 is( scalar @del_rdgs, 1, "Deleted reading was returned by API call" );
 584 is( $sc->get_relationship( 'n130', 'n135' ), undef, "Bad transposition relationship is gone" );
 585
 586 # The collation should not be fixed
 587 my @pairs = $sc->identical_readings();
 588 is( scalar @pairs, 0, "Not re-collated yet" );
 589 # Fix the collation
 590 ok( $sc->merge_readings( 'n124', 'n131_0' ), "Collated the readings correctly" );
 591 @pairs = $sc->identical_readings( start => 'n124', end => $csucc->id );
 592 is( scalar @pairs, 3, "Found three more identical readings" );
 593 is( $sc->end->rank, 11, "The ranks shifted appropriately" );
 594 $sc->flatten_ranks();
 595 is( scalar( $sc->readings ), $numr - 3, "Now we are collated correctly" );
 596
 597 # Check that we can't "duplicate" a reading with no wits or with all wits
 598 try {
 599         my( $badr, @del_rdgs ) = $sc->duplicate_reading( 'n124' );
 600         ok( 0, "Reading duplication without witnesses throws an error" );
 601 } catch( Text::Tradition::Error $e ) {
 602         like( $e->message, qr/Must specify one or more witnesses/,
 603                 "Reading duplication without witnesses throws the expected error" );
 604 } catch {
 605         ok( 0, "Reading duplication without witnesses threw the wrong error" );
 606 }
 607
 608 try {
 609         my( $badr, @del_rdgs ) = $sc->duplicate_reading( 'n124', 'Ba96', 'Mü11475' );
 610         ok( 0, "Reading duplication with all witnesses throws an error" );
 611 } catch( Text::Tradition::Error $e ) {
 612         like( $e->message, qr/Cannot join all witnesses/,
 613                 "Reading duplication with all witnesses throws the expected error" );
 614 } catch {
 615         ok( 0, "Reading duplication with all witnesses threw the wrong error" );
 616 }
 617
 618 =end testing
 619
 620 =cut
 621
 622 sub duplicate_reading {
 623         my( $self, $r, @wits ) = @_;
 624         # Check that we are not doing anything unwise.
 625         throw( "Must specify one or more witnesses for the duplicated reading" )
 626                 unless @wits;
 627         unless( ref( $r ) eq 'Text::Tradition::Collation::Reading' ) {
 628                 $r = $self->reading( $r );
 629         }
 630         throw( "Cannot duplicate a meta-reading" )
 631                 if $r->is_meta;
 632         my $ordered_req_wits = join( ',', sort @wits );
 633         my $ordered_rdg_wits = join( ',', $r->witnesses );
 634         throw( "Cannot join all witnesses to the new reading" )
 635                 if $ordered_req_wits eq $ordered_rdg_wits;
 636
 637         # Get all the reading attributes and duplicate them.
 638         my $rmeta = Text::Tradition::Collation::Reading->meta;
 639         my %args;
 640     foreach my $attr( $rmeta->get_all_attributes ) {
 641                 next if $attr->name =~ /^_/;
 642                 my $acc = $attr->get_read_method;
 643                 if( !$acc && $attr->has_applied_traits ) {
 644                         my $tr = $attr->applied_traits;
 645                         if( $tr->[0] =~ /::(Array|Hash)$/ ) {
 646                                 my $which = $1;
 647                                 my %methods = reverse %{$attr->handles};
 648                                 $acc = $methods{elements};
 649                                 $args{$attr->name} = $which eq 'Array'
 650                                         ? [ $r->$acc ] : { $r->$acc };
 651                         }
 652                 } else {
 653                         $args{$attr->name} = $r->$acc if $acc;
 654                 }
 655         }
 656         # By definition the new reading will no longer be common.
 657         $args{is_common} = 0;
 658         # The new reading also needs its own ID.
 659         $args{id} = $self->_generate_dup_id( $r->id );
 660
 661         # Try to make the new reading.
 662         my $newr = $self->add_reading( \%args );
 663         # The old reading is also no longer common.
 664         $r->is_common( 0 );
 665
 666         # For each of the witnesses, dissociate from the old reading and
 667         # associate with the new.
 668         foreach my $wit ( @wits ) {
 669                 my $prior = $self->prior_reading( $r, $wit );
 670                 my $next = $self->next_reading( $r, $wit );
 671                 $self->del_path( $prior, $r, $wit );
 672                 $self->add_path( $prior, $newr, $wit );
 673                 $self->del_path( $r, $next, $wit );
 674                 $self->add_path( $newr, $next, $wit );
 675         }
 676
 677         # If the graph is ranked, we need to look for relationships that are now
 678         # invalid (i.e. 'non-colocation' types that might now be colocated) and
 679         # remove them. If not, we can skip it.
 680         my $succ;
 681         my %rrk;
 682         my @deleted_relations;
 683         if( $self->end->has_rank ) {
 684                 # Find the point where we can stop checking
 685                 $succ = $self->common_successor( $r, $newr );
 686
 687                 # Hash the existing ranks
 688                 foreach my $rdg ( $self->readings ) {
 689                         $rrk{$rdg->id} = $rdg->rank;
 690                 }
 691                 # Calculate the new ranks
 692                 $self->calculate_ranks();
 693
 694                 # Check for invalid non-colocated relationships among changed-rank readings
 695                 # from where the ranks start changing up to $succ
 696                 my $lastrank = $succ->rank;
 697                 foreach my $rdg ( $self->readings ) {
 698                         next if $rdg->rank > $lastrank;
 699                         next if $rdg->rank == $rrk{$rdg->id};
 700                         my @noncolo = $rdg->related_readings( sub { !$_[0]->colocated } );
 701                         next unless @noncolo;
 702                         foreach my $nc ( @noncolo ) {
 703                                 unless( $self->relations->verify_or_delete( $rdg, $nc ) ) {
 704                                         push( @deleted_relations, [ $rdg->id, $nc->id ] );
 705                                 }
 706                         }
 707                 }
 708         }
 709         return ( $newr, @deleted_relations );
 710 }
 711
 712 sub _generate_dup_id {
 713         my( $self, $rid ) = @_;
 714         my $newid;
 715         my $i = 0;
 716         while( !$newid ) {
 717                 $newid = $rid."_$i";
 718                 if( $self->has_reading( $newid ) ) {
 719                         $newid = '';
 720                         $i++;
 721                 }
 722         }
 723         return $newid;
 724 }
 725
 726 ### Path logic
 727
 728 sub add_path {
 729         my $self = shift;
 730
 731         # We only need the IDs for adding paths to the graph, not the reading
 732         # objects themselves.
 733     my( $source, $target, $wit ) = $self->_stringify_args( @_ );
 734
 735         $self->_graphcalc_done(0);
 736         # Connect the readings
 737         unless( $self->sequence->has_edge( $source, $target ) ) {
 738             $self->sequence->add_edge( $source, $target );
 739             $self->relations->add_equivalence_edge( $source, $target );
 740         }
 741     # Note the witness in question
 742     $self->sequence->set_edge_attribute( $source, $target, $wit, 1 );
 743 }
 744
 745 sub del_path {
 746         my $self = shift;
 747         my @args;
 748         if( ref( $_[0] ) eq 'ARRAY' ) {
 749                 my $e = shift @_;
 750                 @args = ( @$e, @_ );
 751         } else {
 752                 @args = @_;
 753         }
 754
 755         # We only need the IDs for removing paths from the graph, not the reading
 756         # objects themselves.
 757     my( $source, $target, $wit ) = $self->_stringify_args( @args );
 758
 759         $self->_graphcalc_done(0);
 760         if( $self->sequence->has_edge_attribute( $source, $target, $wit ) ) {
 761                 $self->sequence->delete_edge_attribute( $source, $target, $wit );
 762         }
 763         unless( $self->sequence->has_edge_attributes( $source, $target ) ) {
 764                 $self->sequence->delete_edge( $source, $target );
 765                 $self->relations->delete_equivalence_edge( $source, $target );
 766         }
 767 }
 768
 769
 770 # Extra graph-alike utility
 771 sub has_path {
 772         my $self = shift;
 773     my( $source, $target, $wit ) = $self->_stringify_args( @_ );
 774         return undef unless $self->sequence->has_edge( $source, $target );
 775         return $self->sequence->has_edge_attribute( $source, $target, $wit );
 776 }
 777
 778 =head2 clear_witness( @sigil_list )
 779
 780 Clear the given witnesses out of the collation entirely, removing references
 781 to them in paths, and removing readings that belong only to them.  Should only
 782 be called via $tradition->del_witness.
 783
 784 =cut
 785
 786 sub clear_witness {
 787         my( $self, @sigils ) = @_;
 788
 789         $self->_graphcalc_done(0);
 790         # Clear the witness(es) out of the paths
 791         foreach my $e ( $self->paths ) {
 792                 foreach my $sig ( @sigils ) {
 793                         $self->del_path( $e, $sig );
 794                 }
 795         }
 796
 797         # Clear out the newly unused readings
 798         foreach my $r ( $self->readings ) {
 799                 unless( $self->reading_witnesses( $r ) ) {
 800                         $self->del_reading( $r );
 801                 }
 802         }
 803 }
 804
 805 sub add_relationship {
 806         my $self = shift;
 807     my( $source, $target, $opts ) = $self->_stringify_args( @_ );
 808     my( @vectors ) = $self->relations->add_relationship( $source, $target, $opts );
 809     foreach my $v ( @vectors ) {
 810         next unless $self->get_relationship( $v )->colocated;
 811         if( $self->reading( $v->[0] )->has_rank && $self->reading( $v->[1] )->has_rank
 812                 && $self->reading( $v->[0] )->rank ne $self->reading( $v->[1] )->rank ) {
 813                         $self->_graphcalc_done(0);
 814                         $self->_clear_cache;
 815                         last;
 816         }
 817     }
 818     return @vectors;
 819 }
 820
 821 around qw/ get_relationship del_relationship / => sub {
 822         my $orig = shift;
 823         my $self = shift;
 824         my @args = @_;
 825         if( @args == 1 && ref( $args[0] ) eq 'ARRAY' ) {
 826                 @args = @{$_[0]};
 827         }
 828         my @stringargs = $self->_stringify_args( @args );
 829         $self->$orig( @stringargs );
 830 };
 831
 832 =head2 reading_witnesses( $reading )
 833
 834 Return a list of sigils corresponding to the witnesses in which the reading appears.
 835
 836 =cut
 837
 838 sub reading_witnesses {
 839         my( $self, $reading ) = @_;
 840         # We need only check either the incoming or the outgoing edges; I have
 841         # arbitrarily chosen "incoming".  Thus, special-case the start node.
 842         if( $reading eq $self->start ) {
 843                 return map { $_->sigil } grep { $_->is_collated } $self->tradition->witnesses;
 844         }
 845         my %all_witnesses;
 846         foreach my $e ( $self->sequence->edges_to( $reading ) ) {
 847                 my $wits = $self->sequence->get_edge_attributes( @$e );
 848                 @all_witnesses{ keys %$wits } = 1;
 849         }
 850         my $acstr = $self->ac_label;
 851         foreach my $acwit ( grep { $_ =~ s/^(.*)\Q$acstr\E$/$1/ } keys %all_witnesses ) {
 852                 delete $all_witnesses{$acwit.$acstr} if exists $all_witnesses{$acwit};
 853         }
 854         return keys %all_witnesses;
 855 }
 856
 857 =head1 OUTPUT METHODS
 858
 859 =head2 as_svg( \%options )
 860
 861 Returns an SVG string that represents the graph, via as_dot and graphviz.
 862 See as_dot for a list of options.  Must have GraphViz (dot) installed to run.
 863
 864 =cut
 865
 866 sub as_svg {
 867     my( $self, $opts ) = @_;
 868     throw( "Need GraphViz installed to output SVG" )
 869         unless File::Which::which( 'dot' );
 870     my $want_subgraph = exists $opts->{'from'} || exists $opts->{'to'};
 871     $self->calculate_ranks()
 872         unless( $self->_graphcalc_done || $opts->{'nocalc'} || !$self->linear );
 873         my @cmd = qw/dot -Tsvg/;
 874         my( $svg, $err );
 875         my $dotfile = File::Temp->new();
 876         ## USE FOR DEBUGGING
 877         # $dotfile->unlink_on_destroy(0);
 878         binmode $dotfile, ':utf8';
 879         print $dotfile $self->as_dot( $opts );
 880         push( @cmd, $dotfile->filename );
 881         run( \@cmd, ">", binary(), \$svg );
 882         $svg = decode_utf8( $svg );
 883         return $svg;
 884 }
 885
 886
 887 =head2 as_dot( \%options )
 888
 889 Returns a string that is the collation graph expressed in dot
 890 (i.e. GraphViz) format.  Options include:
 891
 892 =over 4
 893
 894 =item * from
 895
 896 =item * to
 897
 898 =item * color_common
 899
 900 =back
 901
 902 =cut
 903
 904 sub as_dot {
 905     my( $self, $opts ) = @_;
 906     my $startrank = $opts->{'from'} if $opts;
 907     my $endrank = $opts->{'to'} if $opts;
 908     my $color_common = $opts->{'color_common'} if $opts;
 909     my $STRAIGHTENHACK = !$startrank && !$endrank && $self->end->rank
 910        && $self->end->rank > 100;
 911     $STRAIGHTENHACK = 1 if $opts->{'straight'}; # even for subgraphs or small graphs
 912
 913     # Check the arguments
 914     if( $startrank ) {
 915         return if $endrank && $startrank > $endrank;
 916         return if $startrank > $self->end->rank;
 917         }
 918         if( defined $endrank ) {
 919                 return if $endrank < 0;
 920                 $endrank = undef if $endrank == $self->end->rank;
 921         }
 922
 923     my $graph_name = $self->tradition->name;
 924     $graph_name =~ s/[^\w\s]//g;
 925     $graph_name = join( '_', split( /\s+/, $graph_name ) );
 926
 927     my %graph_attrs = (
 928         'rankdir' => 'LR',
 929         'bgcolor' => 'none',
 930         );
 931     my %node_attrs = (
 932         'fontsize' => 14,
 933         'fillcolor' => 'white',
 934         'style' => 'filled',
 935         'shape' => 'ellipse'
 936         );
 937     my %edge_attrs = (
 938         'arrowhead' => 'open',
 939         'color' => '#000000',
 940         'fontcolor' => '#000000',
 941         );
 942
 943     my $dot = sprintf( "digraph %s {\n", $graph_name );
 944     $dot .= "\tgraph " . _dot_attr_string( \%graph_attrs ) . ";\n";
 945     $dot .= "\tnode " . _dot_attr_string( \%node_attrs ) . ";\n";
 946
 947         # Output substitute start/end readings if necessary
 948         if( $startrank ) {
 949                 $dot .= "\t\"__SUBSTART__\" [ label=\"...\",id=\"__START__\" ];\n";
 950         }
 951         if( $endrank ) {
 952                 $dot .= "\t\"__SUBEND__\" [ label=\"...\",id=\"__END__\" ];\n";
 953         }
 954         if( $STRAIGHTENHACK ) {
 955                 ## HACK part 1
 956                 my $startlabel = $startrank ? '__SUBSTART__' : '__START__';
 957                 $dot .= "\tsubgraph { rank=same \"$startlabel\" \"#SILENT#\" }\n";
 958                 $dot .= "\t\"#SILENT#\" [ shape=diamond,color=white,penwidth=0,label=\"\" ];"
 959         }
 960         my %used;  # Keep track of the readings that actually appear in the graph
 961         # Sort the readings by rank if we have ranks; this speeds layout.
 962         my @all_readings = $self->end->has_rank
 963                 ? sort { $a->rank <=> $b->rank } $self->readings
 964                 : $self->readings;
 965         # TODO Refrain from outputting lacuna nodes - just grey out the edges.
 966     foreach my $reading ( @all_readings ) {
 967         # Only output readings within our rank range.
 968         next if $startrank && $reading->rank < $startrank;
 969         next if $endrank && $reading->rank > $endrank;
 970         $used{$reading->id} = 1;
 971         # Need not output nodes without separate labels
 972         next if $reading->id eq $reading->text;
 973         my $rattrs;
 974         my $label = $reading->text;
 975         $label .= '-' if $reading->join_next;
 976         $label = "-$label" if $reading->join_prior;
 977         $label =~ s/\"/\\\"/g;
 978                 $rattrs->{'label'} = $label;
 979                 $rattrs->{'id'} = $reading->id;
 980                 $rattrs->{'fillcolor'} = '#b3f36d' if $reading->is_common && $color_common;
 981         $dot .= sprintf( "\t\"%s\" %s;\n", $reading->id, _dot_attr_string( $rattrs ) );
 982     }
 983
 984         # Add the real edges. Need to weight one edge per rank jump, in a
 985         # continuous line.
 986         # my $weighted = $self->_add_edge_weights;
 987     my @edges = $self->paths;
 988         my( %substart, %subend );
 989     foreach my $edge ( @edges ) {
 990         # Do we need to output this edge?
 991         if( $used{$edge->[0]} && $used{$edge->[1]} ) {
 992                 my $label = $self->_path_display_label( $opts,
 993                         $self->path_witnesses( $edge ) );
 994                         my $variables = { %edge_attrs, 'label' => $label };
 995
 996                         # Account for the rank gap if necessary
 997                         my $rank0 = $self->reading( $edge->[0] )->rank
 998                                 if $self->reading( $edge->[0] )->has_rank;
 999                         my $rank1 = $self->reading( $edge->[1] )->rank
1000                                 if $self->reading( $edge->[1] )->has_rank;
1001                         if( defined $rank0 && defined $rank1 && $rank1 - $rank0 > 1 ) {
1002                                 $variables->{'minlen'} = $rank1 - $rank0;
1003                         }
1004
1005                         # Add the calculated edge weights
1006                         # if( exists $weighted->{$edge->[0]}
1007                         #       && $weighted->{$edge->[0]} eq $edge->[1] ) {
1008                         #       # $variables->{'color'} = 'red';
1009                         #       $variables->{'weight'} = 3.0;
1010                         # }
1011
1012                         # EXPERIMENTAL: make edge width reflect no. of witnesses
1013                         my $extrawidth = scalar( $self->path_witnesses( $edge ) ) * 0.2;
1014                         $variables->{'penwidth'} = $extrawidth + 0.8; # gives 1 for a single wit
1015
1016                         my $varopts = _dot_attr_string( $variables );
1017                         $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
1018                                 $edge->[0], $edge->[1], $varopts );
1019         } elsif( $used{$edge->[0]} ) {
1020                 $subend{$edge->[0]} = $edge->[1];
1021         } elsif( $used{$edge->[1]} ) {
1022                 $substart{$edge->[1]} = $edge->[0];
1023         }
1024     }
1025
1026     # If we are asked to, add relationship links
1027     if( exists $opts->{show_relations} ) {
1028         my $filter = $opts->{show_relations}; # can be 'transposition' or 'all'
1029         if( $filter eq 'transposition' ) {
1030                 $filter =~ qr/^transposition$/;
1031         }
1032         foreach my $redge ( $self->relationships ) {
1033                 if( $used{$redge->[0]} && $used{$redge->[1]} ) {
1034                         if( $filter ne 'all' ) {
1035                                 my $rel = $self->get_relationship( $redge );
1036                                 next unless $rel->type =~ /$filter/;
1037                                         my $variables = {
1038                                                 arrowhead => 'none',
1039                                                 color => '#FFA14F',
1040                                                 constraint => 'false',
1041                                                 label => uc( substr( $rel->type, 0, 4 ) ),
1042                                                 penwidth => '3',
1043                                         };
1044                                         $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
1045                                                 $redge->[0], $redge->[1], _dot_attr_string( $variables ) );
1046                                 }
1047                 }
1048         }
1049     }
1050
1051     # Add substitute start and end edges if necessary
1052     foreach my $node ( keys %substart ) {
1053         my $witstr = $self->_path_display_label( $opts,
1054                 $self->path_witnesses( $substart{$node}, $node ) );
1055         my $variables = { %edge_attrs, 'label' => $witstr };
1056         my $nrdg = $self->reading( $node );
1057         if( $nrdg->has_rank && $nrdg->rank > $startrank ) {
1058                 # Substart is actually one lower than $startrank
1059                 $variables->{'minlen'} = $nrdg->rank - ( $startrank - 1 );
1060         }
1061         my $varopts = _dot_attr_string( $variables );
1062         $dot .= "\t\"__SUBSTART__\" -> \"$node\" $varopts;\n";
1063         }
1064     foreach my $node ( keys %subend ) {
1065         my $witstr = $self->_path_display_label( $opts,
1066                 $self->path_witnesses( $node, $subend{$node} ) );
1067         my $variables = { %edge_attrs, 'label' => $witstr };
1068         my $varopts = _dot_attr_string( $variables );
1069         $dot .= "\t\"$node\" -> \"__SUBEND__\" $varopts;\n";
1070         }
1071         # HACK part 2
1072         if( $STRAIGHTENHACK ) {
1073                 my $endlabel = $endrank ? '__SUBEND__' : '__END__';
1074                 $dot .= "\t\"$endlabel\" -> \"#SILENT#\" [ color=white,penwidth=0 ];\n";
1075         }
1076
1077     $dot .= "}\n";
1078     return $dot;
1079 }
1080
1081 sub _dot_attr_string {
1082         my( $hash ) = @_;
1083         my @attrs;
1084         foreach my $k ( sort keys %$hash ) {
1085                 my $v = $hash->{$k};
1086                 push( @attrs, $k.'="'.$v.'"' );
1087         }
1088         return( '[ ' . join( ', ', @attrs ) . ' ]' );
1089 }
1090
1091 sub _add_edge_weights {
1092         my $self = shift;
1093         # Walk the graph from START to END, choosing the successor node with
1094         # the largest number of witness paths each time.
1095         my $weighted = {};
1096         my $curr = $self->start->id;
1097         my $ranked = $self->end->has_rank;
1098         while( $curr ne $self->end->id ) {
1099                 my $rank = $ranked ? $self->reading( $curr )->rank : 0;
1100                 my @succ = sort { $self->path_witnesses( $curr, $a )
1101                                                         <=> $self->path_witnesses( $curr, $b ) }
1102                         $self->sequence->successors( $curr );
1103                 my $next = pop @succ;
1104                 my $nextrank = $ranked ? $self->reading( $next )->rank : 0;
1105                 # Try to avoid lacunae in the weighted path.
1106                 while( @succ &&
1107                            ( $self->reading( $next )->is_lacuna ||
1108                                  $nextrank - $rank > 1 ) ){
1109                         $next = pop @succ;
1110                 }
1111                 $weighted->{$curr} = $next;
1112                 $curr = $next;
1113         }
1114         return $weighted;
1115 }
1116
1117 =head2 path_witnesses( $edge )
1118
1119 Returns the list of sigils whose witnesses are associated with the given edge.
1120 The edge can be passed as either an array or an arrayref of ( $source, $target ).
1121
1122 =cut
1123
1124 sub path_witnesses {
1125         my( $self, @edge ) = @_;
1126         # If edge is an arrayref, cope.
1127         if( @edge == 1 && ref( $edge[0] ) eq 'ARRAY' ) {
1128                 my $e = shift @edge;
1129                 @edge = @$e;
1130         }
1131         my @wits = keys %{$self->sequence->get_edge_attributes( @edge )};
1132         return @wits;
1133 }
1134
1135 # Helper function. Make a display label for the given witnesses, showing a.c.
1136 # witnesses only where the main witness is not also in the list.
1137 sub _path_display_label {
1138         my $self = shift;
1139         my $opts = shift;
1140         my %wits;
1141         map { $wits{$_} = 1 } @_;
1142
1143         # If an a.c. wit is listed, remove it if the main wit is also listed.
1144         # Otherwise keep it for explicit listing.
1145         my $aclabel = $self->ac_label;
1146         my @disp_ac;
1147         foreach my $w ( sort keys %wits ) {
1148                 if( $w =~ /^(.*)\Q$aclabel\E$/ ) {
1149                         if( exists $wits{$1} ) {
1150                                 delete $wits{$w};
1151                         } else {
1152                                 push( @disp_ac, $w );
1153                         }
1154                 }
1155         }
1156
1157         if( $opts->{'explicit_wits'} ) {
1158                 return join( ', ', sort keys %wits );
1159         } else {
1160                 # See if we are in a majority situation.
1161                 my $maj = scalar( $self->tradition->witnesses ) * 0.6;
1162                 $maj = $maj > 5 ? $maj : 5;
1163                 if( scalar keys %wits > $maj ) {
1164                         unshift( @disp_ac, 'majority' );
1165                         return join( ', ', @disp_ac );
1166                 } else {
1167                         return join( ', ', sort keys %wits );
1168                 }
1169         }
1170 }
1171
1172 =head2 readings_at_rank( $rank )
1173
1174 Returns a list of readings at a given rank, taken from the alignment table.
1175
1176 =cut
1177
1178 sub readings_at_rank {
1179         my( $self, $rank ) = @_;
1180         my $table = $self->alignment_table;
1181         # Table rank is real rank - 1.
1182         my @elements = map { $_->{'tokens'}->[$rank-1] } @{$table->{'alignment'}};
1183         my %readings;
1184         foreach my $e ( @elements ) {
1185                 next unless ref( $e ) eq 'HASH';
1186                 next unless exists $e->{'t'};
1187                 $readings{$e->{'t'}->id} = $e->{'t'};
1188         }
1189         return values %readings;
1190 }
1191
1192 =head2 as_graphml
1193
1194 Returns a GraphML representation of the collation.  The GraphML will contain
1195 two graphs. The first expresses the attributes of the readings and the witness
1196 paths that link them; the second expresses the relationships that link the
1197 readings.  This is the native transfer format for a tradition.
1198
1199 =begin testing
1200
1201 use Text::Tradition;
1202 use TryCatch;
1203
1204 my $READINGS = 311;
1205 my $PATHS = 361;
1206
1207 my $datafile = 't/data/florilegium_tei_ps.xml';
1208 my $tradition = Text::Tradition->new( 'input' => 'TEI',
1209                                       'name' => 'test0',
1210                                       'file' => $datafile,
1211                                       'linear' => 1 );
1212
1213 ok( $tradition, "Got a tradition object" );
1214 is( scalar $tradition->witnesses, 13, "Found all witnesses" );
1215 ok( $tradition->collation, "Tradition has a collation" );
1216
1217 my $c = $tradition->collation;
1218 is( scalar $c->readings, $READINGS, "Collation has all readings" );
1219 is( scalar $c->paths, $PATHS, "Collation has all paths" );
1220 is( scalar $c->relationships, 0, "Collation has all relationships" );
1221
1222 # Add a few relationships
1223 $c->add_relationship( 'w123', 'w125', { 'type' => 'collated' } );
1224 $c->add_relationship( 'w193', 'w196', { 'type' => 'collated' } );
1225 $c->add_relationship( 'w257', 'w262', { 'type' => 'transposition' } );
1226
1227 # Now write it to GraphML and parse it again.
1228
1229 my $graphml = $c->as_graphml;
1230 my $st = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml );
1231 is( scalar $st->collation->readings, $READINGS, "Reparsed collation has all readings" );
1232 is( scalar $st->collation->paths, $PATHS, "Reparsed collation has all paths" );
1233 is( scalar $st->collation->relationships, 3, "Reparsed collation has new relationships" );
1234
1235 # Now add a stemma, write to GraphML, and look at the output.
1236 SKIP: {
1237         skip "Analysis module not present", 3 unless $tradition->can( 'add_stemma' );
1238         my $stemma = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
1239         is( ref( $stemma ), 'Text::Tradition::Stemma', "Parsed dotfile into stemma" );
1240         is( $tradition->stemmata, 1, "Tradition now has the stemma" );
1241         $graphml = $c->as_graphml;
1242         like( $graphml, qr/digraph/, "Digraph declaration exists in GraphML" );
1243 }
1244
1245 =end testing
1246
1247 =cut
1248
1249 ## TODO MOVE this to Tradition.pm and modularize it better
1250 sub as_graphml {
1251     my( $self, $options ) = @_;
1252         $self->calculate_ranks unless $self->_graphcalc_done;
1253
1254         my $start = $options->{'from'}
1255                 ? $self->reading( $options->{'from'} ) : $self->start;
1256         my $end = $options->{'to'}
1257                 ? $self->reading( $options->{'to'} ) : $self->end;
1258         if( $start->has_rank && $end->has_rank && $end->rank < $start->rank ) {
1259                 throw( 'Start node must be before end node' );
1260         }
1261         # The readings need to be ranked for this to work.
1262         $start = $self->start unless $start->has_rank;
1263         $end = $self->end unless $end->has_rank;
1264         my $rankoffset = 0;
1265         unless( $start eq $self->start ) {
1266                 $rankoffset = $start->rank - 1;
1267         }
1268         my %use_readings;
1269
1270     # Some namespaces
1271     my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns';
1272     my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance';
1273     my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' .
1274         'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
1275
1276     # Create the document and root node
1277     require XML::LibXML;
1278     my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
1279     my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
1280     $graphml->setDocumentElement( $root );
1281     $root->setNamespace( $xsi_ns, 'xsi', 0 );
1282     $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
1283
1284     # List of attribute types to save on our objects and their corresponding
1285     # GraphML types
1286     my %save_types = (
1287         'Str' => 'string',
1288         'Int' => 'int',
1289         'Bool' => 'boolean',
1290         'ReadingID' => 'string',
1291         'RelationshipType' => 'string',
1292         'RelationshipScope' => 'string',
1293     );
1294
1295     # Add the data keys for the graph. Include an extra key 'version' for the
1296     # GraphML output version.
1297     my %graph_data_keys;
1298     my $gdi = 0;
1299     my %graph_attributes = ( 'version' => 'string' );
1300         # Graph attributes include those of Tradition and those of Collation.
1301         my %gattr_from;
1302         # TODO Use meta introspection method from duplicate_reading to do this
1303         # instead of naming custom keys.
1304         my $tmeta = $self->tradition->meta;
1305         my $cmeta = $self->meta;
1306         map { $gattr_from{$_->name} = 'Tradition' } $tmeta->get_all_attributes;
1307         map { $gattr_from{$_->name} = 'Collation' } $cmeta->get_all_attributes;
1308         foreach my $attr ( ( $tmeta->get_all_attributes, $cmeta->get_all_attributes ) ) {
1309                 next if $attr->name =~ /^_/;
1310                 next unless $save_types{$attr->type_constraint->name};
1311                 $graph_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1312         }
1313     # Extra custom keys for complex objects that should be saved in some form.
1314     # The subroutine should return a string, or undef/empty.
1315     if( $tmeta->has_method('stemmata') ) {
1316                 $graph_attributes{'stemmata'} = sub {
1317                         my @stemstrs;
1318                         map { push( @stemstrs, $_->editable( {linesep => ''} ) ) }
1319                                 $self->tradition->stemmata;
1320                         join( "\n", @stemstrs );
1321                 };
1322         }
1323
1324         if( $tmeta->has_method('user') ) {
1325                 $graph_attributes{'user'} = sub {
1326                         $self->tradition->user ? $self->tradition->user->id : undef
1327                 };
1328         }
1329
1330     foreach my $datum ( sort keys %graph_attributes ) {
1331         $graph_data_keys{$datum} = 'dg'.$gdi++;
1332         my $key = $root->addNewChild( $graphml_ns, 'key' );
1333         my $dtype = ref( $graph_attributes{$datum} ) ? 'string'
1334                 : $graph_attributes{$datum};
1335         $key->setAttribute( 'attr.name', $datum );
1336         $key->setAttribute( 'attr.type', $dtype );
1337         $key->setAttribute( 'for', 'graph' );
1338         $key->setAttribute( 'id', $graph_data_keys{$datum} );
1339     }
1340
1341     # Add the data keys for reading nodes
1342     my %reading_attributes;
1343     my $rmeta = Text::Tradition::Collation::Reading->meta;
1344     foreach my $attr( $rmeta->get_all_attributes ) {
1345                 next if $attr->name =~ /^_/;
1346                 next unless $save_types{$attr->type_constraint->name};
1347                 $reading_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1348         }
1349         if( $self->start->does('Text::Tradition::Morphology' ) ) {
1350                 # Extra custom key for the reading morphology
1351                 $reading_attributes{'lexemes'} = 'string';
1352         }
1353
1354     my %node_data_keys;
1355     my $ndi = 0;
1356     foreach my $datum ( sort keys %reading_attributes ) {
1357         $node_data_keys{$datum} = 'dn'.$ndi++;
1358         my $key = $root->addNewChild( $graphml_ns, 'key' );
1359         $key->setAttribute( 'attr.name', $datum );
1360         $key->setAttribute( 'attr.type', $reading_attributes{$datum} );
1361         $key->setAttribute( 'for', 'node' );
1362         $key->setAttribute( 'id', $node_data_keys{$datum} );
1363     }
1364
1365     # Add the data keys for edges, that is, paths and relationships. Path
1366     # data does not come from a Moose class so is here manually.
1367     my $edi = 0;
1368     my %edge_data_keys;
1369     my %edge_attributes = (
1370         witness => 'string',                    # ID/label for a path
1371         extra => 'boolean',                             # Path key
1372         );
1373     my @path_attributes = keys %edge_attributes; # track our manual additions
1374     my $pmeta = Text::Tradition::Collation::Relationship->meta;
1375     foreach my $attr( $pmeta->get_all_attributes ) {
1376                 next if $attr->name =~ /^_/;
1377                 next unless $save_types{$attr->type_constraint->name};
1378                 $edge_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1379         }
1380     foreach my $datum ( sort keys %edge_attributes ) {
1381         $edge_data_keys{$datum} = 'de'.$edi++;
1382         my $key = $root->addNewChild( $graphml_ns, 'key' );
1383         $key->setAttribute( 'attr.name', $datum );
1384         $key->setAttribute( 'attr.type', $edge_attributes{$datum} );
1385         $key->setAttribute( 'for', 'edge' );
1386         $key->setAttribute( 'id', $edge_data_keys{$datum} );
1387     }
1388
1389     # Add the collation graph itself. First, sanitize the name to a valid XML ID.
1390     my $xmlidname = $self->tradition->name;
1391     $xmlidname =~ s/(?!$xml10_namechar_rx)./_/g;
1392     if( $xmlidname !~ /^$xml10_namestartchar_rx/ ) {
1393         $xmlidname = '_'.$xmlidname;
1394     }
1395     my $sgraph = $root->addNewChild( $graphml_ns, 'graph' );
1396     $sgraph->setAttribute( 'edgedefault', 'directed' );
1397     $sgraph->setAttribute( 'id', $xmlidname );
1398     $sgraph->setAttribute( 'parse.edgeids', 'canonical' );
1399     $sgraph->setAttribute( 'parse.edges', 0 ); # fill in later
1400     $sgraph->setAttribute( 'parse.nodeids', 'canonical' );
1401     $sgraph->setAttribute( 'parse.nodes', 0 ); # fill in later
1402     $sgraph->setAttribute( 'parse.order', 'nodesfirst' );
1403
1404     # Tradition/collation attribute data
1405     foreach my $datum ( keys %graph_attributes ) {
1406         my $value;
1407         if( $datum eq 'version' ) {
1408                 $value = '3.2';
1409         } elsif( ref( $graph_attributes{$datum} ) ) {
1410                 my $sub = $graph_attributes{$datum};
1411                 $value = &$sub();
1412         } elsif( $gattr_from{$datum} eq 'Tradition' ) {
1413                 $value = $self->tradition->$datum;
1414         } else {
1415                 $value = $self->$datum;
1416         }
1417                 _add_graphml_data( $sgraph, $graph_data_keys{$datum}, $value );
1418         }
1419
1420     my $node_ctr = 0;
1421     my %node_hash;
1422     # Add our readings to the graph
1423     foreach my $n ( sort { $a->id cmp $b->id } $self->readings ) {
1424         next if $n->has_rank && $n ne $self->start && $n ne $self->end &&
1425                 ( $n->rank < $start->rank || $n->rank > $end->rank );
1426         $use_readings{$n->id} = 1;
1427         # Add to the main graph
1428         my $node_el = $sgraph->addNewChild( $graphml_ns, 'node' );
1429         my $node_xmlid = 'n' . $node_ctr++;
1430         $node_hash{ $n->id } = $node_xmlid;
1431         $node_el->setAttribute( 'id', $node_xmlid );
1432         foreach my $d ( keys %reading_attributes ) {
1433                 my $nval = $n->$d;
1434                 # Custom serialization
1435                 if( $d eq 'lexemes' ) {
1436                                 # If nval is a true value, we have lexemes so we need to
1437                                 # serialize them. Otherwise set nval to undef so that the
1438                                 # key is excluded from this reading.
1439                         $nval = $nval ? $n->_serialize_lexemes : undef;
1440                 } elsif( $d eq 'normal_form' && $n->normal_form eq $n->text ) {
1441                         $nval = undef;
1442                 }
1443                 if( $rankoffset && $d eq 'rank' && $n ne $self->start ) {
1444                         # Adjust the ranks within the subgraph.
1445                         $nval = $n eq $self->end ? $end->rank - $rankoffset + 1
1446                                 : $nval - $rankoffset;
1447                 }
1448                 _add_graphml_data( $node_el, $node_data_keys{$d}, $nval )
1449                         if defined $nval;
1450         }
1451     }
1452
1453     # Add the path edges to the sequence graph
1454     my $edge_ctr = 0;
1455     foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->sequence->edges() ) {
1456         # We add an edge in the graphml for every witness in $e.
1457         next unless( $use_readings{$e->[0]} || $use_readings{$e->[1]} );
1458         my @edge_wits = sort $self->path_witnesses( $e );
1459         $e->[0] = $self->start->id unless $use_readings{$e->[0]};
1460         $e->[1] = $self->end->id unless $use_readings{$e->[1]};
1461         # Skip any path from start to end; that witness is not in the subgraph.
1462         next if ( $e->[0] eq $self->start->id && $e->[1] eq $self->end->id );
1463         foreach my $wit ( @edge_wits ) {
1464                         my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
1465                                                                                 $node_hash{ $e->[0] },
1466                                                                                 $node_hash{ $e->[1] } );
1467                         my $edge_el = $sgraph->addNewChild( $graphml_ns, 'edge' );
1468                         $edge_el->setAttribute( 'source', $from );
1469                         $edge_el->setAttribute( 'target', $to );
1470                         $edge_el->setAttribute( 'id', $id );
1471
1472                         # It's a witness path, so add the witness
1473                         my $base = $wit;
1474                         my $key = $edge_data_keys{'witness'};
1475                         # Is this an ante-corr witness?
1476                         my $aclabel = $self->ac_label;
1477                         if( $wit =~ /^(.*)\Q$aclabel\E$/ ) {
1478                                 # Keep the base witness
1479                                 $base = $1;
1480                                 # ...and record that this is an 'extra' reading path
1481                                 _add_graphml_data( $edge_el, $edge_data_keys{'extra'}, $aclabel );
1482                         }
1483                         _add_graphml_data( $edge_el, $edge_data_keys{'witness'}, $base );
1484                 }
1485         }
1486
1487         # Report the actual number of nodes and edges that went in
1488         $sgraph->setAttribute( 'parse.edges', $edge_ctr );
1489         $sgraph->setAttribute( 'parse.nodes', $node_ctr );
1490
1491         # Add the relationship graph to the XML
1492         map { delete $edge_data_keys{$_} } @path_attributes;
1493         $self->relations->_as_graphml( $graphml_ns, $root, \%node_hash,
1494                 $node_data_keys{'id'}, \%edge_data_keys );
1495
1496     # Save and return the thing
1497     my $result = decode_utf8( $graphml->toString(1) );
1498     return $result;
1499 }
1500
1501 sub _add_graphml_data {
1502     my( $el, $key, $value ) = @_;
1503     return unless defined $value;
1504     my $data_el = $el->addNewChild( $el->namespaceURI, 'data' );
1505     $data_el->setAttribute( 'key', $key );
1506     $data_el->appendText( $value );
1507 }
1508
1509 =head2 as_csv
1510
1511 Returns a CSV alignment table representation of the collation graph, one
1512 row per witness (or witness uncorrected.)
1513
1514 =head2 as_tsv
1515
1516 Returns a tab-separated alignment table representation of the collation graph,
1517 one row per witness (or witness uncorrected.)
1518
1519 =begin testing
1520
1521 use Text::Tradition;
1522 use Text::CSV;
1523
1524 my $READINGS = 311;
1525 my $PATHS = 361;
1526 my $WITS = 13;
1527 my $WITAC = 4;
1528
1529 my $datafile = 't/data/florilegium_tei_ps.xml';
1530 my $tradition = Text::Tradition->new( 'input' => 'TEI',
1531                                       'name' => 'test0',
1532                                       'file' => $datafile,
1533                                       'linear' => 1 );
1534
1535 my $c = $tradition->collation;
1536 # Export the thing to CSV
1537 my $csvstr = $c->as_csv();
1538 # Count the columns
1539 my $csv = Text::CSV->new({ sep_char => ',', binary => 1 });
1540 my @lines = split(/\n/, $csvstr );
1541 ok( $csv->parse( $lines[0] ), "Successfully parsed first line of CSV" );
1542 is( scalar( $csv->fields ), $WITS + $WITAC, "CSV has correct number of witness columns" );
1543 my @q_ac = grep { $_ eq 'Q'.$c->ac_label } $csv->fields;
1544 ok( @q_ac, "Found a layered witness" );
1545
1546 my $t2 = Text::Tradition->new( input => 'Tabular',
1547                                                            name => 'test2',
1548                                                            string => $csvstr,
1549                                                            sep_char => ',' );
1550 is( scalar $t2->collation->readings, $READINGS, "Reparsed CSV collation has all readings" );
1551 is( scalar $t2->collation->paths, $PATHS, "Reparsed CSV collation has all paths" );
1552
1553 # Now do it with TSV
1554 my $tsvstr = $c->as_tsv();
1555 my $t3 = Text::Tradition->new( input => 'Tabular',
1556                                                            name => 'test3',
1557                                                            string => $tsvstr,
1558                                                            sep_char => "\t" );
1559 is( scalar $t3->collation->readings, $READINGS, "Reparsed TSV collation has all readings" );
1560 is( scalar $t3->collation->paths, $PATHS, "Reparsed TSV collation has all paths" );
1561
1562 my $table = $c->alignment_table;
1563 my $noaccsv = $c->as_csv({ noac => 1 });
1564 my @noaclines = split(/\n/, $noaccsv );
1565 ok( $csv->parse( $noaclines[0] ), "Successfully parsed first line of no-ac CSV" );
1566 is( scalar( $csv->fields ), $WITS, "CSV has correct number of witness columns" );
1567 is( $c->alignment_table, $table, "Request for CSV did not alter the alignment table" );
1568
1569 my $safecsv = $c->as_csv({ safe_ac => 1});
1570 my @safelines = split(/\n/, $safecsv );
1571 ok( $csv->parse( $safelines[0] ), "Successfully parsed first line of safe CSV" );
1572 is( scalar( $csv->fields ), $WITS + $WITAC, "CSV has correct number of witness columns" );
1573 @q_ac = grep { $_ eq 'Q__L' } $csv->fields;
1574 ok( @q_ac, "Found a sanitized layered witness" );
1575 is( $c->alignment_table, $table, "Request for CSV did not alter the alignment table" );
1576
1577 =end testing
1578
1579 =cut
1580
1581 sub _tabular {
1582     my( $self, $opts ) = @_;
1583     my $table = $self->alignment_table( $opts );
1584         my $csv_options = { binary => 1, quote_null => 0 };
1585         $csv_options->{'sep_char'} = $opts->{fieldsep};
1586         if( $opts->{fieldsep} eq "\t" ) {
1587                 # If it is really tab separated, nothing is an escape char.
1588                 $csv_options->{'quote_char'} = undef;
1589                 $csv_options->{'escape_char'} = '';
1590         }
1591     my $csv = Text::CSV->new( $csv_options );
1592     my @result;
1593     # Make the header row
1594     $csv->combine( map { $_->{'witness'} } @{$table->{'alignment'}} );
1595         push( @result, $csv->string );
1596     # Make the rest of the rows
1597     foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
1598         my @rowobjs = map { $_->{'tokens'}->[$idx] } @{$table->{'alignment'}};
1599         my @row = map { $_ ? $_->{'t'}->text : $_ } @rowobjs;
1600         $csv->combine( @row );
1601         push( @result, $csv->string );
1602     }
1603     return join( "\n", @result );
1604 }
1605
1606 sub as_csv {
1607         my $self = shift;
1608         my $opts = shift || {};
1609         $opts->{fieldsep} = ',';
1610         return $self->_tabular( $opts );
1611 }
1612
1613 sub as_tsv {
1614         my $self = shift;
1615         my $opts = shift || {};
1616         $opts->{fieldsep} = "\t";
1617         return $self->_tabular( $opts );
1618 }
1619
1620 =head2 alignment_table
1621
1622 Return a reference to an alignment table, in a slightly enhanced CollateX
1623 format which looks like this:
1624
1625  $table = { alignment => [ { witness => "SIGIL",
1626                              tokens => [ { t => "TEXT" }, ... ] },
1627                            { witness => "SIG2",
1628                              tokens => [ { t => "TEXT" }, ... ] },
1629                            ... ],
1630             length => TEXTLEN };
1631
1632 =cut
1633
1634 sub alignment_table {
1635     my( $self, $opts ) = @_;
1636     if( $self->has_cached_table ) {
1637                 return $self->cached_table
1638                         unless $opts->{noac} || $opts->{safe_ac};
1639     }
1640
1641     # Make sure we can do this
1642         throw( "Need a linear graph in order to make an alignment table" )
1643                 unless $self->linear;
1644     $self->calculate_ranks()
1645         unless $self->_graphcalc_done && $self->end->has_rank;
1646
1647     my $table = { 'alignment' => [], 'length' => $self->end->rank - 1 };
1648     my @all_pos = ( 1 .. $self->end->rank - 1 );
1649     foreach my $wit ( sort { $a->sigil cmp $b->sigil } $self->tradition->witnesses ) {
1650         # say STDERR "Making witness row(s) for " . $wit->sigil;
1651         my @wit_path = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
1652         my @row = _make_witness_row( \@wit_path, \@all_pos );
1653         my $witobj = { 'witness' => $wit->sigil, 'tokens' => \@row };
1654         $witobj->{'identifier'} = $wit->identifier if $wit->identifier;
1655         push( @{$table->{'alignment'}}, $witobj );
1656         if( $wit->is_layered && !$opts->{noac} ) {
1657                 my @wit_ac_path = $self->reading_sequence( $self->start, $self->end,
1658                         $wit->sigil.$self->ac_label );
1659             my @ac_row = _make_witness_row( \@wit_ac_path, \@all_pos );
1660             my $witlabel = $opts->{safe_ac}
1661                 ? $wit->sigil . '__L' : $wit->sigil.$self->ac_label;
1662             my $witacobj = { 'witness' => $witlabel,
1663                 'tokens' => \@ac_row };
1664             $witacobj->{'identifier'} = $wit->identifier if $wit->identifier;
1665                         push( @{$table->{'alignment'}}, $witacobj );
1666         }
1667     }
1668     unless( $opts->{noac} || $opts->{safe_ac} ) {
1669             $self->cached_table( $table );
1670         }
1671     return $table;
1672 }
1673
1674 sub _make_witness_row {
1675     my( $path, $positions ) = @_;
1676     my %char_hash;
1677     map { $char_hash{$_} = undef } @$positions;
1678     my $debug = 0;
1679     foreach my $rdg ( @$path ) {
1680         say STDERR "rank " . $rdg->rank if $debug;
1681         # say STDERR "No rank for " . $rdg->id unless defined $rdg->rank;
1682         $char_hash{$rdg->rank} = { 't' => $rdg };
1683     }
1684     my @row = map { $char_hash{$_} } @$positions;
1685     # Fill in lacuna markers for undef spots in the row
1686     my $last_el = shift @row;
1687     my @filled_row = ( $last_el );
1688     foreach my $el ( @row ) {
1689         # If we are using node reference, make the lacuna node appear many times
1690         # in the table.  If not, use the lacuna tag.
1691         if( $last_el && $last_el->{'t'}->is_lacuna && !defined $el ) {
1692             $el = $last_el;
1693         }
1694         push( @filled_row, $el );
1695         $last_el = $el;
1696     }
1697     return @filled_row;
1698 }
1699
1700
1701 =head1 NAVIGATION METHODS
1702
1703 =head2 reading_sequence( $first, $last, $sigil, $backup )
1704
1705 Returns the ordered list of readings, starting with $first and ending
1706 with $last, for the witness given in $sigil. If a $backup sigil is
1707 specified (e.g. when walking a layered witness), it will be used wherever
1708 no $sigil path exists.  If there is a base text reading, that will be
1709 used wherever no path exists for $sigil or $backup.
1710
1711 =cut
1712
1713 # TODO Think about returning some lazy-eval iterator.
1714 # TODO Get rid of backup; we should know from what witness is whether we need it.
1715
1716 sub reading_sequence {
1717     my( $self, $start, $end, $witness ) = @_;
1718
1719     $witness = $self->baselabel unless $witness;
1720     my @readings = ( $start );
1721     my %seen;
1722     my $n = $start;
1723     while( $n && $n->id ne $end->id ) {
1724         if( exists( $seen{$n->id} ) ) {
1725             throw( "Detected loop for $witness at " . $n->id );
1726         }
1727         $seen{$n->id} = 1;
1728
1729         my $next = $self->next_reading( $n, $witness );
1730         unless( $next ) {
1731             throw( "Did not find any path for $witness from reading " . $n->id );
1732         }
1733         push( @readings, $next );
1734         $n = $next;
1735     }
1736     # Check that the last reading is our end reading.
1737     my $last = $readings[$#readings];
1738     throw( "Last reading found from " . $start->text .
1739         " for witness $witness is not the end!" ) # TODO do we get this far?
1740         unless $last->id eq $end->id;
1741
1742     return @readings;
1743 }
1744
1745 =head2 next_reading( $reading, $sigil );
1746
1747 Returns the reading that follows the given reading along the given witness
1748 path.
1749
1750 =cut
1751
1752 sub next_reading {
1753     # Return the successor via the corresponding path.
1754     my $self = shift;
1755     my $answer = $self->_find_linked_reading( 'next', @_ );
1756         return undef unless $answer;
1757     return $self->reading( $answer );
1758 }
1759
1760 =head2 prior_reading( $reading, $sigil )
1761
1762 Returns the reading that precedes the given reading along the given witness
1763 path.
1764
1765 =cut
1766
1767 sub prior_reading {
1768     # Return the predecessor via the corresponding path.
1769     my $self = shift;
1770     my $answer = $self->_find_linked_reading( 'prior', @_ );
1771     return $self->reading( $answer );
1772 }
1773
1774 sub _find_linked_reading {
1775     my( $self, $direction, $node, $path ) = @_;
1776
1777     # Get a backup if we are dealing with a layered witness
1778     my $alt_path;
1779     my $aclabel = $self->ac_label;
1780     if( $path && $path =~ /^(.*)\Q$aclabel\E$/ ) {
1781         $alt_path = $1;
1782     }
1783
1784     my @linked_paths = $direction eq 'next'
1785         ? $self->sequence->edges_from( $node )
1786         : $self->sequence->edges_to( $node );
1787     return undef unless scalar( @linked_paths );
1788
1789     # We have to find the linked path that contains all of the
1790     # witnesses supplied in $path.
1791     my( @path_wits, @alt_path_wits );
1792     @path_wits = sort( $self->_witnesses_of_label( $path ) ) if $path;
1793     @alt_path_wits = sort( $self->_witnesses_of_label( $alt_path ) ) if $alt_path;
1794     my $base_le;
1795     my $alt_le;
1796     foreach my $le ( @linked_paths ) {
1797         if( $self->sequence->has_edge_attribute( @$le, $self->baselabel ) ) {
1798             $base_le = $le;
1799         }
1800                 my @le_wits = sort $self->path_witnesses( $le );
1801                 if( _is_within( \@path_wits, \@le_wits ) ) {
1802                         # This is the right path.
1803                         return $direction eq 'next' ? $le->[1] : $le->[0];
1804                 } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) {
1805                         $alt_le = $le;
1806                 }
1807     }
1808     # Got this far? Return the alternate path if it exists.
1809     return $direction eq 'next' ? $alt_le->[1] : $alt_le->[0]
1810         if $alt_le;
1811
1812     # Got this far? Return the base path if it exists.
1813     return $direction eq 'next' ? $base_le->[1] : $base_le->[0]
1814         if $base_le;
1815
1816     # Got this far? We have no appropriate path.
1817     warn "Could not find $direction node from " . $node->id
1818         . " along path $path";
1819     return undef;
1820 }
1821
1822 # Some set logic.
1823 sub _is_within {
1824     my( $set1, $set2 ) = @_;
1825     my $ret = @$set1; # will be 0, i.e. false, if set1 is empty
1826     foreach my $el ( @$set1 ) {
1827         $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
1828     }
1829     return $ret;
1830 }
1831
1832 # Return the string that joins together a list of witnesses for
1833 # display on a single path.
1834 sub _witnesses_of_label {
1835     my( $self, $label ) = @_;
1836     my $regex = $self->wit_list_separator;
1837     my @answer = split( /\Q$regex\E/, $label );
1838     return @answer;
1839 }
1840
1841 =head2 common_readings
1842
1843 Returns the list of common readings in the graph (i.e. those readings that are
1844 shared by all non-lacunose witnesses.)
1845
1846 =cut
1847
1848 sub common_readings {
1849         my $self = shift;
1850         my @common = grep { $_->is_common } $self->readings;
1851         return @common;
1852 }
1853
1854 =head2 path_text( $sigil, [, $start, $end ] )
1855
1856 Returns the text of a witness (plus its backup, if we are using a layer)
1857 as stored in the collation.  The text is returned as a string, where the
1858 individual readings are joined with spaces and the meta-readings (e.g.
1859 lacunae) are omitted.  Optional specification of $start and $end allows
1860 the generation of a subset of the witness text.
1861
1862 =cut
1863
1864 sub path_text {
1865         my( $self, $wit, $start, $end ) = @_;
1866         $start = $self->start unless $start;
1867         $end = $self->end unless $end;
1868         my @path = grep { !$_->is_meta } $self->reading_sequence( $start, $end, $wit );
1869         my $pathtext = '';
1870         my $last;
1871         foreach my $r ( @path ) {
1872                 unless ( $r->join_prior || !$last || $last->join_next ) {
1873                         $pathtext .= ' ';
1874                 }
1875                 $pathtext .= $r->text;
1876                 $last = $r;
1877         }
1878         return $pathtext;
1879 }
1880
1881 =head1 INITIALIZATION METHODS
1882
1883 These are mostly for use by parsers.
1884
1885 =head2 make_witness_path( $witness )
1886
1887 Link the array of readings contained in $witness->path (and in
1888 $witness->uncorrected_path if it exists) into collation paths.
1889 Clear out the arrays when finished.
1890
1891 =head2 make_witness_paths
1892
1893 Call make_witness_path for all witnesses in the tradition.
1894
1895 =cut
1896
1897 # For use when a collation is constructed from a base text and an apparatus.
1898 # We have the sequences of readings and just need to add path edges.
1899 # When we are done, clear out the witness path attributes, as they are no
1900 # longer needed.
1901 # TODO Find a way to replace the witness path attributes with encapsulated functions?
1902
1903 sub make_witness_paths {
1904     my( $self ) = @_;
1905     foreach my $wit ( $self->tradition->witnesses ) {
1906         # say STDERR "Making path for " . $wit->sigil;
1907         $self->make_witness_path( $wit );
1908     }
1909 }
1910
1911 sub make_witness_path {
1912     my( $self, $wit ) = @_;
1913     my @chain = @{$wit->path};
1914     my $sig = $wit->sigil;
1915     # Add start and end if necessary
1916     unshift( @chain, $self->start ) unless $chain[0] eq $self->start;
1917     push( @chain, $self->end ) unless $chain[-1] eq $self->end;
1918     foreach my $idx ( 0 .. $#chain-1 ) {
1919         $self->add_path( $chain[$idx], $chain[$idx+1], $sig );
1920     }
1921     if( $wit->is_layered ) {
1922         @chain = @{$wit->uncorrected_path};
1923                 unshift( @chain, $self->start ) unless $chain[0] eq $self->start;
1924                 push( @chain, $self->end ) unless $chain[-1] eq $self->end;
1925         foreach my $idx( 0 .. $#chain-1 ) {
1926             my $source = $chain[$idx];
1927             my $target = $chain[$idx+1];
1928             $self->add_path( $source, $target, $sig.$self->ac_label )
1929                 unless $self->has_path( $source, $target, $sig );
1930         }
1931     }
1932     $wit->clear_path;
1933     $wit->clear_uncorrected_path;
1934 }
1935
1936 =head2 calculate_ranks
1937
1938 Calculate the reading ranks (that is, their aligned positions relative
1939 to each other) for the graph.  This can only be called on linear collations.
1940
1941 =begin testing
1942
1943 use Text::Tradition;
1944
1945 my $cxfile = 't/data/Collatex-16.xml';
1946 my $t = Text::Tradition->new(
1947     'name'  => 'inline',
1948     'input' => 'CollateX',
1949     'file'  => $cxfile,
1950     );
1951 my $c = $t->collation;
1952
1953 # Make an svg
1954 my $table = $c->alignment_table;
1955 ok( $c->has_cached_table, "Alignment table was cached" );
1956 is( $c->alignment_table, $table, "Cached table returned upon second call" );
1957 $c->calculate_ranks;
1958 is( $c->alignment_table, $table, "Cached table retained with no rank change" );
1959 $c->add_relationship( 'n13', 'n23', { type => 'repetition' } );
1960 is( $c->alignment_table, $table, "Alignment table unchanged after non-colo relationship add" );
1961 $c->add_relationship( 'n24', 'n23', { type => 'spelling' } );
1962 isnt( $c->alignment_table, $table, "Alignment table changed after colo relationship add" );
1963
1964 =end testing
1965
1966 =cut
1967
1968 sub calculate_ranks {
1969     my $self = shift;
1970     # Save the existing ranks, in case we need to invalidate the cached SVG.
1971     throw( "Cannot calculate ranks on a non-linear graph" )
1972         unless $self->linear;
1973     my %existing_ranks;
1974     map { $existing_ranks{$_} = $_->rank } $self->readings;
1975
1976     # Do the rankings based on the relationship equivalence graph, starting
1977     # with the start node.
1978     my ( $node_ranks, $rank_nodes ) = $self->relations->equivalence_ranks();
1979
1980     # Transfer our rankings from the topological graph to the real one.
1981     foreach my $r ( $self->readings ) {
1982         if( defined $node_ranks->{$self->equivalence( $r->id )} ) {
1983             $r->rank( $node_ranks->{$self->equivalence( $r->id )} );
1984         } else {
1985                 # Die. Find the last rank we calculated.
1986                 my @all_defined = sort { ( $node_ranks->{$self->equivalence( $a->id )}||-1 )
1987                                  <=> ( $node_ranks->{$self->equivalence( $b->id )}||-1 ) }
1988                         $self->readings;
1989                 my $last = pop @all_defined;
1990             throw( "Ranks not calculated after $last - do you have a cycle in the graph?" );
1991         }
1992     }
1993     # Do we need to invalidate the cached data?
1994     if( $self->has_cached_table ) {
1995         foreach my $r ( $self->readings ) {
1996                 next if defined( $existing_ranks{$r} )
1997                         && $existing_ranks{$r} == $r->rank;
1998                 # Something has changed, so clear the cache
1999                 $self->_clear_cache;
2000                         # ...and recalculate the common readings.
2001                         $self->calculate_common_readings();
2002                 last;
2003         }
2004     }
2005         # The graph calculation information is now up to date.
2006         $self->_graphcalc_done(1);
2007 }
2008
2009 sub _clear_cache {
2010         my $self = shift;
2011         $self->wipe_table if $self->has_cached_table;
2012 }
2013
2014
2015 =head2 flatten_ranks
2016
2017 A convenience method for parsing collation data.  Searches the graph for readings
2018 with the same text at the same rank, and merges any that are found.
2019
2020 =cut
2021
2022 sub flatten_ranks {
2023     my ( $self, %args ) = shift;
2024     my %unique_rank_rdg;
2025     my $changed;
2026     foreach my $p ( $self->identical_readings( %args ) ) {
2027                 # say STDERR "Combining readings at same rank: @$p";
2028                 $changed = 1;
2029                 $self->merge_readings( @$p );
2030                 # TODO see if this now makes a common point.
2031     }
2032     # If we merged readings, the ranks are still fine but the alignment
2033     # table is wrong. Wipe it.
2034     $self->wipe_table() if $changed;
2035 }
2036
2037 =head2 identical_readings
2038 =head2 identical_readings( start => $startnode, end => $endnode )
2039 =head2 identical_readings( startrank => $startrank, endrank => $endrank )
2040
2041 Goes through the graph identifying all pairs of readings that appear to be
2042 identical, and therefore able to be merged into a single reading. Returns the
2043 relevant identical pairs. Can be restricted to run over only a part of the
2044 graph, specified either by node or by rank.
2045
2046 =cut
2047
2048 sub identical_readings {
2049         my ( $self, %args ) = @_;
2050     # Find where we should start and end.
2051     my $startrank = $args{startrank} || 0;
2052     if( $args{start} ) {
2053         throw( "Starting reading has no rank" ) unless $self->reading( $args{start} )
2054                 && $self->reading( $args{start} )->has_rank;
2055         $startrank = $self->reading( $args{start} )->rank;
2056     }
2057     my $endrank = $args{endrank} || $self->end->rank;
2058     if( $args{end} ) {
2059         throw( "Ending reading has no rank" ) unless $self->reading( $args{end} )
2060                 && $self->reading( $args{end} )->has_rank;
2061         $endrank = $self->reading( $args{end} )->rank;
2062     }
2063
2064     # Make sure the ranks are correct.
2065     unless( $self->_graphcalc_done ) {
2066         $self->calculate_ranks;
2067     }
2068     # Go through the readings looking for duplicates.
2069     my %unique_rank_rdg;
2070     my @pairs;
2071     foreach my $rdg ( $self->readings ) {
2072         next unless $rdg->has_rank;
2073         my $rk = $rdg->rank;
2074         next if $rk > $endrank || $rk < $startrank;
2075         my $key = $rk . "||" . $rdg->text;
2076         if( exists $unique_rank_rdg{$key} ) {
2077                 # Make sure they don't have different grammatical forms
2078                         my $ur = $unique_rank_rdg{$key};
2079                 if( $rdg->is_identical( $ur ) ) {
2080                                 push( @pairs, [ $ur, $rdg ] );
2081                         }
2082         } else {
2083             $unique_rank_rdg{$key} = $rdg;
2084         }
2085     }
2086
2087     return @pairs;
2088 }
2089
2090
2091 =head2 calculate_common_readings
2092
2093 Goes through the graph identifying the readings that appear in every witness
2094 (apart from those with lacunae at that spot.) Marks them as common and returns
2095 the list.
2096
2097 =begin testing
2098
2099 use Text::Tradition;
2100
2101 my $cxfile = 't/data/Collatex-16.xml';
2102 my $t = Text::Tradition->new(
2103     'name'  => 'inline',
2104     'input' => 'CollateX',
2105     'file'  => $cxfile,
2106     );
2107 my $c = $t->collation;
2108
2109 my @common = $c->calculate_common_readings();
2110 is( scalar @common, 8, "Found correct number of common readings" );
2111 my @marked = sort $c->common_readings();
2112 is( scalar @common, 8, "All common readings got marked as such" );
2113 my @expected = qw/ n1 n11 n16 n19 n20 n5 n6 n7 /;
2114 is_deeply( \@marked, \@expected, "Found correct list of common readings" );
2115
2116 =end testing
2117
2118 =cut
2119
2120 sub calculate_common_readings {
2121         my $self = shift;
2122         my @common;
2123         map { $_->is_common( 0 ) } $self->readings;
2124         # Implicitly calls calculate_ranks
2125         my $table = $self->alignment_table;
2126         foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
2127                 my @row = map { $_->{'tokens'}->[$idx]
2128                                                         ? $_->{'tokens'}->[$idx]->{'t'} : '' }
2129                                         @{$table->{'alignment'}};
2130                 my %hash;
2131                 foreach my $r ( @row ) {
2132                         if( $r ) {
2133                                 $hash{$r->id} = $r unless $r->is_meta;
2134                         } else {
2135                                 $hash{'UNDEF'} = $r;
2136                         }
2137                 }
2138                 if( keys %hash == 1 && !exists $hash{'UNDEF'} ) {
2139                         my( $r ) = values %hash;
2140                         $r->is_common( 1 );
2141                         push( @common, $r );
2142                 }
2143         }
2144         return @common;
2145 }
2146
2147 =head2 text_from_paths
2148
2149 Calculate the text array for all witnesses from the path, for later consistency
2150 checking.  Only to be used if there is no non-graph-based way to know the
2151 original texts.
2152
2153 =cut
2154
2155 sub text_from_paths {
2156         my $self = shift;
2157     foreach my $wit ( $self->tradition->witnesses ) {
2158         my @readings = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
2159         my @text;
2160         foreach my $r ( @readings ) {
2161                 next if $r->is_meta;
2162                 push( @text, $r->text );
2163         }
2164         $wit->text( \@text );
2165         if( $wit->is_layered ) {
2166                         my @ucrdgs = $self->reading_sequence( $self->start, $self->end,
2167                                                                                                   $wit->sigil.$self->ac_label );
2168                         my @uctext;
2169                         foreach my $r ( @ucrdgs ) {
2170                                 next if $r->is_meta;
2171                                 push( @uctext, $r->text );
2172                         }
2173                         $wit->layertext( \@uctext );
2174         }
2175     }
2176 }
2177
2178 =head1 UTILITY FUNCTIONS
2179
2180 =head2 common_predecessor( $reading_a, $reading_b )
2181
2182 Find the last reading that occurs in sequence before both the given readings.
2183 At the very least this should be $self->start.
2184
2185 =head2 common_successor( $reading_a, $reading_b )
2186
2187 Find the first reading that occurs in sequence after both the given readings.
2188 At the very least this should be $self->end.
2189
2190 =begin testing
2191
2192 use Text::Tradition;
2193
2194 my $cxfile = 't/data/Collatex-16.xml';
2195 my $t = Text::Tradition->new(
2196     'name'  => 'inline',
2197     'input' => 'CollateX',
2198     'file'  => $cxfile,
2199     );
2200 my $c = $t->collation;
2201
2202 is( $c->common_predecessor( 'n24', 'n23' )->id,
2203     'n20', "Found correct common predecessor" );
2204 is( $c->common_successor( 'n24', 'n23' )->id,
2205     '__END__', "Found correct common successor" );
2206
2207 is( $c->common_predecessor( 'n19', 'n17' )->id,
2208     'n16', "Found correct common predecessor for readings on same path" );
2209 is( $c->common_successor( 'n21', 'n10' )->id,
2210     '__END__', "Found correct common successor for readings on same path" );
2211
2212 =end testing
2213
2214 =cut
2215
2216 ## Return the closest reading that is a predecessor of both the given readings.
2217 sub common_predecessor {
2218         my $self = shift;
2219         my( $r1, $r2 ) = $self->_objectify_args( @_ );
2220         return $self->_common_in_path( $r1, $r2, 'predecessors' );
2221 }
2222
2223 sub common_successor {
2224         my $self = shift;
2225         my( $r1, $r2 ) = $self->_objectify_args( @_ );
2226         return $self->_common_in_path( $r1, $r2, 'successors' );
2227 }
2228
2229
2230 # TODO think about how to do this without ranks...
2231 sub _common_in_path {
2232         my( $self, $r1, $r2, $dir ) = @_;
2233         my $iter = $self->end->rank;
2234         my @candidates;
2235         my @last_r1 = ( $r1 );
2236         my @last_r2 = ( $r2 );
2237         # my %all_seen = ( $r1 => 'r1', $r2 => 'r2' );
2238         my %all_seen;
2239         # say STDERR "Finding common $dir for $r1, $r2";
2240         while( !@candidates ) {
2241                 last unless $iter--;  # Avoid looping infinitely
2242                 # Iterate separately down the graph from r1 and r2
2243                 my( @new_lc1, @new_lc2 );
2244                 foreach my $lc ( @last_r1 ) {
2245                         foreach my $p ( $lc->$dir ) {
2246                                 if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r1' ) {
2247                                         # say STDERR "Path candidate $p from $lc";
2248                                         push( @candidates, $p );
2249                                 } elsif( !$all_seen{$p->id} ) {
2250                                         $all_seen{$p->id} = 'r1';
2251                                         push( @new_lc1, $p );
2252                                 }
2253                         }
2254                 }
2255                 foreach my $lc ( @last_r2 ) {
2256                         foreach my $p ( $lc->$dir ) {
2257                                 if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r2' ) {
2258                                         # say STDERR "Path candidate $p from $lc";
2259                                         push( @candidates, $p );
2260                                 } elsif( !$all_seen{$p->id} ) {
2261                                         $all_seen{$p->id} = 'r2';
2262                                         push( @new_lc2, $p );
2263                                 }
2264                         }
2265                 }
2266                 @last_r1 = @new_lc1;
2267                 @last_r2 = @new_lc2;
2268         }
2269         my @answer = sort { $a->rank <=> $b->rank } @candidates;
2270         return $dir eq 'predecessors' ? pop( @answer ) : shift ( @answer );
2271 }
2272
2273 sub throw {
2274         Text::Tradition::Error->throw(
2275                 'ident' => 'Collation error',
2276                 'message' => $_[0],
2277                 );
2278 }
2279
2280 no Moose;
2281 __PACKAGE__->meta->make_immutable;
2282
2283 =head1 BUGS/TODO
2284
2285 =over
2286
2287 =item * Rework XML serialization in a more modular way
2288
2289 =back
2290
2291 =head1 LICENSE
2292
2293 This package is free software and is provided "as is" without express
2294 or implied warranty.  You can redistribute it and/or modify it under
2295 the same terms as Perl itself.
2296
2297 =head1 AUTHOR
2298
2299 Tara L Andrews E<lt>aurum@cpan.orgE<gt>