base/lib/Text/Tradition/Collation.pm

   1 package Text::Tradition::Collation;
   2
   3 use feature 'say';
   4 use Encode qw( decode_utf8 );
   5 use File::Temp;
   6 use File::Which;
   7 use Graph;
   8 use IPC::Run qw( run binary );
   9 use Text::CSV;
  10 use Text::Tradition::Collation::Data;
  11 use Text::Tradition::Collation::Reading;
  12 use Text::Tradition::Collation::RelationshipStore;
  13 use Text::Tradition::Error;
  14 use XML::Easy::Syntax qw( $xml10_namestartchar_rx $xml10_namechar_rx );
  15 use XML::LibXML;
  16 use XML::LibXML::XPathContext;
  17 use Moose;
  18
  19 has _data => (
  20         isa      => 'Text::Tradition::Collation::Data',
  21         is       => 'ro',
  22         required => 1,
  23         handles  => [ qw(
  24                 sequence
  25                 paths
  26                 _set_relations
  27                 relations
  28                 _set_start
  29                 _set_end
  30                 ac_label
  31                 has_cached_table
  32                 relationships
  33                 related_readings
  34                 get_relationship
  35                 del_relationship
  36                 equivalence
  37                 equivalence_graph
  38                 readings
  39                 reading
  40                 _add_reading
  41                 del_reading
  42                 has_reading
  43                 wit_list_separator
  44                 baselabel
  45                 linear
  46                 wordsep
  47                 start
  48                 end
  49                 cached_table
  50                 _graphcalc_done
  51                 has_cached_svg
  52                 wipe_table
  53         )]
  54 );
  55
  56 has 'tradition' => (
  57     is => 'ro',
  58     isa => 'Text::Tradition',
  59     writer => '_set_tradition',
  60     weak_ref => 1,
  61     );
  62
  63 =head1 NAME
  64
  65 Text::Tradition::Collation - a software model for a text collation
  66
  67 =head1 SYNOPSIS
  68
  69   use Text::Tradition;
  70   my $t = Text::Tradition->new(
  71     'name' => 'this is a text',
  72     'input' => 'TEI',
  73     'file' => '/path/to/tei_parallel_seg_file.xml' );
  74
  75   my $c = $t->collation;
  76   my @readings = $c->readings;
  77   my @paths = $c->paths;
  78   my @relationships = $c->relationships;
  79
  80   my $svg_variant_graph = $t->collation->as_svg();
  81
  82 =head1 DESCRIPTION
  83
  84 Text::Tradition is a library for representation and analysis of collated
  85 texts, particularly medieval ones.  The Collation is the central feature of
  86 a Tradition, where the text, its sequence of readings, and its relationships
  87 between readings are actually kept.
  88
  89 =head1 CONSTRUCTOR
  90
  91 =head2 new
  92
  93 The constructor.  Takes a hash or hashref of the following arguments:
  94
  95 =over
  96
  97 =item * tradition - The Text::Tradition object to which the collation
  98 belongs. Required.
  99
 100 =item * linear - Whether the collation should be linear; that is, whether
 101 transposed readings should be treated as two linked readings rather than one,
 102 and therefore whether the collation graph is acyclic.  Defaults to true.
 103
 104 =item * baselabel - The default label for the path taken by a base text
 105 (if any). Defaults to 'base text'.
 106
 107 =item * wit_list_separator - The string to join a list of witnesses for
 108 purposes of making labels in display graphs.  Defaults to ', '.
 109
 110 =item * ac_label - The extra label to tack onto a witness sigil when
 111 representing another layer of path for the given witness - that is, when
 112 a text has more than one possible reading due to scribal corrections or
 113 the like.  Defaults to ' (a.c.)'.
 114
 115 =item * wordsep - The string used to separate words in the original text.
 116 Defaults to ' '.
 117
 118 =back
 119
 120 =head1 ACCESSORS
 121
 122 =head2 tradition
 123
 124 =head2 linear
 125
 126 =head2 wit_list_separator
 127
 128 =head2 baselabel
 129
 130 =head2 ac_label
 131
 132 =head2 wordsep
 133
 134 Simple accessors for collation attributes.
 135
 136 =head2 start
 137
 138 The meta-reading at the start of every witness path.
 139
 140 =head2 end
 141
 142 The meta-reading at the end of every witness path.
 143
 144 =head2 readings
 145
 146 Returns all Reading objects in the graph.
 147
 148 =head2 reading( $id )
 149
 150 Returns the Reading object corresponding to the given ID.
 151
 152 =head2 add_reading( $reading_args )
 153
 154 Adds a new reading object to the collation.
 155 See L<Text::Tradition::Collation::Reading> for the available arguments.
 156
 157 =head2 del_reading( $object_or_id )
 158
 159 Removes the given reading from the collation, implicitly removing its
 160 paths and relationships.
 161
 162 =head2 has_reading( $id )
 163
 164 Predicate to see whether a given reading ID is in the graph.
 165
 166 =head2 reading_witnesses( $object_or_id )
 167
 168 Returns a list of sigils whose witnesses contain the reading.
 169
 170 =head2 paths
 171
 172 Returns all reading paths within the document - that is, all edges in the
 173 collation graph.  Each path is an arrayref of [ $source, $target ] reading IDs.
 174
 175 =head2 add_path( $source, $target, $sigil )
 176
 177 Links the given readings in the collation in sequence, under the given witness
 178 sigil.  The readings may be specified by object or ID.
 179
 180 =head2 del_path( $source, $target, $sigil )
 181
 182 Links the given readings in the collation in sequence, under the given witness
 183 sigil.  The readings may be specified by object or ID.
 184
 185 =head2 has_path( $source, $target );
 186
 187 Returns true if the two readings are linked in sequence in any witness.
 188 The readings may be specified by object or ID.
 189
 190 =head2 relationships
 191
 192 Returns all Relationship objects in the collation.
 193
 194 =head2 add_relationship( $reading, $other_reading, $options )
 195
 196 Adds a new relationship of the type given in $options between the two readings,
 197 which may be specified by object or ID.  Returns a value of ( $status, @vectors)
 198 where $status is true on success, and @vectors is a list of relationship edges
 199 that were ultimately added.
 200 See L<Text::Tradition::Collation::Relationship> for the available options.
 201
 202 =cut
 203
 204 sub BUILDARGS {
 205         my ( $class, @args ) = @_;
 206         my %args = @args == 1 ? %{ $args[0] } : @args;
 207         # TODO determine these from the Moose::Meta object
 208         my @delegate_attrs = qw(sequence relations readings wit_list_separator baselabel
 209                 linear wordsep start end cached_table _graphcalc_done);
 210         my %data_args;
 211         for my $attr (@delegate_attrs) {
 212                 $data_args{$attr} = delete $args{$attr} if exists $args{$attr};
 213         }
 214         $args{_data} = Text::Tradition::Collation::Data->new(%data_args);
 215         return \%args;
 216 }
 217
 218 sub BUILD {
 219     my $self = shift;
 220     $self->_set_relations( Text::Tradition::Collation::RelationshipStore->new( 'collation' => $self ) );
 221     $self->_set_start( $self->add_reading(
 222         { 'collation' => $self, 'is_start' => 1, 'init' => 1 } ) );
 223     $self->_set_end( $self->add_reading(
 224         { 'collation' => $self, 'is_end' => 1, 'init' => 1 } ) );
 225 }
 226
 227 sub register_relationship_type {
 228         my $self = shift;
 229         my %args = @_ == 1 ? %{$_[0]} : @_;
 230         if( $self->relations->has_type( $args{name} ) ) {
 231                 throw( 'Relationship type ' . $args{name} . ' already registered' );
 232         }
 233         $self->relations->add_type( %args );
 234 }
 235
 236 sub get_relationship_type {
 237         my( $self, $name ) = @_;
 238                 return $self->relations->has_type( $name )
 239                         ? $self->relations->type( $name ) : undef;
 240 }
 241
 242 ### Reading construct/destruct functions
 243
 244 sub add_reading {
 245         my( $self, $reading ) = @_;
 246         unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
 247                 my %args = %$reading;
 248                 if( $args{'init'} ) {
 249                         # If we are initializing an empty collation, don't assume that we
 250                         # have set a tradition.
 251                         delete $args{'init'};
 252                 } elsif( $self->tradition->can('language') && $self->tradition->has_language
 253                         && !exists $args{'language'} ) {
 254                         $args{'language'} = $self->tradition->language;
 255                 }
 256                 $reading = Text::Tradition::Collation::Reading->new(
 257                         'collation' => $self,
 258                         %args );
 259         }
 260         # First check to see if a reading with this ID exists.
 261         if( $self->reading( $reading->id ) ) {
 262                 throw( "Collation already has a reading with id " . $reading->id );
 263         }
 264         $self->_graphcalc_done(0);
 265         $self->_add_reading( $reading->id => $reading );
 266         # Once the reading has been added, put it in both graphs.
 267         $self->sequence->add_vertex( $reading->id );
 268         $self->relations->add_reading( $reading->id );
 269         return $reading;
 270 };
 271
 272 around del_reading => sub {
 273         my $orig = shift;
 274         my $self = shift;
 275         my $arg = shift;
 276
 277         if( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
 278                 $arg = $arg->id;
 279         }
 280         # Remove the reading from the graphs.
 281         $self->_graphcalc_done(0);
 282         $self->_clear_cache; # Explicitly clear caches to GC the reading
 283         $self->sequence->delete_vertex( $arg );
 284         $self->relations->delete_reading( $arg );
 285
 286         # Carry on.
 287         $self->$orig( $arg );
 288 };
 289
 290 =head2 merge_readings( $main, $second, $concatenate, $with_str )
 291
 292 Merges the $second reading into the $main one. If $concatenate is true, then
 293 the merged node will carry the text of both readings, concatenated with either
 294 $with_str (if specified) or a sensible default (the empty string if the
 295 appropriate 'join_*' flag is set on either reading, or else $self->wordsep.)
 296
 297 The first two arguments may be either readings or reading IDs.
 298
 299 =begin testing
 300
 301 use Text::Tradition;
 302
 303 my $cxfile = 't/data/Collatex-16.xml';
 304 my $t = Text::Tradition->new(
 305     'name'  => 'inline',
 306     'input' => 'CollateX',
 307     'file'  => $cxfile,
 308     );
 309 my $c = $t->collation;
 310
 311 my $rno = scalar $c->readings;
 312 # Split n21 ('unto') for testing purposes
 313 my $new_r = $c->add_reading( { 'id' => 'n21p0', 'text' => 'un', 'join_next' => 1 } );
 314 my $old_r = $c->reading( 'n21' );
 315 $old_r->alter_text( 'to' );
 316 $c->del_path( 'n20', 'n21', 'A' );
 317 $c->add_path( 'n20', 'n21p0', 'A' );
 318 $c->add_path( 'n21p0', 'n21', 'A' );
 319 $c->add_relationship( 'n21', 'n22', { type => 'collated', scope => 'local' } );
 320 $c->flatten_ranks();
 321 ok( $c->reading( 'n21p0' ), "New reading exists" );
 322 is( scalar $c->readings, $rno, "Reading add offset by flatten_ranks" );
 323
 324 # Combine n3 and n4 ( with his )
 325 $c->merge_readings( 'n3', 'n4', 1 );
 326 ok( !$c->reading('n4'), "Reading n4 is gone" );
 327 is( $c->reading('n3')->text, 'with his', "Reading n3 has both words" );
 328
 329 # Collapse n9 and n10 ( rood / root )
 330 $c->merge_readings( 'n9', 'n10' );
 331 ok( !$c->reading('n10'), "Reading n10 is gone" );
 332 is( $c->reading('n9')->text, 'rood', "Reading n9 has an unchanged word" );
 333
 334 # Combine n21 and n21p0
 335 my $remaining = $c->reading('n21');
 336 $remaining ||= $c->reading('n22');  # one of these should still exist
 337 $c->merge_readings( 'n21p0', $remaining, 1 );
 338 ok( !$c->reading('n21'), "Reading $remaining is gone" );
 339 is( $c->reading('n21p0')->text, 'unto', "Reading n21p0 merged correctly" );
 340
 341 =end testing
 342
 343 =cut
 344
 345 sub merge_readings {
 346         my $self = shift;
 347
 348         # Sanity check
 349         my( $kept_obj, $del_obj, $combine, $combine_char ) = $self->_objectify_args( @_ );
 350         my $mergemeta = $kept_obj->is_meta;
 351         throw( "Cannot merge meta and non-meta reading" )
 352                 unless ( $mergemeta && $del_obj->is_meta )
 353                         || ( !$mergemeta && !$del_obj->is_meta );
 354         if( $mergemeta ) {
 355                 throw( "Cannot merge with start or end node" )
 356                         if( $kept_obj eq $self->start || $kept_obj eq $self->end
 357                                 || $del_obj eq $self->start || $del_obj eq $self->end );
 358                 throw( "Cannot combine text of meta readings" ) if $combine;
 359         }
 360         # We only need the IDs for adding paths to the graph, not the reading
 361         # objects themselves.
 362         my $kept = $kept_obj->id;
 363         my $deleted = $del_obj->id;
 364         $self->_graphcalc_done(0);
 365
 366     # The kept reading should inherit the paths and the relationships
 367     # of the deleted reading.
 368         foreach my $path ( $self->sequence->edges_at( $deleted ) ) {
 369                 my @vector = ( $kept );
 370                 push( @vector, $path->[1] ) if $path->[0] eq $deleted;
 371                 unshift( @vector, $path->[0] ) if $path->[1] eq $deleted;
 372                 next if $vector[0] eq $vector[1]; # Don't add a self loop
 373                 my %wits = %{$self->sequence->get_edge_attributes( @$path )};
 374                 $self->sequence->add_edge( @vector );
 375                 my $fwits = $self->sequence->get_edge_attributes( @vector );
 376                 @wits{keys %$fwits} = values %$fwits;
 377                 $self->sequence->set_edge_attributes( @vector, \%wits );
 378         }
 379         $self->relations->merge_readings( $kept, $deleted, $combine );
 380
 381         # Do the deletion deed.
 382         if( $combine ) {
 383                 # Combine the text of the readings
 384                 my $joinstr = $combine_char;
 385                 unless( defined $joinstr ) {
 386                         $joinstr = '' if $kept_obj->join_next || $del_obj->join_prior;
 387                         $joinstr = $self->wordsep unless defined $joinstr;
 388                 }
 389                 $kept_obj->_combine( $del_obj, $joinstr );
 390         }
 391         $self->del_reading( $deleted );
 392 }
 393
 394 =head2 merge_related( @relationship_types )
 395
 396 Merge all readings linked with the relationship types given. If any of the selected type(s) is not a colocation, the graph will no longer be linear. The majority/plurality reading in each case will be the one kept.
 397
 398 WARNING: This operation cannot be undone.
 399
 400 =cut
 401
 402 =begin testing
 403
 404 use Text::Tradition;
 405 use TryCatch;
 406
 407 my $t = Text::Tradition->new(
 408     'name'  => 'inline',
 409     'input' => 'Self',
 410     'file'  => 't/data/legendfrag.xml',
 411     );
 412 my $c = $t->collation;
 413
 414 my %rdg_ids;
 415 map { $rdg_ids{$_} = 1 } $c->readings;
 416 $c->merge_related( 'orthographic' );
 417 is( scalar( $c->readings ), keys( %rdg_ids ) - 8,
 418         "Successfully collapsed orthographic variation" );
 419 map { $rdg_ids{$_} = undef } qw/ r13.3 r11.4 r8.5 r8.2 r7.7 r7.5 r7.4 r7.1 /;
 420 foreach my $rid ( keys %rdg_ids ) {
 421         my $exp = $rdg_ids{$rid};
 422         is( !$c->reading( $rid ), !$exp, "Reading $rid correctly " .
 423                 ( $exp ? "retained" : "removed" ) );
 424 }
 425 ok( $c->linear, "Graph is still linear" );
 426 try {
 427         $c->calculate_ranks; # This should succeed
 428         ok( 1, "Can still calculate ranks on the new graph" );
 429 } catch {
 430         ok( 0, "Rank calculation on merged graph failed: $@" );
 431 }
 432
 433 # Now add some transpositions
 434 $c->add_relationship( 'r8.4', 'r10.4', { type => 'transposition' } );
 435 $c->merge_related( 'transposition' );
 436 is( scalar( $c->readings ), keys( %rdg_ids ) - 9,
 437         "Transposed relationship is merged away" );
 438 ok( !$c->reading('r8.4'), "Correct transposed reading removed" );
 439 ok( !$c->linear, "Graph is no longer linear" );
 440 try {
 441         $c->calculate_ranks; # This should fail
 442         ok( 0, "Rank calculation happened on nonlinear graph?!" );
 443 } catch ( Text::Tradition::Error $e ) {
 444         is( $e->message, 'Cannot calculate ranks on a non-linear graph',
 445                 "Rank calculation on merged graph threw an error" );
 446 }
 447
 448
 449
 450 =end testing
 451
 452 =cut
 453
 454 # TODO: there should be a way to display merged without affecting the underlying data!
 455
 456 sub merge_related {
 457         my $self = shift;
 458         my %reltypehash;
 459         map { $reltypehash{$_} = 1 } @_;
 460
 461         # Set up the filter for finding related readings
 462         my $filter = sub {
 463                 exists $reltypehash{$_[0]->type};
 464         };
 465
 466         my $linear = 1;
 467         # Go through all readings looking for related ones
 468         foreach my $r ( $self->readings ) {
 469                 next unless $self->reading( "$r" ); # might have been deleted meanwhile
 470                 my @related = $self->related_readings( $r, $filter );
 471                 if( @related ) {
 472                         push( @related, $r );
 473                         @related = sort {
 474                                         scalar $b->witnesses <=> scalar $a->witnesses
 475                                 } @related;
 476                         my $keep = shift @related;
 477                         foreach my $delr ( @related ) {
 478                                 $linear = undef
 479                                         unless( $self->get_relationship( $keep, $delr )->colocated );
 480                                 $self->merge_readings( $keep, $delr );
 481                         }
 482                 }
 483         }
 484         $self->linear( $linear );
 485 }
 486
 487 =head2 compress_readings
 488
 489 Where possible in the graph, compresses plain sequences of readings into a
 490 single reading. The sequences must consist of readings with no
 491 relationships to other readings, with only a single witness path between
 492 them and no other witness paths from either that would skip the other. The
 493 readings must also not be marked as nonsense or bad grammar.
 494
 495 WARNING: This operation cannot be undone.
 496
 497 =cut
 498
 499 sub compress_readings {
 500         my $self = shift;
 501         # Anywhere in the graph that there is a reading that joins only to a single
 502         # successor, and neither of these have any relationships, just join the two
 503         # readings.
 504         foreach my $rdg ( sort { $a->rank <=> $b->rank } $self->readings ) {
 505                 # Now look for readings that can be joined to their successors.
 506                 next unless $rdg->is_combinable;
 507                 my %seen;
 508                 while( $self->sequence->successors( $rdg ) == 1 ) {
 509                         my( $next ) = $self->reading( $self->sequence->successors( $rdg ) );
 510                         throw( "Infinite loop" ) if $seen{$next->id};
 511                         $seen{$next->id} = 1;
 512                         last if $self->sequence->predecessors( $next ) > 1;
 513                         last unless $next->is_combinable;
 514                         say "Joining readings $rdg and $next";
 515                         $self->merge_readings( $rdg, $next, 1 );
 516                 }
 517         }
 518         # Make sure we haven't screwed anything up
 519         foreach my $wit ( $self->tradition->witnesses ) {
 520                 my $pathtext = $self->path_text( $wit->sigil );
 521                 my $origtext = join( ' ', @{$wit->text} );
 522                 throw( "Text differs for witness " . $wit->sigil )
 523                         unless $pathtext eq $origtext;
 524                 if( $wit->is_layered ) {
 525                         $pathtext = $self->path_text( $wit->sigil.$self->ac_label );
 526                         $origtext = join( ' ', @{$wit->layertext} );
 527                         throw( "Ante-corr text differs for witness " . $wit->sigil )
 528                                 unless $pathtext eq $origtext;
 529                 }
 530         }
 531
 532         $self->relations->rebuild_equivalence();
 533         $self->calculate_ranks();
 534 }
 535
 536 # Helper function for manipulating the graph.
 537 sub _stringify_args {
 538         my( $self, $first, $second, @args ) = @_;
 539     $first = $first->id
 540         if ref( $first ) eq 'Text::Tradition::Collation::Reading';
 541     $second = $second->id
 542         if ref( $second ) eq 'Text::Tradition::Collation::Reading';
 543     return( $first, $second, @args );
 544 }
 545
 546 # Helper function for manipulating the graph.
 547 sub _objectify_args {
 548         my( $self, $first, $second, $arg ) = @_;
 549     $first = $self->reading( $first )
 550         unless ref( $first ) eq 'Text::Tradition::Collation::Reading';
 551     $second = $self->reading( $second )
 552         unless ref( $second ) eq 'Text::Tradition::Collation::Reading';
 553     return( $first, $second, $arg );
 554 }
 555
 556 =head2 duplicate_reading( $reading, @witlist )
 557
 558 Split the given reading into two, so that the new reading is in the path for
 559 the witnesses given in @witlist. If the result is that certain non-colocated
 560 relationships (e.g. transpositions) are no longer valid, these will be removed.
 561 Returns the newly-created reading.
 562
 563 =begin testing
 564
 565 use Test::More::UTF8;
 566 use Text::Tradition;
 567 use TryCatch;
 568
 569 my $st = Text::Tradition->new( 'input' => 'Self', 'file' => 't/data/collatecorr.xml' );
 570 is( ref( $st ), 'Text::Tradition', "Got a tradition from test file" );
 571 ok( $st->has_witness('Ba96'), "Tradition has the affected witness" );
 572
 573 my $sc = $st->collation;
 574 my $numr = 17;
 575 ok( $sc->reading('n131'), "Tradition has the affected reading" );
 576 is( scalar( $sc->readings ), $numr, "There are $numr readings in the graph" );
 577 is( $sc->end->rank, 14, "There are fourteen ranks in the graph" );
 578
 579 # Detach the erroneously collated reading
 580 my( $newr, @del_rdgs ) = $sc->duplicate_reading( 'n131', 'Ba96' );
 581 ok( $newr, "New reading was created" );
 582 ok( $sc->reading('n131_0'), "Detached the bad collation with a new reading" );
 583 is( scalar( $sc->readings ), $numr + 1, "A reading was added to the graph" );
 584 is( $sc->end->rank, 10, "There are now only ten ranks in the graph" );
 585 my $csucc = $sc->common_successor( 'n131', 'n131_0' );
 586 is( $csucc->id, 'n136', "Found correct common successor to duped reading" );
 587
 588 # Check that the bad transposition is gone
 589 is( scalar @del_rdgs, 1, "Deleted reading was returned by API call" );
 590 is( $sc->get_relationship( 'n130', 'n135' ), undef, "Bad transposition relationship is gone" );
 591
 592 # The collation should not be fixed
 593 my @pairs = $sc->identical_readings();
 594 is( scalar @pairs, 0, "Not re-collated yet" );
 595 # Fix the collation
 596 ok( $sc->merge_readings( 'n124', 'n131_0' ), "Collated the readings correctly" );
 597 @pairs = $sc->identical_readings( start => 'n124', end => $csucc->id );
 598 is( scalar @pairs, 3, "Found three more identical readings" );
 599 is( $sc->end->rank, 11, "The ranks shifted appropriately" );
 600 $sc->flatten_ranks();
 601 is( scalar( $sc->readings ), $numr - 3, "Now we are collated correctly" );
 602
 603 # Check that we can't "duplicate" a reading with no wits or with all wits
 604 try {
 605         my( $badr, @del_rdgs ) = $sc->duplicate_reading( 'n124' );
 606         ok( 0, "Reading duplication without witnesses throws an error" );
 607 } catch( Text::Tradition::Error $e ) {
 608         like( $e->message, qr/Must specify one or more witnesses/,
 609                 "Reading duplication without witnesses throws the expected error" );
 610 } catch {
 611         ok( 0, "Reading duplication without witnesses threw the wrong error" );
 612 }
 613
 614 try {
 615         my( $badr, @del_rdgs ) = $sc->duplicate_reading( 'n124', 'Ba96', 'Mü11475' );
 616         ok( 0, "Reading duplication with all witnesses throws an error" );
 617 } catch( Text::Tradition::Error $e ) {
 618         like( $e->message, qr/Cannot join all witnesses/,
 619                 "Reading duplication with all witnesses throws the expected error" );
 620 } catch {
 621         ok( 0, "Reading duplication with all witnesses threw the wrong error" );
 622 }
 623
 624 =end testing
 625
 626 =cut
 627
 628 sub duplicate_reading {
 629         my( $self, $r, @wits ) = @_;
 630         # Check that we are not doing anything unwise.
 631         throw( "Must specify one or more witnesses for the duplicated reading" )
 632                 unless @wits;
 633         unless( ref( $r ) eq 'Text::Tradition::Collation::Reading' ) {
 634                 $r = $self->reading( $r );
 635         }
 636         throw( "Cannot duplicate a meta-reading" )
 637                 if $r->is_meta;
 638         throw( "Cannot join all witnesses to the new reading" )
 639                 if scalar( @wits ) == scalar( $r->witnesses );
 640
 641         # Get all the reading attributes and duplicate them.
 642         my $rmeta = Text::Tradition::Collation::Reading->meta;
 643         my %args;
 644     foreach my $attr( $rmeta->get_all_attributes ) {
 645                 next if $attr->name =~ /^_/;
 646                 my $acc = $attr->get_read_method;
 647                 if( !$acc && $attr->has_applied_traits ) {
 648                         my $tr = $attr->applied_traits;
 649                         if( $tr->[0] =~ /::(Array|Hash)$/ ) {
 650                                 my $which = $1;
 651                                 my %methods = reverse %{$attr->handles};
 652                                 $acc = $methods{elements};
 653                                 $args{$attr->name} = $which eq 'Array'
 654                                         ? [ $r->$acc ] : { $r->$acc };
 655                         }
 656                 } else {
 657                         $args{$attr->name} = $r->$acc if $acc;
 658                 }
 659         }
 660         # By definition the new reading will no longer be common.
 661         $args{is_common} = 0;
 662         # The new reading also needs its own ID.
 663         $args{id} = $self->_generate_dup_id( $r->id );
 664
 665         # Try to make the new reading.
 666         my $newr = $self->add_reading( \%args );
 667         # The old reading is also no longer common.
 668         $r->is_common( 0 );
 669
 670         # For each of the witnesses, dissociate from the old reading and
 671         # associate with the new.
 672         foreach my $wit ( @wits ) {
 673                 my $prior = $self->prior_reading( $r, $wit );
 674                 my $next = $self->next_reading( $r, $wit );
 675                 $self->del_path( $prior, $r, $wit );
 676                 $self->add_path( $prior, $newr, $wit );
 677                 $self->del_path( $r, $next, $wit );
 678                 $self->add_path( $newr, $next, $wit );
 679         }
 680
 681         # If the graph is ranked, we need to look for relationships that are now
 682         # invalid (i.e. 'non-colocation' types that might now be colocated) and
 683         # remove them. If not, we can skip it.
 684         my $succ;
 685         my %rrk;
 686         my @deleted_relations;
 687         if( $self->end->has_rank ) {
 688                 # Find the point where we can stop checking
 689                 $succ = $self->common_successor( $r, $newr );
 690
 691                 # Hash the existing ranks
 692                 foreach my $rdg ( $self->readings ) {
 693                         $rrk{$rdg->id} = $rdg->rank;
 694                 }
 695                 # Calculate the new ranks
 696                 $self->calculate_ranks();
 697
 698                 # Check for invalid non-colocated relationships among changed-rank readings
 699                 # from where the ranks start changing up to $succ
 700                 my $lastrank = $succ->rank;
 701                 foreach my $rdg ( $self->readings ) {
 702                         next if $rdg->rank > $lastrank;
 703                         next if $rdg->rank == $rrk{$rdg->id};
 704                         my @noncolo = $rdg->related_readings( sub { !$_[0]->colocated } );
 705                         next unless @noncolo;
 706                         foreach my $nc ( @noncolo ) {
 707                                 unless( $self->relations->verify_or_delete( $rdg, $nc ) ) {
 708                                         push( @deleted_relations, [ $rdg->id, $nc->id ] );
 709                                 }
 710                         }
 711                 }
 712         }
 713         return ( $newr, @deleted_relations );
 714 }
 715
 716 sub _generate_dup_id {
 717         my( $self, $rid ) = @_;
 718         my $newid;
 719         my $i = 0;
 720         while( !$newid ) {
 721                 $newid = $rid."_$i";
 722                 if( $self->has_reading( $newid ) ) {
 723                         $newid = '';
 724                         $i++;
 725                 }
 726         }
 727         return $newid;
 728 }
 729
 730 ### Path logic
 731
 732 sub add_path {
 733         my $self = shift;
 734
 735         # We only need the IDs for adding paths to the graph, not the reading
 736         # objects themselves.
 737     my( $source, $target, $wit ) = $self->_stringify_args( @_ );
 738
 739         $self->_graphcalc_done(0);
 740         # Connect the readings
 741         unless( $self->sequence->has_edge( $source, $target ) ) {
 742             $self->sequence->add_edge( $source, $target );
 743             $self->relations->add_equivalence_edge( $source, $target );
 744         }
 745     # Note the witness in question
 746     $self->sequence->set_edge_attribute( $source, $target, $wit, 1 );
 747 }
 748
 749 sub del_path {
 750         my $self = shift;
 751         my @args;
 752         if( ref( $_[0] ) eq 'ARRAY' ) {
 753                 my $e = shift @_;
 754                 @args = ( @$e, @_ );
 755         } else {
 756                 @args = @_;
 757         }
 758
 759         # We only need the IDs for removing paths from the graph, not the reading
 760         # objects themselves.
 761     my( $source, $target, $wit ) = $self->_stringify_args( @args );
 762
 763         $self->_graphcalc_done(0);
 764         if( $self->sequence->has_edge_attribute( $source, $target, $wit ) ) {
 765                 $self->sequence->delete_edge_attribute( $source, $target, $wit );
 766         }
 767         unless( $self->sequence->has_edge_attributes( $source, $target ) ) {
 768                 $self->sequence->delete_edge( $source, $target );
 769                 $self->relations->delete_equivalence_edge( $source, $target );
 770         }
 771 }
 772
 773
 774 # Extra graph-alike utility
 775 sub has_path {
 776         my $self = shift;
 777     my( $source, $target, $wit ) = $self->_stringify_args( @_ );
 778         return undef unless $self->sequence->has_edge( $source, $target );
 779         return $self->sequence->has_edge_attribute( $source, $target, $wit );
 780 }
 781
 782 =head2 clear_witness( @sigil_list )
 783
 784 Clear the given witnesses out of the collation entirely, removing references
 785 to them in paths, and removing readings that belong only to them.  Should only
 786 be called via $tradition->del_witness.
 787
 788 =cut
 789
 790 sub clear_witness {
 791         my( $self, @sigils ) = @_;
 792
 793         $self->_graphcalc_done(0);
 794         # Clear the witness(es) out of the paths
 795         foreach my $e ( $self->paths ) {
 796                 foreach my $sig ( @sigils ) {
 797                         $self->del_path( $e, $sig );
 798                 }
 799         }
 800
 801         # Clear out the newly unused readings
 802         foreach my $r ( $self->readings ) {
 803                 unless( $self->reading_witnesses( $r ) ) {
 804                         $self->del_reading( $r );
 805                 }
 806         }
 807 }
 808
 809 sub add_relationship {
 810         my $self = shift;
 811     my( $source, $target, $opts ) = $self->_stringify_args( @_ );
 812     my( @vectors ) = $self->relations->add_relationship( $source, $target, $opts );
 813     foreach my $v ( @vectors ) {
 814         next unless $self->get_relationship( $v )->colocated;
 815         if( $self->reading( $v->[0] )->has_rank && $self->reading( $v->[1] )->has_rank
 816                 && $self->reading( $v->[0] )->rank ne $self->reading( $v->[1] )->rank ) {
 817                         $self->_graphcalc_done(0);
 818                         $self->_clear_cache;
 819                         last;
 820         }
 821     }
 822     return @vectors;
 823 }
 824
 825 around qw/ get_relationship del_relationship / => sub {
 826         my $orig = shift;
 827         my $self = shift;
 828         my @args = @_;
 829         if( @args == 1 && ref( $args[0] ) eq 'ARRAY' ) {
 830                 @args = @{$_[0]};
 831         }
 832         my @stringargs = $self->_stringify_args( @args );
 833         $self->$orig( @stringargs );
 834 };
 835
 836 =head2 reading_witnesses( $reading )
 837
 838 Return a list of sigils corresponding to the witnesses in which the reading appears.
 839
 840 =cut
 841
 842 sub reading_witnesses {
 843         my( $self, $reading ) = @_;
 844         # We need only check either the incoming or the outgoing edges; I have
 845         # arbitrarily chosen "incoming".  Thus, special-case the start node.
 846         if( $reading eq $self->start ) {
 847                 return map { $_->sigil } grep { $_->is_collated } $self->tradition->witnesses;
 848         }
 849         my %all_witnesses;
 850         foreach my $e ( $self->sequence->edges_to( $reading ) ) {
 851                 my $wits = $self->sequence->get_edge_attributes( @$e );
 852                 @all_witnesses{ keys %$wits } = 1;
 853         }
 854         my $acstr = $self->ac_label;
 855         foreach my $acwit ( grep { $_ =~ s/^(.*)\Q$acstr\E$/$1/ } keys %all_witnesses ) {
 856                 delete $all_witnesses{$acwit.$acstr} if exists $all_witnesses{$acwit};
 857         }
 858         return keys %all_witnesses;
 859 }
 860
 861 =head1 OUTPUT METHODS
 862
 863 =head2 as_svg( \%options )
 864
 865 Returns an SVG string that represents the graph, via as_dot and graphviz.
 866 See as_dot for a list of options.  Must have GraphViz (dot) installed to run.
 867
 868 =cut
 869
 870 sub as_svg {
 871     my( $self, $opts ) = @_;
 872     throw( "Need GraphViz installed to output SVG" )
 873         unless File::Which::which( 'dot' );
 874     my $want_subgraph = exists $opts->{'from'} || exists $opts->{'to'};
 875     $self->calculate_ranks()
 876         unless( $self->_graphcalc_done || $opts->{'nocalc'} || !$self->linear );
 877         my @cmd = qw/dot -Tsvg/;
 878         my( $svg, $err );
 879         my $dotfile = File::Temp->new();
 880         ## USE FOR DEBUGGING
 881         # $dotfile->unlink_on_destroy(0);
 882         binmode $dotfile, ':utf8';
 883         print $dotfile $self->as_dot( $opts );
 884         push( @cmd, $dotfile->filename );
 885         run( \@cmd, ">", binary(), \$svg );
 886         $svg = decode_utf8( $svg );
 887         return $svg;
 888 }
 889
 890
 891 =head2 as_dot( \%options )
 892
 893 Returns a string that is the collation graph expressed in dot
 894 (i.e. GraphViz) format.  Options include:
 895
 896 =over 4
 897
 898 =item * from
 899
 900 =item * to
 901
 902 =item * color_common
 903
 904 =back
 905
 906 =cut
 907
 908 sub as_dot {
 909     my( $self, $opts ) = @_;
 910     my $startrank = $opts->{'from'} if $opts;
 911     my $endrank = $opts->{'to'} if $opts;
 912     my $color_common = $opts->{'color_common'} if $opts;
 913     my $STRAIGHTENHACK = !$startrank && !$endrank && $self->end->rank
 914        && $self->end->rank > 100;
 915     $STRAIGHTENHACK = 1 if $opts->{'straight'}; # even for subgraphs or small graphs
 916
 917     # Check the arguments
 918     if( $startrank ) {
 919         return if $endrank && $startrank > $endrank;
 920         return if $startrank > $self->end->rank;
 921         }
 922         if( defined $endrank ) {
 923                 return if $endrank < 0;
 924                 $endrank = undef if $endrank == $self->end->rank;
 925         }
 926
 927     my $graph_name = $self->tradition->name;
 928     $graph_name =~ s/[^\w\s]//g;
 929     $graph_name = join( '_', split( /\s+/, $graph_name ) );
 930
 931     my %graph_attrs = (
 932         'rankdir' => 'LR',
 933         'bgcolor' => 'none',
 934         );
 935     my %node_attrs = (
 936         'fontsize' => 14,
 937         'fillcolor' => 'white',
 938         'style' => 'filled',
 939         'shape' => 'ellipse'
 940         );
 941     my %edge_attrs = (
 942         'arrowhead' => 'open',
 943         'color' => '#000000',
 944         'fontcolor' => '#000000',
 945         );
 946
 947     my $dot = sprintf( "digraph %s {\n", $graph_name );
 948     $dot .= "\tgraph " . _dot_attr_string( \%graph_attrs ) . ";\n";
 949     $dot .= "\tnode " . _dot_attr_string( \%node_attrs ) . ";\n";
 950
 951         # Output substitute start/end readings if necessary
 952         if( $startrank ) {
 953                 $dot .= "\t\"__SUBSTART__\" [ label=\"...\",id=\"__START__\" ];\n";
 954         }
 955         if( $endrank ) {
 956                 $dot .= "\t\"__SUBEND__\" [ label=\"...\",id=\"__END__\" ];\n";
 957         }
 958         if( $STRAIGHTENHACK ) {
 959                 ## HACK part 1
 960                 my $startlabel = $startrank ? '__SUBSTART__' : '__START__';
 961                 $dot .= "\tsubgraph { rank=same \"$startlabel\" \"#SILENT#\" }\n";
 962                 $dot .= "\t\"#SILENT#\" [ shape=diamond,color=white,penwidth=0,label=\"\" ];"
 963         }
 964         my %used;  # Keep track of the readings that actually appear in the graph
 965         # Sort the readings by rank if we have ranks; this speeds layout.
 966         my @all_readings = $self->end->has_rank
 967                 ? sort { $a->rank <=> $b->rank } $self->readings
 968                 : $self->readings;
 969         # TODO Refrain from outputting lacuna nodes - just grey out the edges.
 970     foreach my $reading ( @all_readings ) {
 971         # Only output readings within our rank range.
 972         next if $startrank && $reading->rank < $startrank;
 973         next if $endrank && $reading->rank > $endrank;
 974         $used{$reading->id} = 1;
 975         # Need not output nodes without separate labels
 976         next if $reading->id eq $reading->text;
 977         my $rattrs;
 978         my $label = $reading->text;
 979         $label .= '-' if $reading->join_next;
 980         $label = "-$label" if $reading->join_prior;
 981         $label =~ s/\"/\\\"/g;
 982                 $rattrs->{'label'} = $label;
 983                 $rattrs->{'id'} = $reading->id;
 984                 $rattrs->{'fillcolor'} = '#b3f36d' if $reading->is_common && $color_common;
 985         $dot .= sprintf( "\t\"%s\" %s;\n", $reading->id, _dot_attr_string( $rattrs ) );
 986     }
 987
 988         # Add the real edges. Need to weight one edge per rank jump, in a
 989         # continuous line.
 990         # my $weighted = $self->_add_edge_weights;
 991     my @edges = $self->paths;
 992         my( %substart, %subend );
 993     foreach my $edge ( @edges ) {
 994         # Do we need to output this edge?
 995         if( $used{$edge->[0]} && $used{$edge->[1]} ) {
 996                 my $label = $self->_path_display_label( $opts,
 997                         $self->path_witnesses( $edge ) );
 998                         my $variables = { %edge_attrs, 'label' => $label };
 999
1000                         # Account for the rank gap if necessary
1001                         my $rank0 = $self->reading( $edge->[0] )->rank
1002                                 if $self->reading( $edge->[0] )->has_rank;
1003                         my $rank1 = $self->reading( $edge->[1] )->rank
1004                                 if $self->reading( $edge->[1] )->has_rank;
1005                         if( defined $rank0 && defined $rank1 && $rank1 - $rank0 > 1 ) {
1006                                 $variables->{'minlen'} = $rank1 - $rank0;
1007                         }
1008
1009                         # Add the calculated edge weights
1010                         # if( exists $weighted->{$edge->[0]}
1011                         #       && $weighted->{$edge->[0]} eq $edge->[1] ) {
1012                         #       # $variables->{'color'} = 'red';
1013                         #       $variables->{'weight'} = 3.0;
1014                         # }
1015
1016                         # EXPERIMENTAL: make edge width reflect no. of witnesses
1017                         my $extrawidth = scalar( $self->path_witnesses( $edge ) ) * 0.2;
1018                         $variables->{'penwidth'} = $extrawidth + 0.8; # gives 1 for a single wit
1019
1020                         my $varopts = _dot_attr_string( $variables );
1021                         $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
1022                                 $edge->[0], $edge->[1], $varopts );
1023         } elsif( $used{$edge->[0]} ) {
1024                 $subend{$edge->[0]} = $edge->[1];
1025         } elsif( $used{$edge->[1]} ) {
1026                 $substart{$edge->[1]} = $edge->[0];
1027         }
1028     }
1029
1030     # If we are asked to, add relationship links
1031     if( exists $opts->{show_relations} ) {
1032         my $filter = $opts->{show_relations}; # can be 'transposition' or 'all'
1033         if( $filter eq 'transposition' ) {
1034                 $filter =~ qr/^transposition$/;
1035         }
1036         foreach my $redge ( $self->relationships ) {
1037                 if( $used{$redge->[0]} && $used{$redge->[1]} ) {
1038                         if( $filter ne 'all' ) {
1039                                 my $rel = $self->get_relationship( $redge );
1040                                 next unless $rel->type =~ /$filter/;
1041                                         my $variables = {
1042                                                 arrowhead => 'none',
1043                                                 color => '#FFA14F',
1044                                                 constraint => 'false',
1045                                                 label => uc( substr( $rel->type, 0, 4 ) ),
1046                                                 penwidth => '3',
1047                                         };
1048                                         $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
1049                                                 $redge->[0], $redge->[1], _dot_attr_string( $variables ) );
1050                                 }
1051                 }
1052         }
1053     }
1054
1055     # Add substitute start and end edges if necessary
1056     foreach my $node ( keys %substart ) {
1057         my $witstr = $self->_path_display_label( $opts,
1058                 $self->path_witnesses( $substart{$node}, $node ) );
1059         my $variables = { %edge_attrs, 'label' => $witstr };
1060         my $nrdg = $self->reading( $node );
1061         if( $nrdg->has_rank && $nrdg->rank > $startrank ) {
1062                 # Substart is actually one lower than $startrank
1063                 $variables->{'minlen'} = $nrdg->rank - ( $startrank - 1 );
1064         }
1065         my $varopts = _dot_attr_string( $variables );
1066         $dot .= "\t\"__SUBSTART__\" -> \"$node\" $varopts;\n";
1067         }
1068     foreach my $node ( keys %subend ) {
1069         my $witstr = $self->_path_display_label( $opts,
1070                 $self->path_witnesses( $node, $subend{$node} ) );
1071         my $variables = { %edge_attrs, 'label' => $witstr };
1072         my $varopts = _dot_attr_string( $variables );
1073         $dot .= "\t\"$node\" -> \"__SUBEND__\" $varopts;\n";
1074         }
1075         # HACK part 2
1076         if( $STRAIGHTENHACK ) {
1077                 my $endlabel = $endrank ? '__SUBEND__' : '__END__';
1078                 $dot .= "\t\"$endlabel\" -> \"#SILENT#\" [ color=white,penwidth=0 ];\n";
1079         }
1080
1081     $dot .= "}\n";
1082     return $dot;
1083 }
1084
1085 sub _dot_attr_string {
1086         my( $hash ) = @_;
1087         my @attrs;
1088         foreach my $k ( sort keys %$hash ) {
1089                 my $v = $hash->{$k};
1090                 push( @attrs, $k.'="'.$v.'"' );
1091         }
1092         return( '[ ' . join( ', ', @attrs ) . ' ]' );
1093 }
1094
1095 sub _add_edge_weights {
1096         my $self = shift;
1097         # Walk the graph from START to END, choosing the successor node with
1098         # the largest number of witness paths each time.
1099         my $weighted = {};
1100         my $curr = $self->start->id;
1101         my $ranked = $self->end->has_rank;
1102         while( $curr ne $self->end->id ) {
1103                 my $rank = $ranked ? $self->reading( $curr )->rank : 0;
1104                 my @succ = sort { $self->path_witnesses( $curr, $a )
1105                                                         <=> $self->path_witnesses( $curr, $b ) }
1106                         $self->sequence->successors( $curr );
1107                 my $next = pop @succ;
1108                 my $nextrank = $ranked ? $self->reading( $next )->rank : 0;
1109                 # Try to avoid lacunae in the weighted path.
1110                 while( @succ &&
1111                            ( $self->reading( $next )->is_lacuna ||
1112                                  $nextrank - $rank > 1 ) ){
1113                         $next = pop @succ;
1114                 }
1115                 $weighted->{$curr} = $next;
1116                 $curr = $next;
1117         }
1118         return $weighted;
1119 }
1120
1121 =head2 path_witnesses( $edge )
1122
1123 Returns the list of sigils whose witnesses are associated with the given edge.
1124 The edge can be passed as either an array or an arrayref of ( $source, $target ).
1125
1126 =cut
1127
1128 sub path_witnesses {
1129         my( $self, @edge ) = @_;
1130         # If edge is an arrayref, cope.
1131         if( @edge == 1 && ref( $edge[0] ) eq 'ARRAY' ) {
1132                 my $e = shift @edge;
1133                 @edge = @$e;
1134         }
1135         my @wits = keys %{$self->sequence->get_edge_attributes( @edge )};
1136         return @wits;
1137 }
1138
1139 # Helper function. Make a display label for the given witnesses, showing a.c.
1140 # witnesses only where the main witness is not also in the list.
1141 sub _path_display_label {
1142         my $self = shift;
1143         my $opts = shift;
1144         my %wits;
1145         map { $wits{$_} = 1 } @_;
1146
1147         # If an a.c. wit is listed, remove it if the main wit is also listed.
1148         # Otherwise keep it for explicit listing.
1149         my $aclabel = $self->ac_label;
1150         my @disp_ac;
1151         foreach my $w ( sort keys %wits ) {
1152                 if( $w =~ /^(.*)\Q$aclabel\E$/ ) {
1153                         if( exists $wits{$1} ) {
1154                                 delete $wits{$w};
1155                         } else {
1156                                 push( @disp_ac, $w );
1157                         }
1158                 }
1159         }
1160
1161         if( $opts->{'explicit_wits'} ) {
1162                 return join( ', ', sort keys %wits );
1163         } else {
1164                 # See if we are in a majority situation.
1165                 my $maj = scalar( $self->tradition->witnesses ) * 0.6;
1166                 $maj = $maj > 5 ? $maj : 5;
1167                 if( scalar keys %wits > $maj ) {
1168                         unshift( @disp_ac, 'majority' );
1169                         return join( ', ', @disp_ac );
1170                 } else {
1171                         return join( ', ', sort keys %wits );
1172                 }
1173         }
1174 }
1175
1176 =head2 readings_at_rank( $rank )
1177
1178 Returns a list of readings at a given rank, taken from the alignment table.
1179
1180 =cut
1181
1182 sub readings_at_rank {
1183         my( $self, $rank ) = @_;
1184         my $table = $self->alignment_table;
1185         # Table rank is real rank - 1.
1186         my @elements = map { $_->{'tokens'}->[$rank-1] } @{$table->{'alignment'}};
1187         my %readings;
1188         foreach my $e ( @elements ) {
1189                 next unless ref( $e ) eq 'HASH';
1190                 next unless exists $e->{'t'};
1191                 $readings{$e->{'t'}->id} = $e->{'t'};
1192         }
1193         return values %readings;
1194 }
1195
1196 =head2 as_graphml
1197
1198 Returns a GraphML representation of the collation.  The GraphML will contain
1199 two graphs. The first expresses the attributes of the readings and the witness
1200 paths that link them; the second expresses the relationships that link the
1201 readings.  This is the native transfer format for a tradition.
1202
1203 =begin testing
1204
1205 use Text::Tradition;
1206 use TryCatch;
1207
1208 my $READINGS = 311;
1209 my $PATHS = 361;
1210
1211 my $datafile = 't/data/florilegium_tei_ps.xml';
1212 my $tradition = Text::Tradition->new( 'input' => 'TEI',
1213                                       'name' => 'test0',
1214                                       'file' => $datafile,
1215                                       'linear' => 1 );
1216
1217 ok( $tradition, "Got a tradition object" );
1218 is( scalar $tradition->witnesses, 13, "Found all witnesses" );
1219 ok( $tradition->collation, "Tradition has a collation" );
1220
1221 my $c = $tradition->collation;
1222 is( scalar $c->readings, $READINGS, "Collation has all readings" );
1223 is( scalar $c->paths, $PATHS, "Collation has all paths" );
1224 is( scalar $c->relationships, 0, "Collation has all relationships" );
1225
1226 # Add a few relationships
1227 $c->add_relationship( 'w123', 'w125', { 'type' => 'collated' } );
1228 $c->add_relationship( 'w193', 'w196', { 'type' => 'collated' } );
1229 $c->add_relationship( 'w257', 'w262', { 'type' => 'transposition' } );
1230
1231 # Now write it to GraphML and parse it again.
1232
1233 my $graphml = $c->as_graphml;
1234 my $st = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml );
1235 is( scalar $st->collation->readings, $READINGS, "Reparsed collation has all readings" );
1236 is( scalar $st->collation->paths, $PATHS, "Reparsed collation has all paths" );
1237 is( scalar $st->collation->relationships, 3, "Reparsed collation has new relationships" );
1238
1239 # Now add a stemma, write to GraphML, and look at the output.
1240 SKIP: {
1241         skip "Analysis module not present", 3 unless $tradition->can( 'add_stemma' );
1242         my $stemma = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
1243         is( ref( $stemma ), 'Text::Tradition::Stemma', "Parsed dotfile into stemma" );
1244         is( $tradition->stemmata, 1, "Tradition now has the stemma" );
1245         $graphml = $c->as_graphml;
1246         like( $graphml, qr/digraph/, "Digraph declaration exists in GraphML" );
1247 }
1248
1249 =end testing
1250
1251 =cut
1252
1253 ## TODO MOVE this to Tradition.pm and modularize it better
1254 sub as_graphml {
1255     my( $self, $options ) = @_;
1256         $self->calculate_ranks unless $self->_graphcalc_done;
1257
1258         my $start = $options->{'from'}
1259                 ? $self->reading( $options->{'from'} ) : $self->start;
1260         my $end = $options->{'to'}
1261                 ? $self->reading( $options->{'to'} ) : $self->end;
1262         if( $start->has_rank && $end->has_rank && $end->rank < $start->rank ) {
1263                 throw( 'Start node must be before end node' );
1264         }
1265         # The readings need to be ranked for this to work.
1266         $start = $self->start unless $start->has_rank;
1267         $end = $self->end unless $end->has_rank;
1268         my $rankoffset = 0;
1269         unless( $start eq $self->start ) {
1270                 $rankoffset = $start->rank - 1;
1271         }
1272         my %use_readings;
1273
1274     # Some namespaces
1275     my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns';
1276     my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance';
1277     my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' .
1278         'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
1279
1280     # Create the document and root node
1281     require XML::LibXML;
1282     my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
1283     my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
1284     $graphml->setDocumentElement( $root );
1285     $root->setNamespace( $xsi_ns, 'xsi', 0 );
1286     $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
1287
1288     # List of attribute types to save on our objects and their corresponding
1289     # GraphML types
1290     my %save_types = (
1291         'Str' => 'string',
1292         'Int' => 'int',
1293         'Bool' => 'boolean',
1294         'ReadingID' => 'string',
1295         'RelationshipType' => 'string',
1296         'RelationshipScope' => 'string',
1297     );
1298
1299     # Add the data keys for the graph. Include an extra key 'version' for the
1300     # GraphML output version.
1301     my %graph_data_keys;
1302     my $gdi = 0;
1303     my %graph_attributes = ( 'version' => 'string' );
1304         # Graph attributes include those of Tradition and those of Collation.
1305         my %gattr_from;
1306         # TODO Use meta introspection method from duplicate_reading to do this
1307         # instead of naming custom keys.
1308         my $tmeta = $self->tradition->meta;
1309         my $cmeta = $self->meta;
1310         map { $gattr_from{$_->name} = 'Tradition' } $tmeta->get_all_attributes;
1311         map { $gattr_from{$_->name} = 'Collation' } $cmeta->get_all_attributes;
1312         foreach my $attr ( ( $tmeta->get_all_attributes, $cmeta->get_all_attributes ) ) {
1313                 next if $attr->name =~ /^_/;
1314                 next unless $save_types{$attr->type_constraint->name};
1315                 $graph_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1316         }
1317     # Extra custom keys for complex objects that should be saved in some form.
1318     # The subroutine should return a string, or undef/empty.
1319     if( $tmeta->has_method('stemmata') ) {
1320                 $graph_attributes{'stemmata'} = sub {
1321                         my @stemstrs;
1322                         map { push( @stemstrs, $_->editable( {linesep => ''} ) ) }
1323                                 $self->tradition->stemmata;
1324                         join( "\n", @stemstrs );
1325                 };
1326         }
1327
1328         if( $tmeta->has_method('user') ) {
1329                 $graph_attributes{'user'} = sub {
1330                         $self->tradition->user ? $self->tradition->user->id : undef
1331                 };
1332         }
1333
1334     foreach my $datum ( sort keys %graph_attributes ) {
1335         $graph_data_keys{$datum} = 'dg'.$gdi++;
1336         my $key = $root->addNewChild( $graphml_ns, 'key' );
1337         my $dtype = ref( $graph_attributes{$datum} ) ? 'string'
1338                 : $graph_attributes{$datum};
1339         $key->setAttribute( 'attr.name', $datum );
1340         $key->setAttribute( 'attr.type', $dtype );
1341         $key->setAttribute( 'for', 'graph' );
1342         $key->setAttribute( 'id', $graph_data_keys{$datum} );
1343     }
1344
1345     # Add the data keys for reading nodes
1346     my %reading_attributes;
1347     my $rmeta = Text::Tradition::Collation::Reading->meta;
1348     foreach my $attr( $rmeta->get_all_attributes ) {
1349                 next if $attr->name =~ /^_/;
1350                 next unless $save_types{$attr->type_constraint->name};
1351                 $reading_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1352         }
1353         if( $self->start->does('Text::Tradition::Morphology' ) ) {
1354                 # Extra custom key for the reading morphology
1355                 $reading_attributes{'lexemes'} = 'string';
1356         }
1357
1358     my %node_data_keys;
1359     my $ndi = 0;
1360     foreach my $datum ( sort keys %reading_attributes ) {
1361         $node_data_keys{$datum} = 'dn'.$ndi++;
1362         my $key = $root->addNewChild( $graphml_ns, 'key' );
1363         $key->setAttribute( 'attr.name', $datum );
1364         $key->setAttribute( 'attr.type', $reading_attributes{$datum} );
1365         $key->setAttribute( 'for', 'node' );
1366         $key->setAttribute( 'id', $node_data_keys{$datum} );
1367     }
1368
1369     # Add the data keys for edges, that is, paths and relationships. Path
1370     # data does not come from a Moose class so is here manually.
1371     my $edi = 0;
1372     my %edge_data_keys;
1373     my %edge_attributes = (
1374         witness => 'string',                    # ID/label for a path
1375         extra => 'boolean',                             # Path key
1376         );
1377     my @path_attributes = keys %edge_attributes; # track our manual additions
1378     my $pmeta = Text::Tradition::Collation::Relationship->meta;
1379     foreach my $attr( $pmeta->get_all_attributes ) {
1380                 next if $attr->name =~ /^_/;
1381                 next unless $save_types{$attr->type_constraint->name};
1382                 $edge_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1383         }
1384     foreach my $datum ( sort keys %edge_attributes ) {
1385         $edge_data_keys{$datum} = 'de'.$edi++;
1386         my $key = $root->addNewChild( $graphml_ns, 'key' );
1387         $key->setAttribute( 'attr.name', $datum );
1388         $key->setAttribute( 'attr.type', $edge_attributes{$datum} );
1389         $key->setAttribute( 'for', 'edge' );
1390         $key->setAttribute( 'id', $edge_data_keys{$datum} );
1391     }
1392
1393     # Add the collation graph itself. First, sanitize the name to a valid XML ID.
1394     my $xmlidname = $self->tradition->name;
1395     $xmlidname =~ s/(?!$xml10_namechar_rx)./_/g;
1396     if( $xmlidname !~ /^$xml10_namestartchar_rx/ ) {
1397         $xmlidname = '_'.$xmlidname;
1398     }
1399     my $sgraph = $root->addNewChild( $graphml_ns, 'graph' );
1400     $sgraph->setAttribute( 'edgedefault', 'directed' );
1401     $sgraph->setAttribute( 'id', $xmlidname );
1402     $sgraph->setAttribute( 'parse.edgeids', 'canonical' );
1403     $sgraph->setAttribute( 'parse.edges', 0 ); # fill in later
1404     $sgraph->setAttribute( 'parse.nodeids', 'canonical' );
1405     $sgraph->setAttribute( 'parse.nodes', 0 ); # fill in later
1406     $sgraph->setAttribute( 'parse.order', 'nodesfirst' );
1407
1408     # Tradition/collation attribute data
1409     foreach my $datum ( keys %graph_attributes ) {
1410         my $value;
1411         if( $datum eq 'version' ) {
1412                 $value = '3.2';
1413         } elsif( ref( $graph_attributes{$datum} ) ) {
1414                 my $sub = $graph_attributes{$datum};
1415                 $value = &$sub();
1416         } elsif( $gattr_from{$datum} eq 'Tradition' ) {
1417                 $value = $self->tradition->$datum;
1418         } else {
1419                 $value = $self->$datum;
1420         }
1421                 _add_graphml_data( $sgraph, $graph_data_keys{$datum}, $value );
1422         }
1423
1424     my $node_ctr = 0;
1425     my %node_hash;
1426     # Add our readings to the graph
1427     foreach my $n ( sort { $a->id cmp $b->id } $self->readings ) {
1428         next if $n->has_rank && $n ne $self->start && $n ne $self->end &&
1429                 ( $n->rank < $start->rank || $n->rank > $end->rank );
1430         $use_readings{$n->id} = 1;
1431         # Add to the main graph
1432         my $node_el = $sgraph->addNewChild( $graphml_ns, 'node' );
1433         my $node_xmlid = 'n' . $node_ctr++;
1434         $node_hash{ $n->id } = $node_xmlid;
1435         $node_el->setAttribute( 'id', $node_xmlid );
1436         foreach my $d ( keys %reading_attributes ) {
1437                 my $nval = $n->$d;
1438                 # Custom serialization
1439                 if( $d eq 'lexemes' ) {
1440                                 # If nval is a true value, we have lexemes so we need to
1441                                 # serialize them. Otherwise set nval to undef so that the
1442                                 # key is excluded from this reading.
1443                         $nval = $nval ? $n->_serialize_lexemes : undef;
1444                 } elsif( $d eq 'normal_form' && $n->normal_form eq $n->text ) {
1445                         $nval = undef;
1446                 }
1447                 if( $rankoffset && $d eq 'rank' && $n ne $self->start ) {
1448                         # Adjust the ranks within the subgraph.
1449                         $nval = $n eq $self->end ? $end->rank - $rankoffset + 1
1450                                 : $nval - $rankoffset;
1451                 }
1452                 _add_graphml_data( $node_el, $node_data_keys{$d}, $nval )
1453                         if defined $nval;
1454         }
1455     }
1456
1457     # Add the path edges to the sequence graph
1458     my $edge_ctr = 0;
1459     foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->sequence->edges() ) {
1460         # We add an edge in the graphml for every witness in $e.
1461         next unless( $use_readings{$e->[0]} || $use_readings{$e->[1]} );
1462         my @edge_wits = sort $self->path_witnesses( $e );
1463         $e->[0] = $self->start->id unless $use_readings{$e->[0]};
1464         $e->[1] = $self->end->id unless $use_readings{$e->[1]};
1465         # Skip any path from start to end; that witness is not in the subgraph.
1466         next if ( $e->[0] eq $self->start->id && $e->[1] eq $self->end->id );
1467         foreach my $wit ( @edge_wits ) {
1468                         my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
1469                                                                                 $node_hash{ $e->[0] },
1470                                                                                 $node_hash{ $e->[1] } );
1471                         my $edge_el = $sgraph->addNewChild( $graphml_ns, 'edge' );
1472                         $edge_el->setAttribute( 'source', $from );
1473                         $edge_el->setAttribute( 'target', $to );
1474                         $edge_el->setAttribute( 'id', $id );
1475
1476                         # It's a witness path, so add the witness
1477                         my $base = $wit;
1478                         my $key = $edge_data_keys{'witness'};
1479                         # Is this an ante-corr witness?
1480                         my $aclabel = $self->ac_label;
1481                         if( $wit =~ /^(.*)\Q$aclabel\E$/ ) {
1482                                 # Keep the base witness
1483                                 $base = $1;
1484                                 # ...and record that this is an 'extra' reading path
1485                                 _add_graphml_data( $edge_el, $edge_data_keys{'extra'}, $aclabel );
1486                         }
1487                         _add_graphml_data( $edge_el, $edge_data_keys{'witness'}, $base );
1488                 }
1489         }
1490
1491         # Report the actual number of nodes and edges that went in
1492         $sgraph->setAttribute( 'parse.edges', $edge_ctr );
1493         $sgraph->setAttribute( 'parse.nodes', $node_ctr );
1494
1495         # Add the relationship graph to the XML
1496         map { delete $edge_data_keys{$_} } @path_attributes;
1497         $self->relations->_as_graphml( $graphml_ns, $root, \%node_hash,
1498                 $node_data_keys{'id'}, \%edge_data_keys );
1499
1500     # Save and return the thing
1501     my $result = decode_utf8( $graphml->toString(1) );
1502     return $result;
1503 }
1504
1505 sub _add_graphml_data {
1506     my( $el, $key, $value ) = @_;
1507     return unless defined $value;
1508     my $data_el = $el->addNewChild( $el->namespaceURI, 'data' );
1509     $data_el->setAttribute( 'key', $key );
1510     $data_el->appendText( $value );
1511 }
1512
1513 =head2 as_csv
1514
1515 Returns a CSV alignment table representation of the collation graph, one
1516 row per witness (or witness uncorrected.)
1517
1518 =head2 as_tsv
1519
1520 Returns a tab-separated alignment table representation of the collation graph,
1521 one row per witness (or witness uncorrected.)
1522
1523 =begin testing
1524
1525 use Text::Tradition;
1526 use Text::CSV;
1527
1528 my $READINGS = 311;
1529 my $PATHS = 361;
1530 my $WITS = 13;
1531 my $WITAC = 4;
1532
1533 my $datafile = 't/data/florilegium_tei_ps.xml';
1534 my $tradition = Text::Tradition->new( 'input' => 'TEI',
1535                                       'name' => 'test0',
1536                                       'file' => $datafile,
1537                                       'linear' => 1 );
1538
1539 my $c = $tradition->collation;
1540 # Export the thing to CSV
1541 my $csvstr = $c->as_csv();
1542 # Count the columns
1543 my $csv = Text::CSV->new({ sep_char => ',', binary => 1 });
1544 my @lines = split(/\n/, $csvstr );
1545 ok( $csv->parse( $lines[0] ), "Successfully parsed first line of CSV" );
1546 is( scalar( $csv->fields ), $WITS + $WITAC, "CSV has correct number of witness columns" );
1547 my @q_ac = grep { $_ eq 'Q'.$c->ac_label } $csv->fields;
1548 ok( @q_ac, "Found a layered witness" );
1549
1550 my $t2 = Text::Tradition->new( input => 'Tabular',
1551                                                            name => 'test2',
1552                                                            string => $csvstr,
1553                                                            sep_char => ',' );
1554 is( scalar $t2->collation->readings, $READINGS, "Reparsed CSV collation has all readings" );
1555 is( scalar $t2->collation->paths, $PATHS, "Reparsed CSV collation has all paths" );
1556
1557 # Now do it with TSV
1558 my $tsvstr = $c->as_tsv();
1559 my $t3 = Text::Tradition->new( input => 'Tabular',
1560                                                            name => 'test3',
1561                                                            string => $tsvstr,
1562                                                            sep_char => "\t" );
1563 is( scalar $t3->collation->readings, $READINGS, "Reparsed TSV collation has all readings" );
1564 is( scalar $t3->collation->paths, $PATHS, "Reparsed TSV collation has all paths" );
1565
1566 my $table = $c->alignment_table;
1567 my $noaccsv = $c->as_csv({ noac => 1 });
1568 my @noaclines = split(/\n/, $noaccsv );
1569 ok( $csv->parse( $noaclines[0] ), "Successfully parsed first line of no-ac CSV" );
1570 is( scalar( $csv->fields ), $WITS, "CSV has correct number of witness columns" );
1571 is( $c->alignment_table, $table, "Request for CSV did not alter the alignment table" );
1572
1573 my $safecsv = $c->as_csv({ safe_ac => 1});
1574 my @safelines = split(/\n/, $safecsv );
1575 ok( $csv->parse( $safelines[0] ), "Successfully parsed first line of safe CSV" );
1576 is( scalar( $csv->fields ), $WITS + $WITAC, "CSV has correct number of witness columns" );
1577 @q_ac = grep { $_ eq 'Q__L' } $csv->fields;
1578 ok( @q_ac, "Found a sanitized layered witness" );
1579 is( $c->alignment_table, $table, "Request for CSV did not alter the alignment table" );
1580
1581 =end testing
1582
1583 =cut
1584
1585 sub _tabular {
1586     my( $self, $opts ) = @_;
1587     my $table = $self->alignment_table( $opts );
1588         my $csv_options = { binary => 1, quote_null => 0 };
1589         $csv_options->{'sep_char'} = $opts->{fieldsep};
1590         if( $opts->{fieldsep} eq "\t" ) {
1591                 # If it is really tab separated, nothing is an escape char.
1592                 $csv_options->{'quote_char'} = undef;
1593                 $csv_options->{'escape_char'} = '';
1594         }
1595     my $csv = Text::CSV->new( $csv_options );
1596     my @result;
1597     # Make the header row
1598     $csv->combine( map { $_->{'witness'} } @{$table->{'alignment'}} );
1599         push( @result, $csv->string );
1600     # Make the rest of the rows
1601     foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
1602         my @rowobjs = map { $_->{'tokens'}->[$idx] } @{$table->{'alignment'}};
1603         my @row = map { $_ ? $_->{'t'}->text : $_ } @rowobjs;
1604         $csv->combine( @row );
1605         push( @result, $csv->string );
1606     }
1607     return join( "\n", @result );
1608 }
1609
1610 sub as_csv {
1611         my $self = shift;
1612         my $opts = shift || {};
1613         $opts->{fieldsep} = ',';
1614         return $self->_tabular( $opts );
1615 }
1616
1617 sub as_tsv {
1618         my $self = shift;
1619         my $opts = shift || {};
1620         $opts->{fieldsep} = "\t";
1621         return $self->_tabular( $opts );
1622 }
1623
1624 =head2 alignment_table
1625
1626 Return a reference to an alignment table, in a slightly enhanced CollateX
1627 format which looks like this:
1628
1629  $table = { alignment => [ { witness => "SIGIL",
1630                              tokens => [ { t => "TEXT" }, ... ] },
1631                            { witness => "SIG2",
1632                              tokens => [ { t => "TEXT" }, ... ] },
1633                            ... ],
1634             length => TEXTLEN };
1635
1636 =cut
1637
1638 sub alignment_table {
1639     my( $self, $opts ) = @_;
1640     if( $self->has_cached_table ) {
1641                 return $self->cached_table
1642                         unless $opts->{noac} || $opts->{safe_ac};
1643     }
1644
1645     # Make sure we can do this
1646         throw( "Need a linear graph in order to make an alignment table" )
1647                 unless $self->linear;
1648     $self->calculate_ranks()
1649         unless $self->_graphcalc_done && $self->end->has_rank;
1650
1651     my $table = { 'alignment' => [], 'length' => $self->end->rank - 1 };
1652     my @all_pos = ( 1 .. $self->end->rank - 1 );
1653     foreach my $wit ( sort { $a->sigil cmp $b->sigil } $self->tradition->witnesses ) {
1654         # say STDERR "Making witness row(s) for " . $wit->sigil;
1655         my @wit_path = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
1656         my @row = _make_witness_row( \@wit_path, \@all_pos );
1657         my $witobj = { 'witness' => $wit->sigil, 'tokens' => \@row };
1658         $witobj->{'identifier'} = $wit->identifier if $wit->identifier;
1659         push( @{$table->{'alignment'}}, $witobj );
1660         if( $wit->is_layered && !$opts->{noac} ) {
1661                 my @wit_ac_path = $self->reading_sequence( $self->start, $self->end,
1662                         $wit->sigil.$self->ac_label );
1663             my @ac_row = _make_witness_row( \@wit_ac_path, \@all_pos );
1664             my $witlabel = $opts->{safe_ac}
1665                 ? $wit->sigil . '__L' : $wit->sigil.$self->ac_label;
1666             my $witacobj = { 'witness' => $witlabel,
1667                 'tokens' => \@ac_row };
1668             $witacobj->{'identifier'} = $wit->identifier if $wit->identifier;
1669                         push( @{$table->{'alignment'}}, $witacobj );
1670         }
1671     }
1672     unless( $opts->{noac} || $opts->{safe_ac} ) {
1673             $self->cached_table( $table );
1674         }
1675     return $table;
1676 }
1677
1678 sub _make_witness_row {
1679     my( $path, $positions ) = @_;
1680     my %char_hash;
1681     map { $char_hash{$_} = undef } @$positions;
1682     my $debug = 0;
1683     foreach my $rdg ( @$path ) {
1684         say STDERR "rank " . $rdg->rank if $debug;
1685         # say STDERR "No rank for " . $rdg->id unless defined $rdg->rank;
1686         $char_hash{$rdg->rank} = { 't' => $rdg };
1687     }
1688     my @row = map { $char_hash{$_} } @$positions;
1689     # Fill in lacuna markers for undef spots in the row
1690     my $last_el = shift @row;
1691     my @filled_row = ( $last_el );
1692     foreach my $el ( @row ) {
1693         # If we are using node reference, make the lacuna node appear many times
1694         # in the table.  If not, use the lacuna tag.
1695         if( $last_el && $last_el->{'t'}->is_lacuna && !defined $el ) {
1696             $el = $last_el;
1697         }
1698         push( @filled_row, $el );
1699         $last_el = $el;
1700     }
1701     return @filled_row;
1702 }
1703
1704
1705 =head1 NAVIGATION METHODS
1706
1707 =head2 reading_sequence( $first, $last, $sigil, $backup )
1708
1709 Returns the ordered list of readings, starting with $first and ending
1710 with $last, for the witness given in $sigil. If a $backup sigil is
1711 specified (e.g. when walking a layered witness), it will be used wherever
1712 no $sigil path exists.  If there is a base text reading, that will be
1713 used wherever no path exists for $sigil or $backup.
1714
1715 =cut
1716
1717 # TODO Think about returning some lazy-eval iterator.
1718 # TODO Get rid of backup; we should know from what witness is whether we need it.
1719
1720 sub reading_sequence {
1721     my( $self, $start, $end, $witness ) = @_;
1722
1723     $witness = $self->baselabel unless $witness;
1724     my @readings = ( $start );
1725     my %seen;
1726     my $n = $start;
1727     while( $n && $n->id ne $end->id ) {
1728         if( exists( $seen{$n->id} ) ) {
1729             throw( "Detected loop for $witness at " . $n->id );
1730         }
1731         $seen{$n->id} = 1;
1732
1733         my $next = $self->next_reading( $n, $witness );
1734         unless( $next ) {
1735             throw( "Did not find any path for $witness from reading " . $n->id );
1736         }
1737         push( @readings, $next );
1738         $n = $next;
1739     }
1740     # Check that the last reading is our end reading.
1741     my $last = $readings[$#readings];
1742     throw( "Last reading found from " . $start->text .
1743         " for witness $witness is not the end!" ) # TODO do we get this far?
1744         unless $last->id eq $end->id;
1745
1746     return @readings;
1747 }
1748
1749 =head2 next_reading( $reading, $sigil );
1750
1751 Returns the reading that follows the given reading along the given witness
1752 path.
1753
1754 =cut
1755
1756 sub next_reading {
1757     # Return the successor via the corresponding path.
1758     my $self = shift;
1759     my $answer = $self->_find_linked_reading( 'next', @_ );
1760         return undef unless $answer;
1761     return $self->reading( $answer );
1762 }
1763
1764 =head2 prior_reading( $reading, $sigil )
1765
1766 Returns the reading that precedes the given reading along the given witness
1767 path.
1768
1769 =cut
1770
1771 sub prior_reading {
1772     # Return the predecessor via the corresponding path.
1773     my $self = shift;
1774     my $answer = $self->_find_linked_reading( 'prior', @_ );
1775     return $self->reading( $answer );
1776 }
1777
1778 sub _find_linked_reading {
1779     my( $self, $direction, $node, $path ) = @_;
1780
1781     # Get a backup if we are dealing with a layered witness
1782     my $alt_path;
1783     my $aclabel = $self->ac_label;
1784     if( $path && $path =~ /^(.*)\Q$aclabel\E$/ ) {
1785         $alt_path = $1;
1786     }
1787
1788     my @linked_paths = $direction eq 'next'
1789         ? $self->sequence->edges_from( $node )
1790         : $self->sequence->edges_to( $node );
1791     return undef unless scalar( @linked_paths );
1792
1793     # We have to find the linked path that contains all of the
1794     # witnesses supplied in $path.
1795     my( @path_wits, @alt_path_wits );
1796     @path_wits = sort( $self->_witnesses_of_label( $path ) ) if $path;
1797     @alt_path_wits = sort( $self->_witnesses_of_label( $alt_path ) ) if $alt_path;
1798     my $base_le;
1799     my $alt_le;
1800     foreach my $le ( @linked_paths ) {
1801         if( $self->sequence->has_edge_attribute( @$le, $self->baselabel ) ) {
1802             $base_le = $le;
1803         }
1804                 my @le_wits = sort $self->path_witnesses( $le );
1805                 if( _is_within( \@path_wits, \@le_wits ) ) {
1806                         # This is the right path.
1807                         return $direction eq 'next' ? $le->[1] : $le->[0];
1808                 } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) {
1809                         $alt_le = $le;
1810                 }
1811     }
1812     # Got this far? Return the alternate path if it exists.
1813     return $direction eq 'next' ? $alt_le->[1] : $alt_le->[0]
1814         if $alt_le;
1815
1816     # Got this far? Return the base path if it exists.
1817     return $direction eq 'next' ? $base_le->[1] : $base_le->[0]
1818         if $base_le;
1819
1820     # Got this far? We have no appropriate path.
1821     warn "Could not find $direction node from " . $node->id
1822         . " along path $path";
1823     return undef;
1824 }
1825
1826 # Some set logic.
1827 sub _is_within {
1828     my( $set1, $set2 ) = @_;
1829     my $ret = @$set1; # will be 0, i.e. false, if set1 is empty
1830     foreach my $el ( @$set1 ) {
1831         $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
1832     }
1833     return $ret;
1834 }
1835
1836 # Return the string that joins together a list of witnesses for
1837 # display on a single path.
1838 sub _witnesses_of_label {
1839     my( $self, $label ) = @_;
1840     my $regex = $self->wit_list_separator;
1841     my @answer = split( /\Q$regex\E/, $label );
1842     return @answer;
1843 }
1844
1845 =head2 common_readings
1846
1847 Returns the list of common readings in the graph (i.e. those readings that are
1848 shared by all non-lacunose witnesses.)
1849
1850 =cut
1851
1852 sub common_readings {
1853         my $self = shift;
1854         my @common = grep { $_->is_common } $self->readings;
1855         return @common;
1856 }
1857
1858 =head2 path_text( $sigil, [, $start, $end ] )
1859
1860 Returns the text of a witness (plus its backup, if we are using a layer)
1861 as stored in the collation.  The text is returned as a string, where the
1862 individual readings are joined with spaces and the meta-readings (e.g.
1863 lacunae) are omitted.  Optional specification of $start and $end allows
1864 the generation of a subset of the witness text.
1865
1866 =cut
1867
1868 sub path_text {
1869         my( $self, $wit, $start, $end ) = @_;
1870         $start = $self->start unless $start;
1871         $end = $self->end unless $end;
1872         my @path = grep { !$_->is_meta } $self->reading_sequence( $start, $end, $wit );
1873         my $pathtext = '';
1874         my $last;
1875         foreach my $r ( @path ) {
1876                 unless ( $r->join_prior || !$last || $last->join_next ) {
1877                         $pathtext .= ' ';
1878                 }
1879                 $pathtext .= $r->text;
1880                 $last = $r;
1881         }
1882         return $pathtext;
1883 }
1884
1885 =head1 INITIALIZATION METHODS
1886
1887 These are mostly for use by parsers.
1888
1889 =head2 make_witness_path( $witness )
1890
1891 Link the array of readings contained in $witness->path (and in
1892 $witness->uncorrected_path if it exists) into collation paths.
1893 Clear out the arrays when finished.
1894
1895 =head2 make_witness_paths
1896
1897 Call make_witness_path for all witnesses in the tradition.
1898
1899 =cut
1900
1901 # For use when a collation is constructed from a base text and an apparatus.
1902 # We have the sequences of readings and just need to add path edges.
1903 # When we are done, clear out the witness path attributes, as they are no
1904 # longer needed.
1905 # TODO Find a way to replace the witness path attributes with encapsulated functions?
1906
1907 sub make_witness_paths {
1908     my( $self ) = @_;
1909     foreach my $wit ( $self->tradition->witnesses ) {
1910         # say STDERR "Making path for " . $wit->sigil;
1911         $self->make_witness_path( $wit );
1912     }
1913 }
1914
1915 sub make_witness_path {
1916     my( $self, $wit ) = @_;
1917     my @chain = @{$wit->path};
1918     my $sig = $wit->sigil;
1919     # Add start and end if necessary
1920     unshift( @chain, $self->start ) unless $chain[0] eq $self->start;
1921     push( @chain, $self->end ) unless $chain[-1] eq $self->end;
1922     foreach my $idx ( 0 .. $#chain-1 ) {
1923         $self->add_path( $chain[$idx], $chain[$idx+1], $sig );
1924     }
1925     if( $wit->is_layered ) {
1926         @chain = @{$wit->uncorrected_path};
1927                 unshift( @chain, $self->start ) unless $chain[0] eq $self->start;
1928                 push( @chain, $self->end ) unless $chain[-1] eq $self->end;
1929         foreach my $idx( 0 .. $#chain-1 ) {
1930             my $source = $chain[$idx];
1931             my $target = $chain[$idx+1];
1932             $self->add_path( $source, $target, $sig.$self->ac_label )
1933                 unless $self->has_path( $source, $target, $sig );
1934         }
1935     }
1936     $wit->clear_path;
1937     $wit->clear_uncorrected_path;
1938 }
1939
1940 =head2 calculate_ranks
1941
1942 Calculate the reading ranks (that is, their aligned positions relative
1943 to each other) for the graph.  This can only be called on linear collations.
1944
1945 =begin testing
1946
1947 use Text::Tradition;
1948
1949 my $cxfile = 't/data/Collatex-16.xml';
1950 my $t = Text::Tradition->new(
1951     'name'  => 'inline',
1952     'input' => 'CollateX',
1953     'file'  => $cxfile,
1954     );
1955 my $c = $t->collation;
1956
1957 # Make an svg
1958 my $table = $c->alignment_table;
1959 ok( $c->has_cached_table, "Alignment table was cached" );
1960 is( $c->alignment_table, $table, "Cached table returned upon second call" );
1961 $c->calculate_ranks;
1962 is( $c->alignment_table, $table, "Cached table retained with no rank change" );
1963 $c->add_relationship( 'n13', 'n23', { type => 'repetition' } );
1964 is( $c->alignment_table, $table, "Alignment table unchanged after non-colo relationship add" );
1965 $c->add_relationship( 'n24', 'n23', { type => 'spelling' } );
1966 isnt( $c->alignment_table, $table, "Alignment table changed after colo relationship add" );
1967
1968 =end testing
1969
1970 =cut
1971
1972 sub calculate_ranks {
1973     my $self = shift;
1974     # Save the existing ranks, in case we need to invalidate the cached SVG.
1975     throw( "Cannot calculate ranks on a non-linear graph" )
1976         unless $self->linear;
1977     my %existing_ranks;
1978     map { $existing_ranks{$_} = $_->rank } $self->readings;
1979
1980     # Do the rankings based on the relationship equivalence graph, starting
1981     # with the start node.
1982     my ( $node_ranks, $rank_nodes ) = $self->relations->equivalence_ranks();
1983
1984     # Transfer our rankings from the topological graph to the real one.
1985     foreach my $r ( $self->readings ) {
1986         if( defined $node_ranks->{$self->equivalence( $r->id )} ) {
1987             $r->rank( $node_ranks->{$self->equivalence( $r->id )} );
1988         } else {
1989                 # Die. Find the last rank we calculated.
1990                 my @all_defined = sort { ( $node_ranks->{$self->equivalence( $a->id )}||-1 )
1991                                  <=> ( $node_ranks->{$self->equivalence( $b->id )}||-1 ) }
1992                         $self->readings;
1993                 my $last = pop @all_defined;
1994             throw( "Ranks not calculated after $last - do you have a cycle in the graph?" );
1995         }
1996     }
1997     # Do we need to invalidate the cached data?
1998     if( $self->has_cached_table ) {
1999         foreach my $r ( $self->readings ) {
2000                 next if defined( $existing_ranks{$r} )
2001                         && $existing_ranks{$r} == $r->rank;
2002                 # Something has changed, so clear the cache
2003                 $self->_clear_cache;
2004                         # ...and recalculate the common readings.
2005                         $self->calculate_common_readings();
2006                 last;
2007         }
2008     }
2009         # The graph calculation information is now up to date.
2010         $self->_graphcalc_done(1);
2011 }
2012
2013 sub _clear_cache {
2014         my $self = shift;
2015         $self->wipe_table if $self->has_cached_table;
2016 }
2017
2018
2019 =head2 flatten_ranks
2020
2021 A convenience method for parsing collation data.  Searches the graph for readings
2022 with the same text at the same rank, and merges any that are found.
2023
2024 =cut
2025
2026 sub flatten_ranks {
2027     my ( $self, %args ) = shift;
2028     my %unique_rank_rdg;
2029     my $changed;
2030     foreach my $p ( $self->identical_readings( %args ) ) {
2031                 # say STDERR "Combining readings at same rank: @$p";
2032                 $changed = 1;
2033                 $self->merge_readings( @$p );
2034                 # TODO see if this now makes a common point.
2035     }
2036     # If we merged readings, the ranks are still fine but the alignment
2037     # table is wrong. Wipe it.
2038     $self->wipe_table() if $changed;
2039 }
2040
2041 =head2 identical_readings
2042 =head2 identical_readings( start => $startnode, end => $endnode )
2043 =head2 identical_readings( startrank => $startrank, endrank => $endrank )
2044
2045 Goes through the graph identifying all pairs of readings that appear to be
2046 identical, and therefore able to be merged into a single reading. Returns the
2047 relevant identical pairs. Can be restricted to run over only a part of the
2048 graph, specified either by node or by rank.
2049
2050 =cut
2051
2052 sub identical_readings {
2053         my ( $self, %args ) = @_;
2054     # Find where we should start and end.
2055     my $startrank = $args{startrank} || 0;
2056     if( $args{start} ) {
2057         throw( "Starting reading has no rank" ) unless $self->reading( $args{start} )
2058                 && $self->reading( $args{start} )->has_rank;
2059         $startrank = $self->reading( $args{start} )->rank;
2060     }
2061     my $endrank = $args{endrank} || $self->end->rank;
2062     if( $args{end} ) {
2063         throw( "Ending reading has no rank" ) unless $self->reading( $args{end} )
2064                 && $self->reading( $args{end} )->has_rank;
2065         $endrank = $self->reading( $args{end} )->rank;
2066     }
2067
2068     # Make sure the ranks are correct.
2069     unless( $self->_graphcalc_done ) {
2070         $self->calculate_ranks;
2071     }
2072     # Go through the readings looking for duplicates.
2073     my %unique_rank_rdg;
2074     my @pairs;
2075     foreach my $rdg ( $self->readings ) {
2076         next unless $rdg->has_rank;
2077         my $rk = $rdg->rank;
2078         next if $rk > $endrank || $rk < $startrank;
2079         my $key = $rk . "||" . $rdg->text;
2080         if( exists $unique_rank_rdg{$key} ) {
2081                 # Make sure they don't have different grammatical forms
2082                         my $ur = $unique_rank_rdg{$key};
2083                 if( $rdg->is_identical( $ur ) ) {
2084                                 push( @pairs, [ $ur, $rdg ] );
2085                         }
2086         } else {
2087             $unique_rank_rdg{$key} = $rdg;
2088         }
2089     }
2090
2091     return @pairs;
2092 }
2093
2094
2095 =head2 calculate_common_readings
2096
2097 Goes through the graph identifying the readings that appear in every witness
2098 (apart from those with lacunae at that spot.) Marks them as common and returns
2099 the list.
2100
2101 =begin testing
2102
2103 use Text::Tradition;
2104
2105 my $cxfile = 't/data/Collatex-16.xml';
2106 my $t = Text::Tradition->new(
2107     'name'  => 'inline',
2108     'input' => 'CollateX',
2109     'file'  => $cxfile,
2110     );
2111 my $c = $t->collation;
2112
2113 my @common = $c->calculate_common_readings();
2114 is( scalar @common, 8, "Found correct number of common readings" );
2115 my @marked = sort $c->common_readings();
2116 is( scalar @common, 8, "All common readings got marked as such" );
2117 my @expected = qw/ n1 n11 n16 n19 n20 n5 n6 n7 /;
2118 is_deeply( \@marked, \@expected, "Found correct list of common readings" );
2119
2120 =end testing
2121
2122 =cut
2123
2124 sub calculate_common_readings {
2125         my $self = shift;
2126         my @common;
2127         map { $_->is_common( 0 ) } $self->readings;
2128         # Implicitly calls calculate_ranks
2129         my $table = $self->alignment_table;
2130         foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
2131                 my @row = map { $_->{'tokens'}->[$idx]
2132                                                         ? $_->{'tokens'}->[$idx]->{'t'} : '' }
2133                                         @{$table->{'alignment'}};
2134                 my %hash;
2135                 foreach my $r ( @row ) {
2136                         if( $r ) {
2137                                 $hash{$r->id} = $r unless $r->is_meta;
2138                         } else {
2139                                 $hash{'UNDEF'} = $r;
2140                         }
2141                 }
2142                 if( keys %hash == 1 && !exists $hash{'UNDEF'} ) {
2143                         my( $r ) = values %hash;
2144                         $r->is_common( 1 );
2145                         push( @common, $r );
2146                 }
2147         }
2148         return @common;
2149 }
2150
2151 =head2 text_from_paths
2152
2153 Calculate the text array for all witnesses from the path, for later consistency
2154 checking.  Only to be used if there is no non-graph-based way to know the
2155 original texts.
2156
2157 =cut
2158
2159 sub text_from_paths {
2160         my $self = shift;
2161     foreach my $wit ( $self->tradition->witnesses ) {
2162         my @readings = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
2163         my @text;
2164         foreach my $r ( @readings ) {
2165                 next if $r->is_meta;
2166                 push( @text, $r->text );
2167         }
2168         $wit->text( \@text );
2169         if( $wit->is_layered ) {
2170                         my @ucrdgs = $self->reading_sequence( $self->start, $self->end,
2171                                                                                                   $wit->sigil.$self->ac_label );
2172                         my @uctext;
2173                         foreach my $r ( @ucrdgs ) {
2174                                 next if $r->is_meta;
2175                                 push( @uctext, $r->text );
2176                         }
2177                         $wit->layertext( \@uctext );
2178         }
2179     }
2180 }
2181
2182 =head1 UTILITY FUNCTIONS
2183
2184 =head2 common_predecessor( $reading_a, $reading_b )
2185
2186 Find the last reading that occurs in sequence before both the given readings.
2187 At the very least this should be $self->start.
2188
2189 =head2 common_successor( $reading_a, $reading_b )
2190
2191 Find the first reading that occurs in sequence after both the given readings.
2192 At the very least this should be $self->end.
2193
2194 =begin testing
2195
2196 use Text::Tradition;
2197
2198 my $cxfile = 't/data/Collatex-16.xml';
2199 my $t = Text::Tradition->new(
2200     'name'  => 'inline',
2201     'input' => 'CollateX',
2202     'file'  => $cxfile,
2203     );
2204 my $c = $t->collation;
2205
2206 is( $c->common_predecessor( 'n24', 'n23' )->id,
2207     'n20', "Found correct common predecessor" );
2208 is( $c->common_successor( 'n24', 'n23' )->id,
2209     '__END__', "Found correct common successor" );
2210
2211 is( $c->common_predecessor( 'n19', 'n17' )->id,
2212     'n16', "Found correct common predecessor for readings on same path" );
2213 is( $c->common_successor( 'n21', 'n10' )->id,
2214     '__END__', "Found correct common successor for readings on same path" );
2215
2216 =end testing
2217
2218 =cut
2219
2220 ## Return the closest reading that is a predecessor of both the given readings.
2221 sub common_predecessor {
2222         my $self = shift;
2223         my( $r1, $r2 ) = $self->_objectify_args( @_ );
2224         return $self->_common_in_path( $r1, $r2, 'predecessors' );
2225 }
2226
2227 sub common_successor {
2228         my $self = shift;
2229         my( $r1, $r2 ) = $self->_objectify_args( @_ );
2230         return $self->_common_in_path( $r1, $r2, 'successors' );
2231 }
2232
2233
2234 # TODO think about how to do this without ranks...
2235 sub _common_in_path {
2236         my( $self, $r1, $r2, $dir ) = @_;
2237         my $iter = $self->end->rank;
2238         my @candidates;
2239         my @last_r1 = ( $r1 );
2240         my @last_r2 = ( $r2 );
2241         # my %all_seen = ( $r1 => 'r1', $r2 => 'r2' );
2242         my %all_seen;
2243         # say STDERR "Finding common $dir for $r1, $r2";
2244         while( !@candidates ) {
2245                 last unless $iter--;  # Avoid looping infinitely
2246                 # Iterate separately down the graph from r1 and r2
2247                 my( @new_lc1, @new_lc2 );
2248                 foreach my $lc ( @last_r1 ) {
2249                         foreach my $p ( $lc->$dir ) {
2250                                 if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r1' ) {
2251                                         # say STDERR "Path candidate $p from $lc";
2252                                         push( @candidates, $p );
2253                                 } elsif( !$all_seen{$p->id} ) {
2254                                         $all_seen{$p->id} = 'r1';
2255                                         push( @new_lc1, $p );
2256                                 }
2257                         }
2258                 }
2259                 foreach my $lc ( @last_r2 ) {
2260                         foreach my $p ( $lc->$dir ) {
2261                                 if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r2' ) {
2262                                         # say STDERR "Path candidate $p from $lc";
2263                                         push( @candidates, $p );
2264                                 } elsif( !$all_seen{$p->id} ) {
2265                                         $all_seen{$p->id} = 'r2';
2266                                         push( @new_lc2, $p );
2267                                 }
2268                         }
2269                 }
2270                 @last_r1 = @new_lc1;
2271                 @last_r2 = @new_lc2;
2272         }
2273         my @answer = sort { $a->rank <=> $b->rank } @candidates;
2274         return $dir eq 'predecessors' ? pop( @answer ) : shift ( @answer );
2275 }
2276
2277 sub throw {
2278         Text::Tradition::Error->throw(
2279                 'ident' => 'Collation error',
2280                 'message' => $_[0],
2281                 );
2282 }
2283
2284 no Moose;
2285 __PACKAGE__->meta->make_immutable;
2286
2287 =head1 BUGS/TODO
2288
2289 =over
2290
2291 =item * Rework XML serialization in a more modular way
2292
2293 =back
2294
2295 =head1 LICENSE
2296
2297 This package is free software and is provided "as is" without express
2298 or implied warranty.  You can redistribute it and/or modify it under
2299 the same terms as Perl itself.
2300
2301 =head1 AUTHOR
2302
2303 Tara L Andrews E<lt>aurum@cpan.orgE<gt>