base/lib/Text/Tradition/Collation.pm

   1 package Text::Tradition::Collation;
   2
   3 use feature 'say';
   4 use Encode qw( decode_utf8 );
   5 use File::Temp;
   6 use File::Which;
   7 use Graph;
   8 use IPC::Run qw( run binary );
   9 use Text::CSV;
  10 use Text::Tradition::Collation::Data;
  11 use Text::Tradition::Collation::Reading;
  12 use Text::Tradition::Collation::RelationshipStore;
  13 use Text::Tradition::Error;
  14 use XML::Easy::Syntax qw( $xml10_namestartchar_rx $xml10_namechar_rx );
  15 use XML::LibXML;
  16 use XML::LibXML::XPathContext;
  17 use Moose;
  18
  19 has _data => (
  20         isa      => 'Text::Tradition::Collation::Data',
  21         is       => 'ro',
  22         required => 1,
  23         handles  => [ qw(
  24                 sequence
  25                 paths
  26                 _set_relations
  27                 relations
  28                 _set_start
  29                 _set_end
  30                 ac_label
  31                 has_cached_table
  32                 relationships
  33                 relationship_types
  34                 related_readings
  35                 get_relationship
  36                 del_relationship
  37                 equivalence
  38                 equivalence_graph
  39                 readings
  40                 reading
  41                 _add_reading
  42                 del_reading
  43                 has_reading
  44                 wit_list_separator
  45                 baselabel
  46                 linear
  47                 wordsep
  48                 start
  49                 end
  50                 cached_table
  51                 _graphcalc_done
  52                 has_cached_svg
  53                 wipe_table
  54         )]
  55 );
  56
  57 has 'tradition' => (
  58     is => 'ro',
  59     isa => 'Text::Tradition',
  60     writer => '_set_tradition',
  61     weak_ref => 1,
  62     );
  63
  64 =encoding utf8
  65
  66 =head1 NAME
  67
  68 Text::Tradition::Collation - a software model for a text collation
  69
  70 =head1 SYNOPSIS
  71
  72   use Text::Tradition;
  73   my $t = Text::Tradition->new(
  74     'name' => 'this is a text',
  75     'input' => 'TEI',
  76     'file' => '/path/to/tei_parallel_seg_file.xml' );
  77
  78   my $c = $t->collation;
  79   my @readings = $c->readings;
  80   my @paths = $c->paths;
  81   my @relationships = $c->relationships;
  82
  83   my $svg_variant_graph = $t->collation->as_svg();
  84
  85 =head1 DESCRIPTION
  86
  87 Text::Tradition is a library for representation and analysis of collated
  88 texts, particularly medieval ones.  The Collation is the central feature of
  89 a Tradition, where the text, its sequence of readings, and its relationships
  90 between readings are actually kept.
  91
  92 =head1 CONSTRUCTOR
  93
  94 =head2 new
  95
  96 The constructor.  Takes a hash or hashref of the following arguments:
  97
  98 =over
  99
 100 =item * tradition - The Text::Tradition object to which the collation
 101 belongs. Required.
 102
 103 =item * linear - Whether the collation should be linear; that is, whether
 104 transposed readings should be treated as two linked readings rather than one,
 105 and therefore whether the collation graph is acyclic.  Defaults to true.
 106
 107 =item * baselabel - The default label for the path taken by a base text
 108 (if any). Defaults to 'base text'.
 109
 110 =item * wit_list_separator - The string to join a list of witnesses for
 111 purposes of making labels in display graphs.  Defaults to ', '.
 112
 113 =item * ac_label - The extra label to tack onto a witness sigil when
 114 representing another layer of path for the given witness - that is, when
 115 a text has more than one possible reading due to scribal corrections or
 116 the like.  Defaults to ' (a.c.)'.
 117
 118 =item * wordsep - The string used to separate words in the original text.
 119 Defaults to ' '.
 120
 121 =back
 122
 123 =head1 ACCESSORS
 124
 125 =head2 tradition
 126
 127 =head2 linear
 128
 129 =head2 wit_list_separator
 130
 131 =head2 baselabel
 132
 133 =head2 ac_label
 134
 135 =head2 wordsep
 136
 137 Simple accessors for collation attributes.
 138
 139 =head2 start
 140
 141 The meta-reading at the start of every witness path.
 142
 143 =head2 end
 144
 145 The meta-reading at the end of every witness path.
 146
 147 =head2 readings
 148
 149 Returns all Reading objects in the graph.
 150
 151 =head2 reading( $id )
 152
 153 Returns the Reading object corresponding to the given ID.
 154
 155 =head2 add_reading( $reading_args )
 156
 157 Adds a new reading object to the collation.
 158 See L<Text::Tradition::Collation::Reading> for the available arguments.
 159
 160 =head2 del_reading( $object_or_id )
 161
 162 Removes the given reading from the collation, implicitly removing its
 163 paths and relationships.
 164
 165 =head2 has_reading( $id )
 166
 167 Predicate to see whether a given reading ID is in the graph.
 168
 169 =head2 reading_witnesses( $object_or_id )
 170
 171 Returns a list of sigils whose witnesses contain the reading.
 172
 173 =head2 paths
 174
 175 Returns all reading paths within the document - that is, all edges in the
 176 collation graph.  Each path is an arrayref of [ $source, $target ] reading IDs.
 177
 178 =head2 add_path( $source, $target, $sigil )
 179
 180 Links the given readings in the collation in sequence, under the given witness
 181 sigil.  The readings may be specified by object or ID.
 182
 183 =head2 del_path( $source, $target, $sigil )
 184
 185 Links the given readings in the collation in sequence, under the given witness
 186 sigil.  The readings may be specified by object or ID.
 187
 188 =head2 has_path( $source, $target );
 189
 190 Returns true if the two readings are linked in sequence in any witness.
 191 The readings may be specified by object or ID.
 192
 193 =head2 relationships
 194
 195 Returns all Relationship objects in the collation.
 196
 197 =head2 add_relationship( $reading, $other_reading, $options )
 198
 199 Adds a new relationship of the type given in $options between the two readings,
 200 which may be specified by object or ID.  Returns a value of ( $status, @vectors)
 201 where $status is true on success, and @vectors is a list of relationship edges
 202 that were ultimately added.
 203 See L<Text::Tradition::Collation::Relationship> for the available options.
 204
 205 =cut
 206
 207 sub BUILDARGS {
 208         my ( $class, @args ) = @_;
 209         my %args = @args == 1 ? %{ $args[0] } : @args;
 210         # TODO determine these from the Moose::Meta object
 211         my @delegate_attrs = qw(sequence relations readings wit_list_separator baselabel
 212                 linear wordsep start end cached_table _graphcalc_done);
 213         my %data_args;
 214         for my $attr (@delegate_attrs) {
 215                 $data_args{$attr} = delete $args{$attr} if exists $args{$attr};
 216         }
 217         $args{_data} = Text::Tradition::Collation::Data->new(%data_args);
 218         return \%args;
 219 }
 220
 221 sub BUILD {
 222     my $self = shift;
 223     $self->_set_relations( Text::Tradition::Collation::RelationshipStore->new( 'collation' => $self ) );
 224     $self->_set_start( $self->add_reading(
 225         { 'collation' => $self, 'is_start' => 1, 'init' => 1 } ) );
 226     $self->_set_end( $self->add_reading(
 227         { 'collation' => $self, 'is_end' => 1, 'init' => 1 } ) );
 228 }
 229
 230 =head2 register_relationship_type( %relationship_definition )
 231
 232 Add a relationship type definition to this collation. The argument can be either a
 233 hash or a hashref, defining the properties of the relationship. For relationship types
 234 and their properties, see L<Text::Tradition::Collation::RelationshipType>.
 235
 236 =head2 get_relationship_type( $relationship_name )
 237
 238 Retrieve the RelationshipType object for the relationship with the given name.
 239
 240 =cut
 241
 242 sub register_relationship_type {
 243         my $self = shift;
 244         my %args = @_ == 1 ? %{$_[0]} : @_;
 245         if( $self->relations->has_type( $args{name} ) ) {
 246                 throw( 'Relationship type ' . $args{name} . ' already registered' );
 247         }
 248         $self->relations->add_type( %args );
 249 }
 250
 251 sub get_relationship_type {
 252         my( $self, $name ) = @_;
 253                 return $self->relations->has_type( $name )
 254                         ? $self->relations->type( $name ) : undef;
 255 }
 256
 257 ### Reading construct/destruct functions
 258
 259 sub add_reading {
 260         my( $self, $reading ) = @_;
 261         unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
 262                 my %args = %$reading;
 263                 if( $args{'init'} ) {
 264                         # If we are initializing an empty collation, don't assume that we
 265                         # have set a tradition.
 266                         delete $args{'init'};
 267                 } elsif( $self->tradition->can('language') && $self->tradition->has_language
 268                         && !exists $args{'language'} ) {
 269                         $args{'language'} = $self->tradition->language;
 270                 }
 271                 $reading = Text::Tradition::Collation::Reading->new(
 272                         'collation' => $self,
 273                         %args );
 274         }
 275         # First check to see if a reading with this ID exists.
 276         if( $self->reading( $reading->id ) ) {
 277                 throw( "Collation already has a reading with id " . $reading->id );
 278         }
 279         $self->_graphcalc_done(0);
 280         $self->_add_reading( $reading->id => $reading );
 281         # Once the reading has been added, put it in both graphs.
 282         $self->sequence->add_vertex( $reading->id );
 283         $self->relations->add_reading( $reading->id );
 284         return $reading;
 285 };
 286
 287 around del_reading => sub {
 288         my $orig = shift;
 289         my $self = shift;
 290         my $arg = shift;
 291
 292         if( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
 293                 $arg = $arg->id;
 294         }
 295         # Remove the reading from the graphs.
 296         $self->_graphcalc_done(0);
 297         $self->_clear_cache; # Explicitly clear caches to GC the reading
 298         $self->sequence->delete_vertex( $arg );
 299         $self->relations->delete_reading( $arg );
 300
 301         # Carry on.
 302         $self->$orig( $arg );
 303 };
 304
 305 =head2 merge_readings( $main, $second, $concatenate, $with_str )
 306
 307 Merges the $second reading into the $main one. If $concatenate is true, then
 308 the merged node will carry the text of both readings, concatenated with either
 309 $with_str (if specified) or a sensible default (the empty string if the
 310 appropriate 'join_*' flag is set on either reading, or else $self->wordsep.)
 311
 312 The first two arguments may be either readings or reading IDs.
 313
 314 =begin testing
 315
 316 use Text::Tradition;
 317 use TryCatch;
 318
 319 my $cxfile = 't/data/Collatex-16.xml';
 320 my $t = Text::Tradition->new(
 321     'name'  => 'inline',
 322     'input' => 'CollateX',
 323     'file'  => $cxfile,
 324     );
 325 my $c = $t->collation;
 326
 327 my $rno = scalar $c->readings;
 328 # Split n21 ('unto') for testing purposes
 329 my $new_r = $c->add_reading( { 'id' => 'n21p0', 'text' => 'un', 'join_next' => 1 } );
 330 my $old_r = $c->reading( 'n21' );
 331 $old_r->alter_text( 'to' );
 332 $c->del_path( 'n20', 'n21', 'A' );
 333 $c->add_path( 'n20', 'n21p0', 'A' );
 334 $c->add_path( 'n21p0', 'n21', 'A' );
 335 $c->add_relationship( 'n21', 'n22', { type => 'collated', scope => 'local' } );
 336 $c->flatten_ranks();
 337 ok( $c->reading( 'n21p0' ), "New reading exists" );
 338 is( scalar $c->readings, $rno, "Reading add offset by flatten_ranks" );
 339
 340 # Combine n3 and n4 ( with his )
 341 $c->merge_readings( 'n3', 'n4', 1 );
 342 ok( !$c->reading('n4'), "Reading n4 is gone" );
 343 is( $c->reading('n3')->text, 'with his', "Reading n3 has both words" );
 344
 345 # Collapse n9 and n10 ( rood / root )
 346 $c->merge_readings( 'n9', 'n10' );
 347 ok( !$c->reading('n10'), "Reading n10 is gone" );
 348 is( $c->reading('n9')->text, 'rood', "Reading n9 has an unchanged word" );
 349
 350 # Try to combine n21 and n21p0. This should break.
 351 my $remaining = $c->reading('n21');
 352 $remaining ||= $c->reading('n22');  # one of these should still exist
 353 try {
 354         $c->merge_readings( 'n21p0', $remaining, 1 );
 355         ok( 0, "Bad reading merge changed the graph" );
 356 } catch( Text::Tradition::Error $e ) {
 357         like( $e->message, qr/neither concatenated nor collated/, "Expected exception from bad concatenation" );
 358 } catch {
 359         ok( 0, "Unexpected error on bad reading merge: $@" );
 360 }
 361
 362 try {
 363         $c->calculate_ranks();
 364         ok( 1, "Graph is still evidently whole" );
 365 } catch( Text::Tradition::Error $e ) {
 366         ok( 0, "Caught a rank exception: " . $e->message );
 367 }
 368
 369 =end testing
 370
 371 =cut
 372
 373 sub merge_readings {
 374         my $self = shift;
 375
 376         # Sanity check
 377         my( $kept_obj, $del_obj, $combine, $combine_char ) = $self->_objectify_args( @_ );
 378         my $mergemeta = $kept_obj->is_meta;
 379         throw( "Cannot merge meta and non-meta reading" )
 380                 unless ( $mergemeta && $del_obj->is_meta )
 381                         || ( !$mergemeta && !$del_obj->is_meta );
 382         if( $mergemeta ) {
 383                 throw( "Cannot merge with start or end node" )
 384                         if( $kept_obj eq $self->start || $kept_obj eq $self->end
 385                                 || $del_obj eq $self->start || $del_obj eq $self->end );
 386                 throw( "Cannot combine text of meta readings" ) if $combine;
 387         }
 388         # We can only merge readings in a linear graph if:
 389         # - they are contiguous with only one edge between them, OR
 390         # - they are at equivalent ranks in the graph.
 391         if( $self->linear ) {
 392                 my @delpred = $del_obj->predecessors;
 393                 my @keptsuc = $kept_obj->successors;
 394                 unless ( @delpred == 1 && $delpred[0] eq $kept_obj
 395                         && @keptsuc == 1 && $keptsuc[0] eq $del_obj ) {
 396                         my( $is_ok, $msg ) = $self->relations->relationship_valid(
 397                                 $kept_obj, $del_obj, 'collated' );
 398                         unless( $is_ok ) {
 399                                 throw( "Readings $kept_obj and $del_obj can be neither concatenated nor collated" );
 400                         }
 401                 }
 402         }
 403
 404         # We only need the IDs for adding paths to the graph, not the reading
 405         # objects themselves.
 406         my $kept = $kept_obj->id;
 407         my $deleted = $del_obj->id;
 408         $self->_graphcalc_done(0);
 409
 410     # The kept reading should inherit the paths and the relationships
 411     # of the deleted reading.
 412         foreach my $path ( $self->sequence->edges_at( $deleted ) ) {
 413                 my @vector = ( $kept );
 414                 push( @vector, $path->[1] ) if $path->[0] eq $deleted;
 415                 unshift( @vector, $path->[0] ) if $path->[1] eq $deleted;
 416                 next if $vector[0] eq $vector[1]; # Don't add a self loop
 417                 my %wits = %{$self->sequence->get_edge_attributes( @$path )};
 418                 $self->sequence->add_edge( @vector );
 419                 my $fwits = $self->sequence->get_edge_attributes( @vector );
 420                 @wits{keys %$fwits} = values %$fwits;
 421                 $self->sequence->set_edge_attributes( @vector, \%wits );
 422         }
 423         $self->relations->merge_readings( $kept, $deleted, $combine );
 424
 425         # Do the deletion deed.
 426         if( $combine ) {
 427                 # Combine the text of the readings
 428                 my $joinstr = $combine_char;
 429                 unless( defined $joinstr ) {
 430                         $joinstr = '' if $kept_obj->join_next || $del_obj->join_prior;
 431                         $joinstr = $self->wordsep unless defined $joinstr;
 432                 }
 433                 $kept_obj->_combine( $del_obj, $joinstr );
 434         }
 435         $self->del_reading( $deleted );
 436 }
 437
 438 =head2 merge_related( @relationship_types )
 439
 440 Merge all readings linked with the relationship types given. If any of the selected type(s) is not a colocation, the graph will no longer be linear. The majority/plurality reading in each case will be the one kept.
 441
 442 WARNING: This operation cannot be undone.
 443
 444 =cut
 445
 446 =begin testing
 447
 448 use Test::Warn;
 449 use Text::Tradition;
 450 use TryCatch;
 451
 452 my $t;
 453 warnings_exist {
 454         $t = Text::Tradition->new( 'input' => 'Self', 'file' => 't/data/legendfrag.xml' );
 455 } [qr/Cannot set relationship on a meta reading/],
 456         "Got expected relationship drop warning on parse";
 457
 458 my $c = $t->collation;
 459
 460 my %rdg_ids;
 461 map { $rdg_ids{$_} = 1 } $c->readings;
 462 $c->merge_related( 'orthographic' );
 463 is( scalar( $c->readings ), keys( %rdg_ids ) - 9,
 464         "Successfully collapsed orthographic variation" );
 465 map { $rdg_ids{$_} = undef } qw/ r13.3 r11.4 r8.5 r8.2 r7.7 r7.5 r7.4 r7.3 r7.1 /;
 466 foreach my $rid ( keys %rdg_ids ) {
 467         my $exp = $rdg_ids{$rid};
 468         is( !$c->reading( $rid ), !$exp, "Reading $rid correctly " .
 469                 ( $exp ? "retained" : "removed" ) );
 470 }
 471 ok( $c->linear, "Graph is still linear" );
 472 try {
 473         $c->calculate_ranks; # This should succeed
 474         ok( 1, "Can still calculate ranks on the new graph" );
 475 } catch {
 476         ok( 0, "Rank calculation on merged graph failed: $@" );
 477 }
 478
 479 # Now add some transpositions
 480 $c->add_relationship( 'r8.4', 'r10.4', { type => 'transposition' } );
 481 $c->merge_related( 'transposition' );
 482 is( scalar( $c->readings ), keys( %rdg_ids ) - 10,
 483         "Transposed relationship is merged away" );
 484 ok( !$c->reading('r8.4'), "Correct transposed reading removed" );
 485 ok( !$c->linear, "Graph is no longer linear" );
 486 try {
 487         $c->calculate_ranks; # This should fail
 488         ok( 0, "Rank calculation happened on nonlinear graph?!" );
 489 } catch ( Text::Tradition::Error $e ) {
 490         is( $e->message, 'Cannot calculate ranks on a non-linear graph',
 491                 "Rank calculation on merged graph threw an error" );
 492 }
 493
 494 =end testing
 495
 496 =cut
 497
 498 # TODO: there should be a way to display merged without affecting the underlying data!
 499
 500 sub merge_related {
 501         my $self = shift;
 502         my %reltypehash;
 503         map { $reltypehash{$_} = 1 } @_;
 504
 505         # Set up the filter for finding related readings
 506         my $filter = sub {
 507                 exists $reltypehash{$_[0]->type};
 508         };
 509
 510         # Go through all readings looking for related ones
 511         foreach my $r ( $self->readings ) {
 512                 next unless $self->reading( "$r" ); # might have been deleted meanwhile
 513                 while( my @related = $self->related_readings( $r, $filter ) ) {
 514                         push( @related, $r );
 515                         @related = sort {
 516                                         scalar $b->witnesses <=> scalar $a->witnesses
 517                                 } @related;
 518                         my $keep = shift @related;
 519                         foreach my $delr ( @related ) {
 520                                 $self->linear( 0 )
 521                                         unless( $self->get_relationship( $keep, $delr )->colocated );
 522                                 $self->merge_readings( $keep, $delr );
 523                         }
 524                 }
 525         }
 526 }
 527
 528 =head2 compress_readings
 529
 530 Where possible in the graph, compresses plain sequences of readings into a
 531 single reading. The sequences must consist of readings with no
 532 relationships to other readings, with only a single witness path between
 533 them and no other witness paths from either that would skip the other. The
 534 readings must also not be marked as nonsense or bad grammar.
 535
 536 WARNING: This operation cannot be undone.
 537
 538 =cut
 539
 540 sub compress_readings {
 541         my $self = shift;
 542         # Anywhere in the graph that there is a reading that joins only to a single
 543         # successor, and neither of these have any relationships, just join the two
 544         # readings.
 545         foreach my $rdg ( sort { $a->rank <=> $b->rank } $self->readings ) {
 546                 # Now look for readings that can be joined to their successors.
 547                 next unless $rdg->is_combinable;
 548                 my %seen;
 549                 while( $self->sequence->successors( $rdg ) == 1 ) {
 550                         my( $next ) = $self->reading( $self->sequence->successors( $rdg ) );
 551                         throw( "Infinite loop" ) if $seen{$next->id};
 552                         $seen{$next->id} = 1;
 553                         last if $self->sequence->predecessors( $next ) > 1;
 554                         last unless $next->is_combinable;
 555                         say "Joining readings $rdg and $next";
 556                         $self->merge_readings( $rdg, $next, 1 );
 557                 }
 558         }
 559         # Make sure we haven't screwed anything up
 560         foreach my $wit ( $self->tradition->witnesses ) {
 561                 my $pathtext = $self->path_text( $wit->sigil );
 562                 my $origtext = join( ' ', @{$wit->text} );
 563                 throw( "Text differs for witness " . $wit->sigil )
 564                         unless $pathtext eq $origtext;
 565                 if( $wit->is_layered ) {
 566                         $pathtext = $self->path_text( $wit->sigil.$self->ac_label );
 567                         $origtext = join( ' ', @{$wit->layertext} );
 568                         throw( "Ante-corr text differs for witness " . $wit->sigil )
 569                                 unless $pathtext eq $origtext;
 570                 }
 571         }
 572
 573         $self->relations->rebuild_equivalence();
 574         $self->calculate_ranks();
 575 }
 576
 577 # Helper function for manipulating the graph.
 578 sub _stringify_args {
 579         my( $self, $first, $second, @args ) = @_;
 580     $first = $first->id
 581         if ref( $first ) eq 'Text::Tradition::Collation::Reading';
 582     $second = $second->id
 583         if ref( $second ) eq 'Text::Tradition::Collation::Reading';
 584     return( $first, $second, @args );
 585 }
 586
 587 # Helper function for manipulating the graph.
 588 sub _objectify_args {
 589         my( $self, $first, $second, $arg ) = @_;
 590     $first = $self->reading( $first )
 591         unless ref( $first ) eq 'Text::Tradition::Collation::Reading';
 592     $second = $self->reading( $second )
 593         unless ref( $second ) eq 'Text::Tradition::Collation::Reading';
 594     return( $first, $second, $arg );
 595 }
 596
 597 =head2 duplicate_reading( $reading, @witlist )
 598
 599 Split the given reading into two, so that the new reading is in the path for
 600 the witnesses given in @witlist. If the result is that certain non-colocated
 601 relationships (e.g. transpositions) are no longer valid, these will be removed.
 602 Returns the newly-created reading.
 603
 604 =begin testing
 605
 606 use Test::More::UTF8;
 607 use Text::Tradition;
 608 use TryCatch;
 609
 610 my $st = Text::Tradition->new( 'input' => 'Self', 'file' => 't/data/collatecorr.xml' );
 611 is( ref( $st ), 'Text::Tradition', "Got a tradition from test file" );
 612 ok( $st->has_witness('Ba96'), "Tradition has the affected witness" );
 613
 614 my $sc = $st->collation;
 615 my $numr = 17;
 616 ok( $sc->reading('n131'), "Tradition has the affected reading" );
 617 is( scalar( $sc->readings ), $numr, "There are $numr readings in the graph" );
 618 is( $sc->end->rank, 14, "There are fourteen ranks in the graph" );
 619
 620 # Detach the erroneously collated reading
 621 my( $newr, @del_rdgs ) = $sc->duplicate_reading( 'n131', 'Ba96' );
 622 ok( $newr, "New reading was created" );
 623 ok( $sc->reading('n131_0'), "Detached the bad collation with a new reading" );
 624 is( scalar( $sc->readings ), $numr + 1, "A reading was added to the graph" );
 625 is( $sc->end->rank, 10, "There are now only ten ranks in the graph" );
 626 my $csucc = $sc->common_successor( 'n131', 'n131_0' );
 627 is( $csucc->id, 'n136', "Found correct common successor to duped reading" );
 628
 629 # Check that the bad transposition is gone
 630 is( scalar @del_rdgs, 1, "Deleted reading was returned by API call" );
 631 is( $sc->get_relationship( 'n130', 'n135' ), undef, "Bad transposition relationship is gone" );
 632
 633 # The collation should not be fixed
 634 my @pairs = $sc->identical_readings();
 635 is( scalar @pairs, 0, "Not re-collated yet" );
 636 # Fix the collation
 637 ok( $sc->merge_readings( 'n124', 'n131_0' ), "Collated the readings correctly" );
 638 @pairs = $sc->identical_readings( start => 'n124', end => $csucc->id );
 639 is( scalar @pairs, 3, "Found three more identical readings" );
 640 is( $sc->end->rank, 11, "The ranks shifted appropriately" );
 641 $sc->flatten_ranks();
 642 is( scalar( $sc->readings ), $numr - 3, "Now we are collated correctly" );
 643
 644 # Check that we can't "duplicate" a reading with no wits or with all wits
 645 try {
 646         my( $badr, @del_rdgs ) = $sc->duplicate_reading( 'n124' );
 647         ok( 0, "Reading duplication without witnesses throws an error" );
 648 } catch( Text::Tradition::Error $e ) {
 649         like( $e->message, qr/Must specify one or more witnesses/,
 650                 "Reading duplication without witnesses throws the expected error" );
 651 } catch {
 652         ok( 0, "Reading duplication without witnesses threw the wrong error" );
 653 }
 654
 655 try {
 656         my( $badr, @del_rdgs ) = $sc->duplicate_reading( 'n124', 'Ba96', 'Mü11475' );
 657         ok( 0, "Reading duplication with all witnesses throws an error" );
 658 } catch( Text::Tradition::Error $e ) {
 659         like( $e->message, qr/Cannot join all witnesses/,
 660                 "Reading duplication with all witnesses throws the expected error" );
 661 } catch {
 662         ok( 0, "Reading duplication with all witnesses threw the wrong error" );
 663 }
 664
 665 try {
 666         $sc->calculate_ranks();
 667         ok( 1, "Graph is still evidently whole" );
 668 } catch( Text::Tradition::Error $e ) {
 669         ok( 0, "Caught a rank exception: " . $e->message );
 670 }
 671
 672 =end testing
 673
 674 =cut
 675
 676 sub duplicate_reading {
 677         my( $self, $r, @wits ) = @_;
 678         # Check that we are not doing anything unwise.
 679         throw( "Must specify one or more witnesses for the duplicated reading" )
 680                 unless @wits;
 681         unless( ref( $r ) eq 'Text::Tradition::Collation::Reading' ) {
 682                 $r = $self->reading( $r );
 683         }
 684         throw( "Cannot duplicate a meta-reading" )
 685                 if $r->is_meta;
 686         throw( "Cannot join all witnesses to the new reading" )
 687                 if scalar( @wits ) == scalar( $r->witnesses );
 688
 689         # Get all the reading attributes and duplicate them.
 690         my $rmeta = Text::Tradition::Collation::Reading->meta;
 691         my %args;
 692     foreach my $attr( $rmeta->get_all_attributes ) {
 693                 next if $attr->name =~ /^_/;
 694                 my $acc = $attr->get_read_method;
 695                 if( !$acc && $attr->has_applied_traits ) {
 696                         my $tr = $attr->applied_traits;
 697                         if( $tr->[0] =~ /::(Array|Hash)$/ ) {
 698                                 my $which = $1;
 699                                 my %methods = reverse %{$attr->handles};
 700                                 $acc = $methods{elements};
 701                                 $args{$attr->name} = $which eq 'Array'
 702                                         ? [ $r->$acc ] : { $r->$acc };
 703                         }
 704                 } else {
 705                         $args{$attr->name} = $r->$acc if $acc;
 706                 }
 707         }
 708         # By definition the new reading will no longer be common.
 709         $args{is_common} = 0;
 710         # The new reading also needs its own ID.
 711         $args{id} = $self->_generate_dup_id( $r->id );
 712
 713         # Try to make the new reading.
 714         my $newr = $self->add_reading( \%args );
 715         # The old reading is also no longer common.
 716         $r->is_common( 0 );
 717
 718         # For each of the witnesses, dissociate from the old reading and
 719         # associate with the new.
 720         foreach my $wit ( @wits ) {
 721                 my $prior = $self->prior_reading( $r, $wit );
 722                 my $next = $self->next_reading( $r, $wit );
 723                 $self->del_path( $prior, $r, $wit );
 724                 $self->add_path( $prior, $newr, $wit );
 725                 $self->del_path( $r, $next, $wit );
 726                 $self->add_path( $newr, $next, $wit );
 727         }
 728
 729         # If the graph is ranked, we need to look for relationships that are now
 730         # invalid (i.e. 'non-colocation' types that might now be colocated) and
 731         # remove them. If not, we can skip it.
 732         my $succ;
 733         my %rrk;
 734         my @deleted_relations;
 735         if( $self->end->has_rank ) {
 736                 # Find the point where we can stop checking
 737                 $succ = $self->common_successor( $r, $newr );
 738
 739                 # Hash the existing ranks
 740                 foreach my $rdg ( $self->readings ) {
 741                         $rrk{$rdg->id} = $rdg->rank;
 742                 }
 743                 # Calculate the new ranks
 744                 $self->calculate_ranks();
 745
 746                 # Check for invalid non-colocated relationships among changed-rank readings
 747                 # from where the ranks start changing up to $succ
 748                 my $lastrank = $succ->rank;
 749                 foreach my $rdg ( $self->readings ) {
 750                         next if $rdg->rank > $lastrank;
 751                         next if $rdg->rank == $rrk{$rdg->id};
 752                         my @noncolo = $rdg->related_readings( sub { !$_[0]->colocated } );
 753                         next unless @noncolo;
 754                         foreach my $nc ( @noncolo ) {
 755                                 unless( $self->relations->verify_or_delete( $rdg, $nc ) ) {
 756                                         push( @deleted_relations, [ $rdg->id, $nc->id ] );
 757                                 }
 758                         }
 759                 }
 760         }
 761         return ( $newr, @deleted_relations );
 762 }
 763
 764 sub _generate_dup_id {
 765         my( $self, $rid ) = @_;
 766         my $newid;
 767         my $i = 0;
 768         while( !$newid ) {
 769                 $newid = $rid."_$i";
 770                 if( $self->has_reading( $newid ) ) {
 771                         $newid = '';
 772                         $i++;
 773                 }
 774         }
 775         return $newid;
 776 }
 777
 778 ### Path logic
 779
 780 sub add_path {
 781         my $self = shift;
 782
 783         # We only need the IDs for adding paths to the graph, not the reading
 784         # objects themselves.
 785     my( $source, $target, $wit ) = $self->_stringify_args( @_ );
 786
 787         $self->_graphcalc_done(0);
 788         # Connect the readings
 789         unless( $self->sequence->has_edge( $source, $target ) ) {
 790             $self->sequence->add_edge( $source, $target );
 791             $self->relations->add_equivalence_edge( $source, $target );
 792         }
 793     # Note the witness in question
 794     $self->sequence->set_edge_attribute( $source, $target, $wit, 1 );
 795 }
 796
 797 sub del_path {
 798         my $self = shift;
 799         my @args;
 800         if( ref( $_[0] ) eq 'ARRAY' ) {
 801                 my $e = shift @_;
 802                 @args = ( @$e, @_ );
 803         } else {
 804                 @args = @_;
 805         }
 806
 807         # We only need the IDs for removing paths from the graph, not the reading
 808         # objects themselves.
 809     my( $source, $target, $wit ) = $self->_stringify_args( @args );
 810
 811         $self->_graphcalc_done(0);
 812         if( $self->sequence->has_edge_attribute( $source, $target, $wit ) ) {
 813                 $self->sequence->delete_edge_attribute( $source, $target, $wit );
 814         }
 815         unless( $self->sequence->has_edge_attributes( $source, $target ) ) {
 816                 $self->sequence->delete_edge( $source, $target );
 817                 $self->relations->delete_equivalence_edge( $source, $target );
 818         }
 819 }
 820
 821
 822 # Extra graph-alike utility
 823 sub has_path {
 824         my $self = shift;
 825     my( $source, $target, $wit ) = $self->_stringify_args( @_ );
 826         return undef unless $self->sequence->has_edge( $source, $target );
 827         return $self->sequence->has_edge_attribute( $source, $target, $wit );
 828 }
 829
 830 =head2 clear_witness( @sigil_list )
 831
 832 Clear the given witnesses out of the collation entirely, removing references
 833 to them in paths, and removing readings that belong only to them.  Should only
 834 be called via $tradition->del_witness.
 835
 836 =cut
 837
 838 sub clear_witness {
 839         my( $self, @sigils ) = @_;
 840
 841         $self->_graphcalc_done(0);
 842         # Clear the witness(es) out of the paths
 843         foreach my $e ( $self->paths ) {
 844                 foreach my $sig ( @sigils ) {
 845                         $self->del_path( $e, $sig );
 846                 }
 847         }
 848
 849         # Clear out the newly unused readings
 850         foreach my $r ( $self->readings ) {
 851                 unless( $self->reading_witnesses( $r ) ) {
 852                         $self->del_reading( $r );
 853                 }
 854         }
 855 }
 856
 857 sub add_relationship {
 858         my $self = shift;
 859     my( $source, $target, $opts ) = $self->_stringify_args( @_ );
 860     my( @vectors ) = $self->relations->add_relationship( $source, $target, $opts );
 861     foreach my $v ( @vectors ) {
 862         next unless $self->get_relationship( $v )->colocated;
 863         if( $self->reading( $v->[0] )->has_rank && $self->reading( $v->[1] )->has_rank
 864                 && $self->reading( $v->[0] )->rank ne $self->reading( $v->[1] )->rank ) {
 865                         $self->_graphcalc_done(0);
 866                         $self->_clear_cache;
 867                         last;
 868         }
 869     }
 870     return @vectors;
 871 }
 872
 873 around qw/ get_relationship del_relationship / => sub {
 874         my $orig = shift;
 875         my $self = shift;
 876         my @args = @_;
 877         if( @args == 1 && ref( $args[0] ) eq 'ARRAY' ) {
 878                 @args = @{$_[0]};
 879         }
 880         my @stringargs = $self->_stringify_args( @args );
 881         $self->$orig( @stringargs );
 882 };
 883
 884 =head2 reading_witnesses( $reading )
 885
 886 Return a list of sigils corresponding to the witnesses in which the reading appears.
 887
 888 =cut
 889
 890 sub reading_witnesses {
 891         my( $self, $reading ) = @_;
 892         # We need only check either the incoming or the outgoing edges; I have
 893         # arbitrarily chosen "incoming".  Thus, special-case the start node.
 894         if( $reading eq $self->start ) {
 895                 return map { $_->sigil } grep { $_->is_collated } $self->tradition->witnesses;
 896         }
 897         my %all_witnesses;
 898         foreach my $e ( $self->sequence->edges_to( $reading ) ) {
 899                 my $wits = $self->sequence->get_edge_attributes( @$e );
 900                 @all_witnesses{ keys %$wits } = 1;
 901         }
 902         my $acstr = $self->ac_label;
 903         foreach my $acwit ( grep { $_ =~ s/^(.*)\Q$acstr\E$/$1/ } keys %all_witnesses ) {
 904                 delete $all_witnesses{$acwit.$acstr} if exists $all_witnesses{$acwit};
 905         }
 906         return keys %all_witnesses;
 907 }
 908
 909 =head1 OUTPUT METHODS
 910
 911 =head2 as_svg( \%options )
 912
 913 Returns an SVG string that represents the graph, via as_dot and graphviz.
 914 See as_dot for a list of options.  Must have GraphViz (dot) installed to run.
 915
 916 =cut
 917
 918 sub as_svg {
 919     my( $self, $opts ) = @_;
 920     throw( "Need GraphViz installed to output SVG" )
 921         unless File::Which::which( 'dot' );
 922     my $want_subgraph = exists $opts->{'from'} || exists $opts->{'to'};
 923     $self->calculate_ranks()
 924         unless( $self->_graphcalc_done || $opts->{'nocalc'} || !$self->linear );
 925         my @cmd = qw/dot -Tsvg/;
 926         my( $svg, $err );
 927         my $dotfile = File::Temp->new();
 928         ## USE FOR DEBUGGING
 929         # $dotfile->unlink_on_destroy(0);
 930         binmode $dotfile, ':utf8';
 931         print $dotfile $self->as_dot( $opts );
 932         push( @cmd, $dotfile->filename );
 933         run( \@cmd, ">", binary(), \$svg );
 934         $svg = decode_utf8( $svg );
 935         return $svg;
 936 }
 937
 938
 939 =head2 as_dot( \%options )
 940
 941 Returns a string that is the collation graph expressed in dot
 942 (i.e. GraphViz) format.  Options include:
 943
 944 =over 4
 945
 946 =item * from
 947
 948 =item * to
 949
 950 =item * color_common
 951
 952 =back
 953
 954 =cut
 955
 956 sub as_dot {
 957     my( $self, $opts ) = @_;
 958     my $startrank = $opts->{'from'} if $opts;
 959     my $endrank = $opts->{'to'} if $opts;
 960     my $color_common = $opts->{'color_common'} if $opts;
 961     my $STRAIGHTENHACK = !$startrank && !$endrank && $self->end->rank
 962        && $self->end->rank > 100;
 963     $STRAIGHTENHACK = 1 if $opts->{'straight'}; # even for subgraphs or small graphs
 964
 965     # Check the arguments
 966     if( $startrank ) {
 967         return if $endrank && $startrank > $endrank;
 968         return if $startrank > $self->end->rank;
 969         }
 970         if( defined $endrank ) {
 971                 return if $endrank < 0;
 972                 $endrank = undef if $endrank == $self->end->rank;
 973         }
 974
 975     my $graph_name = $self->tradition->name;
 976     $graph_name =~ s/[^\w\s]//g;
 977     $graph_name = join( '_', split( /\s+/, $graph_name ) );
 978
 979     my %graph_attrs = (
 980         'rankdir' => 'LR',
 981         'bgcolor' => 'none',
 982         );
 983     my %node_attrs = (
 984         'fontsize' => 14,
 985         'fillcolor' => 'white',
 986         'style' => 'filled',
 987         'shape' => 'ellipse'
 988         );
 989     my %edge_attrs = (
 990         'arrowhead' => 'open',
 991         'color' => '#000000',
 992         'fontcolor' => '#000000',
 993         );
 994
 995     my $dot = sprintf( "digraph %s {\n", $graph_name );
 996     $dot .= "\tgraph " . _dot_attr_string( \%graph_attrs ) . ";\n";
 997     $dot .= "\tnode " . _dot_attr_string( \%node_attrs ) . ";\n";
 998
 999         # Output substitute start/end readings if necessary
1000         if( $startrank ) {
1001                 $dot .= "\t\"__SUBSTART__\" [ label=\"...\",id=\"__START__\" ];\n";
1002         }
1003         if( $endrank ) {
1004                 $dot .= "\t\"__SUBEND__\" [ label=\"...\",id=\"__END__\" ];\n";
1005         }
1006         if( $STRAIGHTENHACK ) {
1007                 ## HACK part 1
1008                 my $startlabel = $startrank ? '__SUBSTART__' : '__START__';
1009                 $dot .= "\tsubgraph { rank=same \"$startlabel\" \"#SILENT#\" }\n";
1010                 $dot .= "\t\"#SILENT#\" [ shape=diamond,color=white,penwidth=0,label=\"\" ];"
1011         }
1012         my %used;  # Keep track of the readings that actually appear in the graph
1013         # Sort the readings by rank if we have ranks; this speeds layout.
1014         my @all_readings = $self->end->has_rank
1015                 ? sort { $a->rank <=> $b->rank } $self->readings
1016                 : $self->readings;
1017         # TODO Refrain from outputting lacuna nodes - just grey out the edges.
1018     foreach my $reading ( @all_readings ) {
1019         # Only output readings within our rank range.
1020         next if $startrank && $reading->rank < $startrank;
1021         next if $endrank && $reading->rank > $endrank;
1022         $used{$reading->id} = 1;
1023         # Need not output nodes without separate labels
1024         next if $reading->id eq $reading->text;
1025         my $rattrs;
1026         my $label = $reading->text;
1027         unless( $label =~ /^[[:punct:]]+$/ ) {
1028                 $label .= '-' if $reading->join_next;
1029             $label = "-$label" if $reading->join_prior;
1030         }
1031         $label =~ s/\"/\\\"/g;
1032                 $rattrs->{'label'} = $label;
1033                 $rattrs->{'id'} = $reading->id;
1034                 $rattrs->{'fillcolor'} = '#b3f36d' if $reading->is_common && $color_common;
1035         $dot .= sprintf( "\t\"%s\" %s;\n", $reading->id, _dot_attr_string( $rattrs ) );
1036     }
1037
1038         # Add the real edges. Need to weight one edge per rank jump, in a
1039         # continuous line.
1040         # my $weighted = $self->_add_edge_weights;
1041     my @edges = $self->paths;
1042         my( %substart, %subend );
1043     foreach my $edge ( @edges ) {
1044         # Do we need to output this edge?
1045         if( $used{$edge->[0]} && $used{$edge->[1]} ) {
1046                 my $label = $self->_path_display_label( $opts,
1047                         $self->path_witnesses( $edge ) );
1048                         my $variables = { %edge_attrs, 'label' => $label };
1049
1050                         # Account for the rank gap if necessary
1051                         my $rank0 = $self->reading( $edge->[0] )->rank
1052                                 if $self->reading( $edge->[0] )->has_rank;
1053                         my $rank1 = $self->reading( $edge->[1] )->rank
1054                                 if $self->reading( $edge->[1] )->has_rank;
1055                         if( defined $rank0 && defined $rank1 && $rank1 - $rank0 > 1 ) {
1056                                 $variables->{'minlen'} = $rank1 - $rank0;
1057                         }
1058
1059                         # Add the calculated edge weights
1060                         # if( exists $weighted->{$edge->[0]}
1061                         #       && $weighted->{$edge->[0]} eq $edge->[1] ) {
1062                         #       # $variables->{'color'} = 'red';
1063                         #       $variables->{'weight'} = 3.0;
1064                         # }
1065
1066                         # EXPERIMENTAL: make edge width reflect no. of witnesses
1067                         my $extrawidth = scalar( $self->path_witnesses( $edge ) ) * 0.2;
1068                         $variables->{'penwidth'} = $extrawidth + 0.8; # gives 1 for a single wit
1069
1070                         my $varopts = _dot_attr_string( $variables );
1071                         $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
1072                                 $edge->[0], $edge->[1], $varopts );
1073         } elsif( $used{$edge->[0]} ) {
1074                 $subend{$edge->[0]} = $edge->[1];
1075         } elsif( $used{$edge->[1]} ) {
1076                 $substart{$edge->[1]} = $edge->[0];
1077         }
1078     }
1079
1080     # If we are asked to, add relationship links
1081     if( exists $opts->{show_relations} ) {
1082         my $filter = $opts->{show_relations}; # can be 'transposition' or 'all'
1083         if( $filter eq 'transposition' ) {
1084                 $filter =~ qr/^transposition$/;
1085         }
1086         my %typecolors;
1087         my @types = sort( map { $_->name } $self->relations->types );
1088         if( exists $opts->{graphcolors} ) {
1089                 foreach my $tdx ( 0 .. $#types ) {
1090                         $typecolors{$types[$tdx]} = $opts->{graphcolors}->[$tdx];
1091                 }
1092         } else {
1093                 map { $typecolors{$_} = '#FFA14F' } @types;
1094         }
1095         foreach my $redge ( $self->relationships ) {
1096                 if( $used{$redge->[0]} && $used{$redge->[1]} ) {
1097                                 my $rel = $self->get_relationship( $redge );
1098                                 next unless $filter eq 'all' || $rel->type =~ /$filter/;
1099                                 my $variables = {
1100                                         arrowhead => 'none',
1101                                         color => $typecolors{$rel->type},
1102                                         constraint => 'false',
1103                                         penwidth => '3',
1104                                 };
1105                                 unless( exists $opts->{graphcolors} ) {
1106                                         $variables->{label} = uc( substr( $rel->type, 0, 4 ) ),
1107                                 }
1108                                 $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
1109                                         $redge->[0], $redge->[1], _dot_attr_string( $variables ) );
1110                 }
1111         }
1112     }
1113
1114     # Add substitute start and end edges if necessary
1115     foreach my $node ( keys %substart ) {
1116         my $witstr = $self->_path_display_label( $opts,
1117                 $self->path_witnesses( $substart{$node}, $node ) );
1118         my $variables = { %edge_attrs, 'label' => $witstr };
1119         my $nrdg = $self->reading( $node );
1120         if( $nrdg->has_rank && $nrdg->rank > $startrank ) {
1121                 # Substart is actually one lower than $startrank
1122                 $variables->{'minlen'} = $nrdg->rank - ( $startrank - 1 );
1123         }
1124         my $varopts = _dot_attr_string( $variables );
1125         $dot .= "\t\"__SUBSTART__\" -> \"$node\" $varopts;\n";
1126         }
1127     foreach my $node ( keys %subend ) {
1128         my $witstr = $self->_path_display_label( $opts,
1129                 $self->path_witnesses( $node, $subend{$node} ) );
1130         my $variables = { %edge_attrs, 'label' => $witstr };
1131         my $varopts = _dot_attr_string( $variables );
1132         $dot .= "\t\"$node\" -> \"__SUBEND__\" $varopts;\n";
1133         }
1134         # HACK part 2
1135         if( $STRAIGHTENHACK ) {
1136                 my $endlabel = $endrank ? '__SUBEND__' : '__END__';
1137                 $dot .= "\t\"$endlabel\" -> \"#SILENT#\" [ color=white,penwidth=0 ];\n";
1138         }
1139
1140     $dot .= "}\n";
1141     return $dot;
1142 }
1143
1144 sub _dot_attr_string {
1145         my( $hash ) = @_;
1146         my @attrs;
1147         foreach my $k ( sort keys %$hash ) {
1148                 my $v = $hash->{$k};
1149                 push( @attrs, $k.'="'.$v.'"' );
1150         }
1151         return( '[ ' . join( ', ', @attrs ) . ' ]' );
1152 }
1153
1154 sub _add_edge_weights {
1155         my $self = shift;
1156         # Walk the graph from START to END, choosing the successor node with
1157         # the largest number of witness paths each time.
1158         my $weighted = {};
1159         my $curr = $self->start->id;
1160         my $ranked = $self->end->has_rank;
1161         while( $curr ne $self->end->id ) {
1162                 my $rank = $ranked ? $self->reading( $curr )->rank : 0;
1163                 my @succ = sort { $self->path_witnesses( $curr, $a )
1164                                                         <=> $self->path_witnesses( $curr, $b ) }
1165                         $self->sequence->successors( $curr );
1166                 my $next = pop @succ;
1167                 my $nextrank = $ranked ? $self->reading( $next )->rank : 0;
1168                 # Try to avoid lacunae in the weighted path.
1169                 while( @succ &&
1170                            ( $self->reading( $next )->is_lacuna ||
1171                                  $nextrank - $rank > 1 ) ){
1172                         $next = pop @succ;
1173                 }
1174                 $weighted->{$curr} = $next;
1175                 $curr = $next;
1176         }
1177         return $weighted;
1178 }
1179
1180 =head2 path_witnesses( $edge )
1181
1182 Returns the list of sigils whose witnesses are associated with the given edge.
1183 The edge can be passed as either an array or an arrayref of ( $source, $target ).
1184
1185 =cut
1186
1187 sub path_witnesses {
1188         my( $self, @edge ) = @_;
1189         # If edge is an arrayref, cope.
1190         if( @edge == 1 && ref( $edge[0] ) eq 'ARRAY' ) {
1191                 my $e = shift @edge;
1192                 @edge = @$e;
1193         }
1194         my @wits = keys %{$self->sequence->get_edge_attributes( @edge )};
1195         return @wits;
1196 }
1197
1198 # Helper function. Make a display label for the given witnesses, showing a.c.
1199 # witnesses only where the main witness is not also in the list.
1200 sub _path_display_label {
1201         my $self = shift;
1202         my $opts = shift;
1203         my %wits;
1204         map { $wits{$_} = 1 } @_;
1205
1206         # If an a.c. wit is listed, remove it if the main wit is also listed.
1207         # Otherwise keep it for explicit listing.
1208         my $aclabel = $self->ac_label;
1209         my @disp_ac;
1210         foreach my $w ( sort keys %wits ) {
1211                 if( $w =~ /^(.*)\Q$aclabel\E$/ ) {
1212                         if( exists $wits{$1} ) {
1213                                 delete $wits{$w};
1214                         } else {
1215                                 push( @disp_ac, $w );
1216                         }
1217                 }
1218         }
1219
1220         if( $opts->{'explicit_wits'} ) {
1221                 return join( ', ', sort keys %wits );
1222         } else {
1223                 # See if we are in a majority situation.
1224                 my $maj = scalar( $self->tradition->witnesses ) * 0.6;
1225                 $maj = $maj > 5 ? $maj : 5;
1226                 if( scalar keys %wits > $maj ) {
1227                         unshift( @disp_ac, 'majority' );
1228                         return join( ', ', @disp_ac );
1229                 } else {
1230                         return join( ', ', sort keys %wits );
1231                 }
1232         }
1233 }
1234
1235 =head2 readings_at_rank( $rank )
1236
1237 Returns a list of readings at a given rank, taken from the alignment table.
1238
1239 =cut
1240
1241 sub readings_at_rank {
1242         my( $self, $rank ) = @_;
1243         my $table = $self->alignment_table;
1244         # Table rank is real rank - 1.
1245         my @elements = map { $_->{'tokens'}->[$rank-1] } @{$table->{'alignment'}};
1246         my %readings;
1247         foreach my $e ( @elements ) {
1248                 next unless ref( $e ) eq 'HASH';
1249                 next unless exists $e->{'t'};
1250                 $readings{$e->{'t'}->id} = $e->{'t'};
1251         }
1252         return values %readings;
1253 }
1254
1255 =head2 as_graphml
1256
1257 Returns a GraphML representation of the collation.  The GraphML will contain
1258 two graphs. The first expresses the attributes of the readings and the witness
1259 paths that link them; the second expresses the relationships that link the
1260 readings.  This is the native transfer format for a tradition.
1261
1262 =begin testing
1263
1264 use Text::Tradition;
1265 use TryCatch;
1266
1267 my $READINGS = 311;
1268 my $PATHS = 361;
1269
1270 my $datafile = 't/data/florilegium_tei_ps.xml';
1271 my $tradition = Text::Tradition->new( 'input' => 'TEI',
1272                                       'name' => 'test0',
1273                                       'file' => $datafile,
1274                                       'linear' => 1 );
1275
1276 ok( $tradition, "Got a tradition object" );
1277 is( scalar $tradition->witnesses, 13, "Found all witnesses" );
1278 ok( $tradition->collation, "Tradition has a collation" );
1279
1280 my $c = $tradition->collation;
1281 is( scalar $c->readings, $READINGS, "Collation has all readings" );
1282 is( scalar $c->paths, $PATHS, "Collation has all paths" );
1283 is( scalar $c->relationships, 0, "Collation has all relationships" );
1284
1285 # Add a few relationships
1286 $c->add_relationship( 'w123', 'w125', { 'type' => 'collated' } );
1287 $c->add_relationship( 'w193', 'w196', { 'type' => 'collated' } );
1288 $c->add_relationship( 'w257', 'w262', { 'type' => 'transposition',
1289                                           'is_significant' => 'yes' } );
1290
1291 # Now write it to GraphML and parse it again.
1292
1293 my $graphml = $c->as_graphml;
1294 my $st = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml );
1295 is( scalar $st->collation->readings, $READINGS, "Reparsed collation has all readings" );
1296 is( scalar $st->collation->paths, $PATHS, "Reparsed collation has all paths" );
1297 is( scalar $st->collation->relationships, 3, "Reparsed collation has new relationships" );
1298 my $sigrel = $st->collation->get_relationship( 'w257', 'w262' );
1299 is( $sigrel->is_significant, 'yes', "Ternary attribute value was restored" );
1300
1301 # Now add a stemma, write to GraphML, and look at the output.
1302 SKIP: {
1303         skip "Analysis module not present", 3 unless $tradition->can( 'add_stemma' );
1304         my $stemma = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
1305         is( ref( $stemma ), 'Text::Tradition::Stemma', "Parsed dotfile into stemma" );
1306         is( $tradition->stemmata, 1, "Tradition now has the stemma" );
1307         $graphml = $c->as_graphml;
1308         like( $graphml, qr/digraph/, "Digraph declaration exists in GraphML" );
1309 }
1310
1311 =end testing
1312
1313 =cut
1314
1315 ## TODO MOVE this to Tradition.pm and modularize it better
1316 sub as_graphml {
1317     my( $self, $options ) = @_;
1318         $self->calculate_ranks unless $self->_graphcalc_done;
1319
1320         my $start = $options->{'from'}
1321                 ? $self->reading( $options->{'from'} ) : $self->start;
1322         my $end = $options->{'to'}
1323                 ? $self->reading( $options->{'to'} ) : $self->end;
1324         if( $start->has_rank && $end->has_rank && $end->rank < $start->rank ) {
1325                 throw( 'Start node must be before end node' );
1326         }
1327         # The readings need to be ranked for this to work.
1328         $start = $self->start unless $start->has_rank;
1329         $end = $self->end unless $end->has_rank;
1330         my $rankoffset = 0;
1331         unless( $start eq $self->start ) {
1332                 $rankoffset = $start->rank - 1;
1333         }
1334         my %use_readings;
1335
1336     # Some namespaces
1337     my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns';
1338     my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance';
1339     my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' .
1340         'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
1341
1342     # Create the document and root node
1343     require XML::LibXML;
1344     my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
1345     my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
1346     $graphml->setDocumentElement( $root );
1347     $root->setNamespace( $xsi_ns, 'xsi', 0 );
1348     $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
1349
1350     # List of attribute types to save on our objects and their corresponding
1351     # GraphML types
1352     my %save_types = (
1353         'Str' => 'string',
1354         'Int' => 'int',
1355         'Bool' => 'boolean',
1356         'ReadingID' => 'string',
1357         'RelationshipType' => 'string',
1358         'RelationshipScope' => 'string',
1359         'Ternary' => 'string',
1360     );
1361
1362     # Add the data keys for the graph. Include an extra key 'version' for the
1363     # GraphML output version.
1364     my %graph_data_keys;
1365     my $gdi = 0;
1366     my %graph_attributes = ( 'version' => 'string' );
1367         # Graph attributes include those of Tradition and those of Collation.
1368         my %gattr_from;
1369         # TODO Use meta introspection method from duplicate_reading to do this
1370         # instead of naming custom keys.
1371         my $tmeta = $self->tradition->meta;
1372         my $cmeta = $self->meta;
1373         map { $gattr_from{$_->name} = 'Tradition' } $tmeta->get_all_attributes;
1374         map { $gattr_from{$_->name} = 'Collation' } $cmeta->get_all_attributes;
1375         foreach my $attr ( ( $tmeta->get_all_attributes, $cmeta->get_all_attributes ) ) {
1376                 next if $attr->name =~ /^_/;
1377                 next unless $save_types{$attr->type_constraint->name};
1378                 $graph_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1379         }
1380     # Extra custom keys for complex objects that should be saved in some form.
1381     # The subroutine should return a string, or undef/empty.
1382     if( $tmeta->has_method('stemmata') ) {
1383                 $graph_attributes{'stemmata'} = sub {
1384                         my @stemstrs;
1385                         map { push( @stemstrs, $_->editable( {linesep => ''} ) ) }
1386                                 $self->tradition->stemmata;
1387                         join( "\n", @stemstrs );
1388                 };
1389         }
1390
1391         if( $tmeta->has_method('user') ) {
1392                 $graph_attributes{'user'} = sub {
1393                         $self->tradition->user ? $self->tradition->user->id : undef
1394                 };
1395         }
1396
1397     foreach my $datum ( sort keys %graph_attributes ) {
1398         $graph_data_keys{$datum} = 'dg'.$gdi++;
1399         my $key = $root->addNewChild( $graphml_ns, 'key' );
1400         my $dtype = ref( $graph_attributes{$datum} ) ? 'string'
1401                 : $graph_attributes{$datum};
1402         $key->setAttribute( 'attr.name', $datum );
1403         $key->setAttribute( 'attr.type', $dtype );
1404         $key->setAttribute( 'for', 'graph' );
1405         $key->setAttribute( 'id', $graph_data_keys{$datum} );
1406     }
1407
1408     # Add the data keys for reading nodes
1409     my %reading_attributes;
1410     my $rmeta = Text::Tradition::Collation::Reading->meta;
1411     foreach my $attr( $rmeta->get_all_attributes ) {
1412                 next if $attr->name =~ /^_/;
1413                 next unless $save_types{$attr->type_constraint->name};
1414                 $reading_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1415         }
1416         if( $self->start->does('Text::Tradition::Morphology' ) ) {
1417                 # Extra custom key for the reading morphology
1418                 $reading_attributes{'lexemes'} = 'string';
1419         }
1420
1421     my %node_data_keys;
1422     my $ndi = 0;
1423     foreach my $datum ( sort keys %reading_attributes ) {
1424         $node_data_keys{$datum} = 'dn'.$ndi++;
1425         my $key = $root->addNewChild( $graphml_ns, 'key' );
1426         $key->setAttribute( 'attr.name', $datum );
1427         $key->setAttribute( 'attr.type', $reading_attributes{$datum} );
1428         $key->setAttribute( 'for', 'node' );
1429         $key->setAttribute( 'id', $node_data_keys{$datum} );
1430     }
1431
1432     # Add the data keys for edges, that is, paths and relationships. Path
1433     # data does not come from a Moose class so is here manually.
1434     my $edi = 0;
1435     my %edge_data_keys;
1436     my %edge_attributes = (
1437         witness => 'string',                    # ID/label for a path
1438         extra => 'boolean',                             # Path key
1439         );
1440     my @path_attributes = keys %edge_attributes; # track our manual additions
1441     my $pmeta = Text::Tradition::Collation::Relationship->meta;
1442     foreach my $attr( $pmeta->get_all_attributes ) {
1443                 next if $attr->name =~ /^_/;
1444                 next unless $save_types{$attr->type_constraint->name};
1445                 $edge_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1446         }
1447     foreach my $datum ( sort keys %edge_attributes ) {
1448         $edge_data_keys{$datum} = 'de'.$edi++;
1449         my $key = $root->addNewChild( $graphml_ns, 'key' );
1450         $key->setAttribute( 'attr.name', $datum );
1451         $key->setAttribute( 'attr.type', $edge_attributes{$datum} );
1452         $key->setAttribute( 'for', 'edge' );
1453         $key->setAttribute( 'id', $edge_data_keys{$datum} );
1454     }
1455
1456     # Add the collation graph itself. First, sanitize the name to a valid XML ID.
1457     my $xmlidname = $self->tradition->name;
1458     $xmlidname =~ s/(?!$xml10_namechar_rx)./_/g;
1459     if( $xmlidname !~ /^$xml10_namestartchar_rx/ ) {
1460         $xmlidname = '_'.$xmlidname;
1461     }
1462     my $sgraph = $root->addNewChild( $graphml_ns, 'graph' );
1463     $sgraph->setAttribute( 'edgedefault', 'directed' );
1464     $sgraph->setAttribute( 'id', $xmlidname );
1465     $sgraph->setAttribute( 'parse.edgeids', 'canonical' );
1466     $sgraph->setAttribute( 'parse.edges', 0 ); # fill in later
1467     $sgraph->setAttribute( 'parse.nodeids', 'canonical' );
1468     $sgraph->setAttribute( 'parse.nodes', 0 ); # fill in later
1469     $sgraph->setAttribute( 'parse.order', 'nodesfirst' );
1470
1471     # Tradition/collation attribute data
1472     foreach my $datum ( keys %graph_attributes ) {
1473         my $value;
1474         if( $datum eq 'version' ) {
1475                 $value = '3.2';
1476         } elsif( ref( $graph_attributes{$datum} ) ) {
1477                 my $sub = $graph_attributes{$datum};
1478                 $value = &$sub();
1479         } elsif( $gattr_from{$datum} eq 'Tradition' ) {
1480                 $value = $self->tradition->$datum;
1481         } else {
1482                 $value = $self->$datum;
1483         }
1484                 _add_graphml_data( $sgraph, $graph_data_keys{$datum}, $value );
1485         }
1486
1487     my $node_ctr = 0;
1488     my %node_hash;
1489     # Add our readings to the graph
1490     foreach my $n ( sort { $a->id cmp $b->id } $self->readings ) {
1491         next if $n->has_rank && $n ne $self->start && $n ne $self->end &&
1492                 ( $n->rank < $start->rank || $n->rank > $end->rank );
1493         $use_readings{$n->id} = 1;
1494         # Add to the main graph
1495         my $node_el = $sgraph->addNewChild( $graphml_ns, 'node' );
1496         my $node_xmlid = 'n' . $node_ctr++;
1497         $node_hash{ $n->id } = $node_xmlid;
1498         $node_el->setAttribute( 'id', $node_xmlid );
1499         foreach my $d ( keys %reading_attributes ) {
1500                 my $nval = $n->$d;
1501                 # Custom serialization
1502                 if( $d eq 'lexemes' ) {
1503                                 # If nval is a true value, we have lexemes so we need to
1504                                 # serialize them. Otherwise set nval to undef so that the
1505                                 # key is excluded from this reading.
1506                         $nval = $nval ? $n->_serialize_lexemes : undef;
1507                 } elsif( $d eq 'normal_form' && $n->normal_form eq $n->text ) {
1508                         $nval = undef;
1509                 }
1510                 if( $rankoffset && $d eq 'rank' && $n ne $self->start ) {
1511                         # Adjust the ranks within the subgraph.
1512                         $nval = $n eq $self->end ? $end->rank - $rankoffset + 1
1513                                 : $nval - $rankoffset;
1514                 }
1515                 _add_graphml_data( $node_el, $node_data_keys{$d}, $nval )
1516                         if defined $nval;
1517         }
1518     }
1519
1520     # Add the path edges to the sequence graph
1521     my $edge_ctr = 0;
1522     foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->sequence->edges() ) {
1523         # We add an edge in the graphml for every witness in $e.
1524         next unless( $use_readings{$e->[0]} || $use_readings{$e->[1]} );
1525         my @edge_wits = sort $self->path_witnesses( $e );
1526         $e->[0] = $self->start->id unless $use_readings{$e->[0]};
1527         $e->[1] = $self->end->id unless $use_readings{$e->[1]};
1528         # Skip any path from start to end; that witness is not in the subgraph.
1529         next if ( $e->[0] eq $self->start->id && $e->[1] eq $self->end->id );
1530         foreach my $wit ( @edge_wits ) {
1531                         my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
1532                                                                                 $node_hash{ $e->[0] },
1533                                                                                 $node_hash{ $e->[1] } );
1534                         my $edge_el = $sgraph->addNewChild( $graphml_ns, 'edge' );
1535                         $edge_el->setAttribute( 'source', $from );
1536                         $edge_el->setAttribute( 'target', $to );
1537                         $edge_el->setAttribute( 'id', $id );
1538
1539                         # It's a witness path, so add the witness
1540                         my $base = $wit;
1541                         my $key = $edge_data_keys{'witness'};
1542                         # Is this an ante-corr witness?
1543                         my $aclabel = $self->ac_label;
1544                         if( $wit =~ /^(.*)\Q$aclabel\E$/ ) {
1545                                 # Keep the base witness
1546                                 $base = $1;
1547                                 # ...and record that this is an 'extra' reading path
1548                                 _add_graphml_data( $edge_el, $edge_data_keys{'extra'}, $aclabel );
1549                         }
1550                         _add_graphml_data( $edge_el, $edge_data_keys{'witness'}, $base );
1551                 }
1552         }
1553
1554         # Report the actual number of nodes and edges that went in
1555         $sgraph->setAttribute( 'parse.edges', $edge_ctr );
1556         $sgraph->setAttribute( 'parse.nodes', $node_ctr );
1557
1558         # Add the relationship graph to the XML
1559         map { delete $edge_data_keys{$_} } @path_attributes;
1560         $self->relations->_as_graphml( $graphml_ns, $root, \%node_hash,
1561                 $node_data_keys{'id'}, \%edge_data_keys );
1562
1563     # Save and return the thing
1564     my $result = decode_utf8( $graphml->toString(1) );
1565     return $result;
1566 }
1567
1568 sub _add_graphml_data {
1569     my( $el, $key, $value ) = @_;
1570     return unless defined $value;
1571     my $data_el = $el->addNewChild( $el->namespaceURI, 'data' );
1572     $data_el->setAttribute( 'key', $key );
1573     $data_el->appendText( $value );
1574 }
1575
1576 =head2 as_csv
1577
1578 Returns a CSV alignment table representation of the collation graph, one
1579 row per witness (or witness uncorrected.)
1580
1581 =head2 as_tsv
1582
1583 Returns a tab-separated alignment table representation of the collation graph,
1584 one row per witness (or witness uncorrected.)
1585
1586 =begin testing
1587
1588 use Text::Tradition;
1589 use Text::CSV;
1590
1591 my $READINGS = 311;
1592 my $PATHS = 361;
1593 my $WITS = 13;
1594 my $WITAC = 4;
1595
1596 my $datafile = 't/data/florilegium_tei_ps.xml';
1597 my $tradition = Text::Tradition->new( 'input' => 'TEI',
1598                                       'name' => 'test0',
1599                                       'file' => $datafile,
1600                                       'linear' => 1 );
1601
1602 my $c = $tradition->collation;
1603 # Export the thing to CSV
1604 my $csvstr = $c->as_csv();
1605 # Count the columns
1606 my $csv = Text::CSV->new({ sep_char => ',', binary => 1 });
1607 my @lines = split(/\n/, $csvstr );
1608 ok( $csv->parse( $lines[0] ), "Successfully parsed first line of CSV" );
1609 is( scalar( $csv->fields ), $WITS + $WITAC, "CSV has correct number of witness columns" );
1610 my @q_ac = grep { $_ eq 'Q'.$c->ac_label } $csv->fields;
1611 ok( @q_ac, "Found a layered witness" );
1612
1613 my $t2 = Text::Tradition->new( input => 'Tabular',
1614                                                            name => 'test2',
1615                                                            string => $csvstr,
1616                                                            sep_char => ',' );
1617 is( scalar $t2->collation->readings, $READINGS, "Reparsed CSV collation has all readings" );
1618 is( scalar $t2->collation->paths, $PATHS, "Reparsed CSV collation has all paths" );
1619
1620 # Now do it with TSV
1621 my $tsvstr = $c->as_tsv();
1622 my $t3 = Text::Tradition->new( input => 'Tabular',
1623                                                            name => 'test3',
1624                                                            string => $tsvstr,
1625                                                            sep_char => "\t" );
1626 is( scalar $t3->collation->readings, $READINGS, "Reparsed TSV collation has all readings" );
1627 is( scalar $t3->collation->paths, $PATHS, "Reparsed TSV collation has all paths" );
1628
1629 my $table = $c->alignment_table;
1630 my $noaccsv = $c->as_csv({ noac => 1 });
1631 my @noaclines = split(/\n/, $noaccsv );
1632 ok( $csv->parse( $noaclines[0] ), "Successfully parsed first line of no-ac CSV" );
1633 is( scalar( $csv->fields ), $WITS, "CSV has correct number of witness columns" );
1634 is( $c->alignment_table, $table, "Request for CSV did not alter the alignment table" );
1635
1636 my $safecsv = $c->as_csv({ safe_ac => 1});
1637 my @safelines = split(/\n/, $safecsv );
1638 ok( $csv->parse( $safelines[0] ), "Successfully parsed first line of safe CSV" );
1639 is( scalar( $csv->fields ), $WITS + $WITAC, "CSV has correct number of witness columns" );
1640 @q_ac = grep { $_ eq 'Q__L' } $csv->fields;
1641 ok( @q_ac, "Found a sanitized layered witness" );
1642 is( $c->alignment_table, $table, "Request for CSV did not alter the alignment table" );
1643
1644 =end testing
1645
1646 =cut
1647
1648 sub _tabular {
1649     my( $self, $opts ) = @_;
1650     my $table = $self->alignment_table( $opts );
1651         my $csv_options = { binary => 1, quote_null => 0 };
1652         $csv_options->{'sep_char'} = $opts->{fieldsep};
1653         if( $opts->{fieldsep} eq "\t" ) {
1654                 # If it is really tab separated, nothing is an escape char.
1655                 $csv_options->{'quote_char'} = undef;
1656                 $csv_options->{'escape_char'} = '';
1657         }
1658     my $csv = Text::CSV->new( $csv_options );
1659     my @result;
1660     # Make the header row
1661     $csv->combine( map { $_->{'witness'} } @{$table->{'alignment'}} );
1662         push( @result, $csv->string );
1663     # Make the rest of the rows
1664     foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
1665         my @rowobjs = map { $_->{'tokens'}->[$idx] } @{$table->{'alignment'}};
1666         my @row = map { $_ ? $_->{'t'}->text : $_ } @rowobjs;
1667         $csv->combine( @row );
1668         push( @result, $csv->string );
1669     }
1670     return join( "\n", @result );
1671 }
1672
1673 sub as_csv {
1674         my $self = shift;
1675         my $opts = shift || {};
1676         $opts->{fieldsep} = ',';
1677         return $self->_tabular( $opts );
1678 }
1679
1680 sub as_tsv {
1681         my $self = shift;
1682         my $opts = shift || {};
1683         $opts->{fieldsep} = "\t";
1684         return $self->_tabular( $opts );
1685 }
1686
1687 =head2 alignment_table
1688
1689 Return a reference to an alignment table, in a slightly enhanced CollateX
1690 format which looks like this:
1691
1692  $table = { alignment => [ { witness => "SIGIL",
1693                              tokens => [ { t => "TEXT" }, ... ] },
1694                            { witness => "SIG2",
1695                              tokens => [ { t => "TEXT" }, ... ] },
1696                            ... ],
1697             length => TEXTLEN };
1698
1699 =cut
1700
1701 sub alignment_table {
1702     my( $self, $opts ) = @_;
1703     if( $self->has_cached_table ) {
1704                 return $self->cached_table
1705                         unless $opts->{noac} || $opts->{safe_ac};
1706     }
1707
1708     # Make sure we can do this
1709         throw( "Need a linear graph in order to make an alignment table" )
1710                 unless $self->linear;
1711     $self->calculate_ranks()
1712         unless $self->_graphcalc_done && $self->end->has_rank;
1713
1714     my $table = { 'alignment' => [], 'length' => $self->end->rank - 1 };
1715     my @all_pos = ( 1 .. $self->end->rank - 1 );
1716     foreach my $wit ( sort { $a->sigil cmp $b->sigil } $self->tradition->witnesses ) {
1717         # say STDERR "Making witness row(s) for " . $wit->sigil;
1718         my @wit_path = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
1719         my @row = _make_witness_row( \@wit_path, \@all_pos );
1720         my $witobj = { 'witness' => $wit->sigil, 'tokens' => \@row };
1721         $witobj->{'identifier'} = $wit->identifier if $wit->identifier;
1722         push( @{$table->{'alignment'}}, $witobj );
1723         if( $wit->is_layered && !$opts->{noac} ) {
1724                 my @wit_ac_path = $self->reading_sequence( $self->start, $self->end,
1725                         $wit->sigil.$self->ac_label );
1726             my @ac_row = _make_witness_row( \@wit_ac_path, \@all_pos );
1727             my $witlabel = $opts->{safe_ac}
1728                 ? $wit->sigil . '__L' : $wit->sigil.$self->ac_label;
1729             my $witacobj = { 'witness' => $witlabel,
1730                 'tokens' => \@ac_row };
1731             $witacobj->{'identifier'} = $wit->identifier if $wit->identifier;
1732                         push( @{$table->{'alignment'}}, $witacobj );
1733         }
1734     }
1735     unless( $opts->{noac} || $opts->{safe_ac} ) {
1736             $self->cached_table( $table );
1737         }
1738     return $table;
1739 }
1740
1741 sub _make_witness_row {
1742     my( $path, $positions ) = @_;
1743     my %char_hash;
1744     map { $char_hash{$_} = undef } @$positions;
1745     my $debug = 0;
1746     foreach my $rdg ( @$path ) {
1747         say STDERR "rank " . $rdg->rank if $debug;
1748         # say STDERR "No rank for " . $rdg->id unless defined $rdg->rank;
1749         $char_hash{$rdg->rank} = { 't' => $rdg };
1750     }
1751     my @row = map { $char_hash{$_} } @$positions;
1752     # Fill in lacuna markers for undef spots in the row
1753     my $last_el = shift @row;
1754     my @filled_row = ( $last_el );
1755     foreach my $el ( @row ) {
1756         # If we are using node reference, make the lacuna node appear many times
1757         # in the table.  If not, use the lacuna tag.
1758         if( $last_el && $last_el->{'t'}->is_lacuna && !defined $el ) {
1759             $el = $last_el;
1760         }
1761         push( @filled_row, $el );
1762         $last_el = $el;
1763     }
1764     return @filled_row;
1765 }
1766
1767
1768 =head1 NAVIGATION METHODS
1769
1770 =head2 reading_sequence( $first, $last, $sigil, $backup )
1771
1772 Returns the ordered list of readings, starting with $first and ending
1773 with $last, for the witness given in $sigil. If a $backup sigil is
1774 specified (e.g. when walking a layered witness), it will be used wherever
1775 no $sigil path exists.  If there is a base text reading, that will be
1776 used wherever no path exists for $sigil or $backup.
1777
1778 =cut
1779
1780 # TODO Think about returning some lazy-eval iterator.
1781 # TODO Get rid of backup; we should know from what witness is whether we need it.
1782
1783 sub reading_sequence {
1784     my( $self, $start, $end, $witness ) = @_;
1785
1786     $witness = $self->baselabel unless $witness;
1787     my @readings = ( $start );
1788     my %seen;
1789     my $n = $start;
1790     while( $n && $n->id ne $end->id ) {
1791         if( exists( $seen{$n->id} ) ) {
1792             throw( "Detected loop for $witness at " . $n->id );
1793         }
1794         $seen{$n->id} = 1;
1795
1796         my $next = $self->next_reading( $n, $witness );
1797         unless( $next ) {
1798             throw( "Did not find any path for $witness from reading " . $n->id );
1799         }
1800         push( @readings, $next );
1801         $n = $next;
1802     }
1803     # Check that the last reading is our end reading.
1804     my $last = $readings[$#readings];
1805     throw( "Last reading found from " . $start->text .
1806         " for witness $witness is not the end!" ) # TODO do we get this far?
1807         unless $last->id eq $end->id;
1808
1809     return @readings;
1810 }
1811
1812 =head2 next_reading( $reading, $sigil );
1813
1814 Returns the reading that follows the given reading along the given witness
1815 path.
1816
1817 =cut
1818
1819 sub next_reading {
1820     # Return the successor via the corresponding path.
1821     my $self = shift;
1822     my $answer = $self->_find_linked_reading( 'next', @_ );
1823         return undef unless $answer;
1824     return $self->reading( $answer );
1825 }
1826
1827 =head2 prior_reading( $reading, $sigil )
1828
1829 Returns the reading that precedes the given reading along the given witness
1830 path.
1831
1832 =cut
1833
1834 sub prior_reading {
1835     # Return the predecessor via the corresponding path.
1836     my $self = shift;
1837     my $answer = $self->_find_linked_reading( 'prior', @_ );
1838     return $self->reading( $answer );
1839 }
1840
1841 sub _find_linked_reading {
1842     my( $self, $direction, $node, $path ) = @_;
1843
1844     # Get a backup if we are dealing with a layered witness
1845     my $alt_path;
1846     my $aclabel = $self->ac_label;
1847     if( $path && $path =~ /^(.*)\Q$aclabel\E$/ ) {
1848         $alt_path = $1;
1849     }
1850
1851     my @linked_paths = $direction eq 'next'
1852         ? $self->sequence->edges_from( $node )
1853         : $self->sequence->edges_to( $node );
1854     return undef unless scalar( @linked_paths );
1855
1856     # We have to find the linked path that contains all of the
1857     # witnesses supplied in $path.
1858     my( @path_wits, @alt_path_wits );
1859     @path_wits = sort( $self->_witnesses_of_label( $path ) ) if $path;
1860     @alt_path_wits = sort( $self->_witnesses_of_label( $alt_path ) ) if $alt_path;
1861     my $base_le;
1862     my $alt_le;
1863     foreach my $le ( @linked_paths ) {
1864         if( $self->sequence->has_edge_attribute( @$le, $self->baselabel ) ) {
1865             $base_le = $le;
1866         }
1867                 my @le_wits = sort $self->path_witnesses( $le );
1868                 if( _is_within( \@path_wits, \@le_wits ) ) {
1869                         # This is the right path.
1870                         return $direction eq 'next' ? $le->[1] : $le->[0];
1871                 } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) {
1872                         $alt_le = $le;
1873                 }
1874     }
1875     # Got this far? Return the alternate path if it exists.
1876     return $direction eq 'next' ? $alt_le->[1] : $alt_le->[0]
1877         if $alt_le;
1878
1879     # Got this far? Return the base path if it exists.
1880     return $direction eq 'next' ? $base_le->[1] : $base_le->[0]
1881         if $base_le;
1882
1883     # Got this far? We have no appropriate path.
1884     warn "Could not find $direction node from " . $node->id
1885         . " along path $path";
1886     return undef;
1887 }
1888
1889 # Some set logic.
1890 sub _is_within {
1891     my( $set1, $set2 ) = @_;
1892     my $ret = @$set1; # will be 0, i.e. false, if set1 is empty
1893     foreach my $el ( @$set1 ) {
1894         $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
1895     }
1896     return $ret;
1897 }
1898
1899 # Return the string that joins together a list of witnesses for
1900 # display on a single path.
1901 sub _witnesses_of_label {
1902     my( $self, $label ) = @_;
1903     my $regex = $self->wit_list_separator;
1904     my @answer = split( /\Q$regex\E/, $label );
1905     return @answer;
1906 }
1907
1908 =head2 common_readings
1909
1910 Returns the list of common readings in the graph (i.e. those readings that are
1911 shared by all non-lacunose witnesses.)
1912
1913 =cut
1914
1915 sub common_readings {
1916         my $self = shift;
1917         my @common = grep { $_->is_common } $self->readings;
1918         return @common;
1919 }
1920
1921 =head2 path_text( $sigil, [, $start, $end ] )
1922
1923 Returns the text of a witness (plus its backup, if we are using a layer)
1924 as stored in the collation.  The text is returned as a string, where the
1925 individual readings are joined with spaces and the meta-readings (e.g.
1926 lacunae) are omitted.  Optional specification of $start and $end allows
1927 the generation of a subset of the witness text.
1928
1929 =cut
1930
1931 sub path_text {
1932         my( $self, $wit, $start, $end ) = @_;
1933         $start = $self->start unless $start;
1934         $end = $self->end unless $end;
1935         my @path = grep { !$_->is_meta } $self->reading_sequence( $start, $end, $wit );
1936         my $pathtext = '';
1937         my $last;
1938         foreach my $r ( @path ) {
1939                 unless ( $r->join_prior || !$last || $last->join_next ) {
1940                         $pathtext .= ' ';
1941                 }
1942                 $pathtext .= $r->text;
1943                 $last = $r;
1944         }
1945         return $pathtext;
1946 }
1947
1948 =head1 INITIALIZATION METHODS
1949
1950 These are mostly for use by parsers.
1951
1952 =head2 make_witness_path( $witness )
1953
1954 Link the array of readings contained in $witness->path (and in
1955 $witness->uncorrected_path if it exists) into collation paths.
1956 Clear out the arrays when finished.
1957
1958 =head2 make_witness_paths
1959
1960 Call make_witness_path for all witnesses in the tradition.
1961
1962 =cut
1963
1964 # For use when a collation is constructed from a base text and an apparatus.
1965 # We have the sequences of readings and just need to add path edges.
1966 # When we are done, clear out the witness path attributes, as they are no
1967 # longer needed.
1968 # TODO Find a way to replace the witness path attributes with encapsulated functions?
1969
1970 sub make_witness_paths {
1971     my( $self ) = @_;
1972     foreach my $wit ( $self->tradition->witnesses ) {
1973         # say STDERR "Making path for " . $wit->sigil;
1974         $self->make_witness_path( $wit );
1975     }
1976 }
1977
1978 sub make_witness_path {
1979     my( $self, $wit ) = @_;
1980     my @chain = @{$wit->path};
1981     my $sig = $wit->sigil;
1982     # Add start and end if necessary
1983     unshift( @chain, $self->start ) unless $chain[0] eq $self->start;
1984     push( @chain, $self->end ) unless $chain[-1] eq $self->end;
1985     foreach my $idx ( 0 .. $#chain-1 ) {
1986         $self->add_path( $chain[$idx], $chain[$idx+1], $sig );
1987     }
1988     if( $wit->is_layered ) {
1989         @chain = @{$wit->uncorrected_path};
1990                 unshift( @chain, $self->start ) unless $chain[0] eq $self->start;
1991                 push( @chain, $self->end ) unless $chain[-1] eq $self->end;
1992         foreach my $idx( 0 .. $#chain-1 ) {
1993             my $source = $chain[$idx];
1994             my $target = $chain[$idx+1];
1995             $self->add_path( $source, $target, $sig.$self->ac_label )
1996                 unless $self->has_path( $source, $target, $sig );
1997         }
1998     }
1999     $wit->clear_path;
2000     $wit->clear_uncorrected_path;
2001 }
2002
2003 =head2 calculate_ranks
2004
2005 Calculate the reading ranks (that is, their aligned positions relative
2006 to each other) for the graph.  This can only be called on linear collations.
2007
2008 =begin testing
2009
2010 use Text::Tradition;
2011
2012 my $cxfile = 't/data/Collatex-16.xml';
2013 my $t = Text::Tradition->new(
2014     'name'  => 'inline',
2015     'input' => 'CollateX',
2016     'file'  => $cxfile,
2017     );
2018 my $c = $t->collation;
2019
2020 # Make an svg
2021 my $table = $c->alignment_table;
2022 ok( $c->has_cached_table, "Alignment table was cached" );
2023 is( $c->alignment_table, $table, "Cached table returned upon second call" );
2024 $c->calculate_ranks;
2025 is( $c->alignment_table, $table, "Cached table retained with no rank change" );
2026 $c->add_relationship( 'n13', 'n23', { type => 'repetition' } );
2027 is( $c->alignment_table, $table, "Alignment table unchanged after non-colo relationship add" );
2028 $c->add_relationship( 'n24', 'n23', { type => 'spelling' } );
2029 isnt( $c->alignment_table, $table, "Alignment table changed after colo relationship add" );
2030
2031 =end testing
2032
2033 =cut
2034
2035 sub calculate_ranks {
2036     my $self = shift;
2037     # Save the existing ranks, in case we need to invalidate the cached SVG.
2038     throw( "Cannot calculate ranks on a non-linear graph" )
2039         unless $self->linear;
2040     my %existing_ranks;
2041     map { $existing_ranks{$_} = $_->rank } $self->readings;
2042
2043     # Do the rankings based on the relationship equivalence graph, starting
2044     # with the start node.
2045     my ( $node_ranks, $rank_nodes ) = $self->relations->equivalence_ranks();
2046
2047     # Transfer our rankings from the topological graph to the real one.
2048     foreach my $r ( $self->readings ) {
2049         if( defined $node_ranks->{$self->equivalence( $r->id )} ) {
2050             $r->rank( $node_ranks->{$self->equivalence( $r->id )} );
2051         } else {
2052                 # Die. Find the last rank we calculated.
2053                 my @all_defined = sort { ( $node_ranks->{$self->equivalence( $a->id )}||-1 )
2054                                  <=> ( $node_ranks->{$self->equivalence( $b->id )}||-1 ) }
2055                         $self->readings;
2056                 my $last = pop @all_defined;
2057             throw( "Ranks not calculated after $last - do you have a cycle in the graph?" );
2058         }
2059     }
2060     # Do we need to invalidate the cached data?
2061     if( $self->has_cached_table ) {
2062         foreach my $r ( $self->readings ) {
2063                 next if defined( $existing_ranks{$r} )
2064                         && $existing_ranks{$r} == $r->rank;
2065                 # Something has changed, so clear the cache
2066                 $self->_clear_cache;
2067                         # ...and recalculate the common readings.
2068                         $self->calculate_common_readings();
2069                 last;
2070         }
2071     }
2072         # The graph calculation information is now up to date.
2073         $self->_graphcalc_done(1);
2074 }
2075
2076 sub _clear_cache {
2077         my $self = shift;
2078         $self->wipe_table if $self->has_cached_table;
2079 }
2080
2081
2082 =head2 flatten_ranks
2083
2084 A convenience method for parsing collation data.  Searches the graph for readings
2085 with the same text at the same rank, and merges any that are found.
2086
2087 =cut
2088
2089 sub flatten_ranks {
2090     my ( $self, %args ) = shift;
2091     my %unique_rank_rdg;
2092     my $changed;
2093     foreach my $p ( $self->identical_readings( %args ) ) {
2094                 # say STDERR "Combining readings at same rank: @$p";
2095                 $changed = 1;
2096                 $self->merge_readings( @$p );
2097                 # TODO see if this now makes a common point.
2098     }
2099     # If we merged readings, the ranks are still fine but the alignment
2100     # table is wrong. Wipe it.
2101     $self->wipe_table() if $changed;
2102 }
2103
2104 =head2 identical_readings
2105 =head2 identical_readings( start => $startnode, end => $endnode )
2106 =head2 identical_readings( startrank => $startrank, endrank => $endrank )
2107
2108 Goes through the graph identifying all pairs of readings that appear to be
2109 identical, and therefore able to be merged into a single reading. Returns the
2110 relevant identical pairs. Can be restricted to run over only a part of the
2111 graph, specified either by node or by rank.
2112
2113 =cut
2114
2115 sub identical_readings {
2116         my ( $self, %args ) = @_;
2117     # Find where we should start and end.
2118     my $startrank = $args{startrank} || 0;
2119     if( $args{start} ) {
2120         throw( "Starting reading has no rank" ) unless $self->reading( $args{start} )
2121                 && $self->reading( $args{start} )->has_rank;
2122         $startrank = $self->reading( $args{start} )->rank;
2123     }
2124     my $endrank = $args{endrank} || $self->end->rank;
2125     if( $args{end} ) {
2126         throw( "Ending reading has no rank" ) unless $self->reading( $args{end} )
2127                 && $self->reading( $args{end} )->has_rank;
2128         $endrank = $self->reading( $args{end} )->rank;
2129     }
2130
2131     # Make sure the ranks are correct.
2132     unless( $self->_graphcalc_done ) {
2133         $self->calculate_ranks;
2134     }
2135     # Go through the readings looking for duplicates.
2136     my %unique_rank_rdg;
2137     my @pairs;
2138     foreach my $rdg ( $self->readings ) {
2139         next unless $rdg->has_rank;
2140         my $rk = $rdg->rank;
2141         next if $rk > $endrank || $rk < $startrank;
2142         my $key = $rk . "||" . $rdg->text;
2143         if( exists $unique_rank_rdg{$key} ) {
2144                 # Make sure they don't have different grammatical forms
2145                         my $ur = $unique_rank_rdg{$key};
2146                 if( $rdg->is_identical( $ur ) ) {
2147                                 push( @pairs, [ $ur, $rdg ] );
2148                         }
2149         } else {
2150             $unique_rank_rdg{$key} = $rdg;
2151         }
2152     }
2153
2154     return @pairs;
2155 }
2156
2157
2158 =head2 calculate_common_readings
2159
2160 Goes through the graph identifying the readings that appear in every witness
2161 (apart from those with lacunae at that spot.) Marks them as common and returns
2162 the list.
2163
2164 =begin testing
2165
2166 use Text::Tradition;
2167
2168 my $cxfile = 't/data/Collatex-16.xml';
2169 my $t = Text::Tradition->new(
2170     'name'  => 'inline',
2171     'input' => 'CollateX',
2172     'file'  => $cxfile,
2173     );
2174 my $c = $t->collation;
2175
2176 my @common = $c->calculate_common_readings();
2177 is( scalar @common, 8, "Found correct number of common readings" );
2178 my @marked = sort $c->common_readings();
2179 is( scalar @common, 8, "All common readings got marked as such" );
2180 my @expected = qw/ n1 n11 n16 n19 n20 n5 n6 n7 /;
2181 is_deeply( \@marked, \@expected, "Found correct list of common readings" );
2182
2183 =end testing
2184
2185 =cut
2186
2187 sub calculate_common_readings {
2188         my $self = shift;
2189         my @common;
2190         map { $_->is_common( 0 ) } $self->readings;
2191         # Implicitly calls calculate_ranks
2192         my $table = $self->alignment_table;
2193         foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
2194                 my @row = map { $_->{'tokens'}->[$idx]
2195                                                         ? $_->{'tokens'}->[$idx]->{'t'} : '' }
2196                                         @{$table->{'alignment'}};
2197                 my %hash;
2198                 foreach my $r ( @row ) {
2199                         if( $r ) {
2200                                 $hash{$r->id} = $r unless $r->is_meta;
2201                         } else {
2202                                 $hash{'UNDEF'} = $r;
2203                         }
2204                 }
2205                 if( keys %hash == 1 && !exists $hash{'UNDEF'} ) {
2206                         my( $r ) = values %hash;
2207                         $r->is_common( 1 );
2208                         push( @common, $r );
2209                 }
2210         }
2211         return @common;
2212 }
2213
2214 =head2 text_from_paths
2215
2216 Calculate the text array for all witnesses from the path, for later consistency
2217 checking.  Only to be used if there is no non-graph-based way to know the
2218 original texts.
2219
2220 =cut
2221
2222 sub text_from_paths {
2223         my $self = shift;
2224     foreach my $wit ( $self->tradition->witnesses ) {
2225         my @readings = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
2226         my @text;
2227         foreach my $r ( @readings ) {
2228                 next if $r->is_meta;
2229                 push( @text, $r->text );
2230         }
2231         $wit->text( \@text );
2232         if( $wit->is_layered ) {
2233                         my @ucrdgs = $self->reading_sequence( $self->start, $self->end,
2234                                                                                                   $wit->sigil.$self->ac_label );
2235                         my @uctext;
2236                         foreach my $r ( @ucrdgs ) {
2237                                 next if $r->is_meta;
2238                                 push( @uctext, $r->text );
2239                         }
2240                         $wit->layertext( \@uctext );
2241         }
2242     }
2243 }
2244
2245 =head1 UTILITY FUNCTIONS
2246
2247 =head2 common_predecessor( $reading_a, $reading_b )
2248
2249 Find the last reading that occurs in sequence before both the given readings.
2250 At the very least this should be $self->start.
2251
2252 =head2 common_successor( $reading_a, $reading_b )
2253
2254 Find the first reading that occurs in sequence after both the given readings.
2255 At the very least this should be $self->end.
2256
2257 =begin testing
2258
2259 use Text::Tradition;
2260
2261 my $cxfile = 't/data/Collatex-16.xml';
2262 my $t = Text::Tradition->new(
2263     'name'  => 'inline',
2264     'input' => 'CollateX',
2265     'file'  => $cxfile,
2266     );
2267 my $c = $t->collation;
2268
2269 is( $c->common_predecessor( 'n24', 'n23' )->id,
2270     'n20', "Found correct common predecessor" );
2271 is( $c->common_successor( 'n24', 'n23' )->id,
2272     '__END__', "Found correct common successor" );
2273
2274 is( $c->common_predecessor( 'n19', 'n17' )->id,
2275     'n16', "Found correct common predecessor for readings on same path" );
2276 is( $c->common_successor( 'n21', 'n10' )->id,
2277     '__END__', "Found correct common successor for readings on same path" );
2278
2279 =end testing
2280
2281 =cut
2282
2283 ## Return the closest reading that is a predecessor of both the given readings.
2284 sub common_predecessor {
2285         my $self = shift;
2286         my( $r1, $r2 ) = $self->_objectify_args( @_ );
2287         return $self->_common_in_path( $r1, $r2, 'predecessors' );
2288 }
2289
2290 sub common_successor {
2291         my $self = shift;
2292         my( $r1, $r2 ) = $self->_objectify_args( @_ );
2293         return $self->_common_in_path( $r1, $r2, 'successors' );
2294 }
2295
2296
2297 # TODO think about how to do this without ranks...
2298 sub _common_in_path {
2299         my( $self, $r1, $r2, $dir ) = @_;
2300         my $iter = $self->end->rank;
2301         my @candidates;
2302         my @last_r1 = ( $r1 );
2303         my @last_r2 = ( $r2 );
2304         # my %all_seen = ( $r1 => 'r1', $r2 => 'r2' );
2305         my %all_seen;
2306         # say STDERR "Finding common $dir for $r1, $r2";
2307         while( !@candidates ) {
2308                 last unless $iter--;  # Avoid looping infinitely
2309                 # Iterate separately down the graph from r1 and r2
2310                 my( @new_lc1, @new_lc2 );
2311                 foreach my $lc ( @last_r1 ) {
2312                         foreach my $p ( $lc->$dir ) {
2313                                 if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r1' ) {
2314                                         # say STDERR "Path candidate $p from $lc";
2315                                         push( @candidates, $p );
2316                                 } elsif( !$all_seen{$p->id} ) {
2317                                         $all_seen{$p->id} = 'r1';
2318                                         push( @new_lc1, $p );
2319                                 }
2320                         }
2321                 }
2322                 foreach my $lc ( @last_r2 ) {
2323                         foreach my $p ( $lc->$dir ) {
2324                                 if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r2' ) {
2325                                         # say STDERR "Path candidate $p from $lc";
2326                                         push( @candidates, $p );
2327                                 } elsif( !$all_seen{$p->id} ) {
2328                                         $all_seen{$p->id} = 'r2';
2329                                         push( @new_lc2, $p );
2330                                 }
2331                         }
2332                 }
2333                 @last_r1 = @new_lc1;
2334                 @last_r2 = @new_lc2;
2335         }
2336         my @answer = sort { $a->rank <=> $b->rank } @candidates;
2337         return $dir eq 'predecessors' ? pop( @answer ) : shift ( @answer );
2338 }
2339
2340 sub throw {
2341         Text::Tradition::Error->throw(
2342                 'ident' => 'Collation error',
2343                 'message' => $_[0],
2344                 );
2345 }
2346
2347 no Moose;
2348 __PACKAGE__->meta->make_immutable;
2349
2350 =head1 BUGS/TODO
2351
2352 =over
2353
2354 =item * Rework XML serialization in a more modular way
2355
2356 =back
2357
2358 =head1 LICENSE
2359
2360 This package is free software and is provided "as is" without express
2361 or implied warranty.  You can redistribute it and/or modify it under
2362 the same terms as Perl itself.
2363
2364 =head1 AUTHOR
2365
2366 Tara L Andrews E<lt>aurum@cpan.orgE<gt>