base/lib/Text/Tradition/Collation.pm

   1 package Text::Tradition::Collation;
   2
   3 use feature 'say';
   4 use Encode qw( decode_utf8 );
   5 use File::Temp;
   6 use File::Which;
   7 use Graph;
   8 use IPC::Run qw( run binary );
   9 use JSON qw/ to_json /;
  10 use Text::CSV;
  11 use Text::Tradition::Collation::Data;
  12 use Text::Tradition::Collation::Reading;
  13 use Text::Tradition::Collation::RelationshipStore;
  14 use Text::Tradition::Error;
  15 use XML::Easy::Syntax qw( $xml10_namestartchar_rx $xml10_namechar_rx );
  16 use XML::LibXML;
  17 use XML::LibXML::XPathContext;
  18 use Moose;
  19
  20 has _data => (
  21         isa      => 'Text::Tradition::Collation::Data',
  22         is       => 'ro',
  23         required => 1,
  24         handles  => [ qw(
  25                 sequence
  26                 paths
  27                 _set_relations
  28                 relations
  29                 _set_start
  30                 _set_end
  31                 ac_label
  32                 has_cached_table
  33                 relationships
  34                 relationship_types
  35                 related_readings
  36                 get_relationship
  37                 del_relationship
  38                 equivalence
  39                 equivalence_graph
  40                 readings
  41                 reading
  42                 _add_reading
  43                 del_reading
  44                 has_reading
  45                 wit_list_separator
  46                 baselabel
  47                 linear
  48                 wordsep
  49                 direction
  50                 start
  51                 end
  52                 cached_table
  53                 _graphcalc_done
  54                 has_cached_svg
  55                 wipe_table
  56         )]
  57 );
  58
  59 has 'tradition' => (
  60     is => 'ro',
  61     isa => 'Text::Tradition',
  62     writer => '_set_tradition',
  63     weak_ref => 1,
  64     );
  65
  66 =encoding utf8
  67
  68 =head1 NAME
  69
  70 Text::Tradition::Collation - a software model for a text collation
  71
  72 =head1 SYNOPSIS
  73
  74   use Text::Tradition;
  75   my $t = Text::Tradition->new(
  76     'name' => 'this is a text',
  77     'input' => 'TEI',
  78     'file' => '/path/to/tei_parallel_seg_file.xml' );
  79
  80   my $c = $t->collation;
  81   my @readings = $c->readings;
  82   my @paths = $c->paths;
  83   my @relationships = $c->relationships;
  84
  85   my $svg_variant_graph = $t->collation->as_svg();
  86
  87 =head1 DESCRIPTION
  88
  89 Text::Tradition is a library for representation and analysis of collated
  90 texts, particularly medieval ones.  The Collation is the central feature of
  91 a Tradition, where the text, its sequence of readings, and its relationships
  92 between readings are actually kept.
  93
  94 =head1 CONSTRUCTOR
  95
  96 =head2 new
  97
  98 The constructor.  Takes a hash or hashref of the following arguments:
  99
 100 =over
 101
 102 =item * tradition - The Text::Tradition object to which the collation
 103 belongs. Required.
 104
 105 =item * linear - Whether the collation should be linear; that is, whether
 106 transposed readings should be treated as two linked readings rather than one,
 107 and therefore whether the collation graph is acyclic.  Defaults to true.
 108
 109 =item * baselabel - The default label for the path taken by a base text
 110 (if any). Defaults to 'base text'.
 111
 112 =item * wit_list_separator - The string to join a list of witnesses for
 113 purposes of making labels in display graphs.  Defaults to ', '.
 114
 115 =item * ac_label - The extra label to tack onto a witness sigil when
 116 representing another layer of path for the given witness - that is, when
 117 a text has more than one possible reading due to scribal corrections or
 118 the like.  Defaults to ' (a.c.)'.
 119
 120 =item * wordsep - The string used to separate words in the original text.
 121 Defaults to ' '.
 122
 123 =back
 124
 125 =head1 ACCESSORS
 126
 127 =head2 tradition
 128
 129 =head2 linear
 130
 131 =head2 wit_list_separator
 132
 133 =head2 baselabel
 134
 135 =head2 ac_label
 136
 137 =head2 wordsep
 138
 139 Simple accessors for collation attributes.
 140
 141 =head2 start
 142
 143 The meta-reading at the start of every witness path.
 144
 145 =head2 end
 146
 147 The meta-reading at the end of every witness path.
 148
 149 =head2 readings
 150
 151 Returns all Reading objects in the graph.
 152
 153 =head2 reading( $id )
 154
 155 Returns the Reading object corresponding to the given ID.
 156
 157 =head2 add_reading( $reading_args )
 158
 159 Adds a new reading object to the collation.
 160 See L<Text::Tradition::Collation::Reading> for the available arguments.
 161
 162 =head2 del_reading( $object_or_id )
 163
 164 Removes the given reading from the collation, implicitly removing its
 165 paths and relationships.
 166
 167 =head2 has_reading( $id )
 168
 169 Predicate to see whether a given reading ID is in the graph.
 170
 171 =head2 reading_witnesses( $object_or_id )
 172
 173 Returns a list of sigils whose witnesses contain the reading.
 174
 175 =head2 paths
 176
 177 Returns all reading paths within the document - that is, all edges in the
 178 collation graph.  Each path is an arrayref of [ $source, $target ] reading IDs.
 179
 180 =head2 add_path( $source, $target, $sigil )
 181
 182 Links the given readings in the collation in sequence, under the given witness
 183 sigil.  The readings may be specified by object or ID.
 184
 185 =head2 del_path( $source, $target, $sigil )
 186
 187 Links the given readings in the collation in sequence, under the given witness
 188 sigil.  The readings may be specified by object or ID.
 189
 190 =head2 has_path( $source, $target );
 191
 192 Returns true if the two readings are linked in sequence in any witness.
 193 The readings may be specified by object or ID.
 194
 195 =head2 relationships
 196
 197 Returns all Relationship objects in the collation.
 198
 199 =head2 add_relationship( $reading, $other_reading, $options, $changed_readings )
 200
 201 Adds a new relationship of the type given in $options between the two readings,
 202 which may be specified by object or ID.  Returns a value of ( $status, @vectors)
 203 where $status is true on success, and @vectors is a list of relationship edges
 204 that were ultimately added. If an array reference is passed in as $changed_readings,
 205 then any readings that were altered due to the relationship creation are added to
 206 the array.
 207
 208 See L<Text::Tradition::Collation::Relationship> for the available options.
 209
 210 =cut
 211
 212 sub BUILDARGS {
 213         my ( $class, @args ) = @_;
 214         my %args = @args == 1 ? %{ $args[0] } : @args;
 215         # TODO determine these from the Moose::Meta object
 216         my @delegate_attrs = qw(sequence relations readings wit_list_separator baselabel
 217                 linear wordsep direction start end cached_table _graphcalc_done);
 218         my %data_args;
 219         for my $attr (@delegate_attrs) {
 220                 $data_args{$attr} = delete $args{$attr} if exists $args{$attr};
 221         }
 222         $args{_data} = Text::Tradition::Collation::Data->new(%data_args);
 223         return \%args;
 224 }
 225
 226 sub BUILD {
 227     my $self = shift;
 228     $self->_set_relations( Text::Tradition::Collation::RelationshipStore->new( 'collation' => $self ) );
 229     $self->_set_start( $self->add_reading(
 230         { 'collation' => $self, 'is_start' => 1, 'init' => 1 } ) );
 231     $self->_set_end( $self->add_reading(
 232         { 'collation' => $self, 'is_end' => 1, 'init' => 1 } ) );
 233 }
 234
 235 =head2 register_relationship_type( %relationship_definition )
 236
 237 Add a relationship type definition to this collation. The argument can be either a
 238 hash or a hashref, defining the properties of the relationship. For relationship types
 239 and their properties, see L<Text::Tradition::Collation::RelationshipType>.
 240
 241 =head2 get_relationship_type( $relationship_name )
 242
 243 Retrieve the RelationshipType object for the relationship with the given name.
 244
 245 =cut
 246
 247 sub register_relationship_type {
 248         my $self = shift;
 249         my %args = @_ == 1 ? %{$_[0]} : @_;
 250         if( $self->relations->has_type( $args{name} ) ) {
 251                 throw( 'Relationship type ' . $args{name} . ' already registered' );
 252         }
 253         $self->relations->add_type( %args );
 254 }
 255
 256 sub get_relationship_type {
 257         my( $self, $name ) = @_;
 258                 return $self->relations->has_type( $name )
 259                         ? $self->relations->type( $name ) : undef;
 260 }
 261
 262 ### Reading construct/destruct functions
 263
 264 sub add_reading {
 265         my( $self, $reading ) = @_;
 266         unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
 267                 my %args = %$reading;
 268                 if( $args{'init'} ) {
 269                         # If we are initializing an empty collation, don't assume that we
 270                         # have set a tradition.
 271                         delete $args{'init'};
 272                 } elsif( $self->tradition->can('language') && $self->tradition->has_language
 273                         && !exists $args{'language'} ) {
 274                         $args{'language'} = $self->tradition->language;
 275                 }
 276                 $reading = Text::Tradition::Collation::Reading->new(
 277                         'collation' => $self,
 278                         %args );
 279         }
 280         # First check to see if a reading with this ID exists.
 281         if( $self->reading( $reading->id ) ) {
 282                 throw( "Collation already has a reading with id " . $reading->id );
 283         }
 284         $self->_graphcalc_done(0);
 285         $self->_add_reading( $reading->id => $reading );
 286         # Once the reading has been added, put it in both graphs.
 287         $self->sequence->add_vertex( $reading->id );
 288         $self->relations->add_reading( $reading->id );
 289         return $reading;
 290 };
 291
 292 around del_reading => sub {
 293         my $orig = shift;
 294         my $self = shift;
 295         my $arg = shift;
 296
 297         if( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
 298                 $arg = $arg->id;
 299         }
 300         # Remove the reading from the graphs.
 301         $self->_graphcalc_done(0);
 302         $self->_clear_cache; # Explicitly clear caches to GC the reading
 303         $self->sequence->delete_vertex( $arg );
 304         $self->relations->delete_reading( $arg );
 305
 306         # Carry on.
 307         $self->$orig( $arg );
 308 };
 309
 310 =head2 merge_readings( $main, $second, $concatenate, $with_str )
 311
 312 Merges the $second reading into the $main one. If $concatenate is true, then
 313 the merged node will carry the text of both readings, concatenated with either
 314 $with_str (if specified) or a sensible default (the empty string if the
 315 appropriate 'join_*' flag is set on either reading, or else $self->wordsep.)
 316
 317 The first two arguments may be either readings or reading IDs.
 318
 319 =begin testing
 320
 321 use Text::Tradition;
 322 use TryCatch;
 323
 324 my $cxfile = 't/data/Collatex-16.xml';
 325 my $t = Text::Tradition->new(
 326     'name'  => 'inline',
 327     'input' => 'CollateX',
 328     'file'  => $cxfile,
 329     );
 330 my $c = $t->collation;
 331
 332 my $rno = scalar $c->readings;
 333 # Split n21 ('unto') for testing purposes
 334 my $new_r = $c->add_reading( { 'id' => 'n21p0', 'text' => 'un', 'join_next' => 1 } );
 335 my $old_r = $c->reading( 'n21' );
 336 $old_r->alter_text( 'to' );
 337 $c->del_path( 'n20', 'n21', 'A' );
 338 $c->add_path( 'n20', 'n21p0', 'A' );
 339 $c->add_path( 'n21p0', 'n21', 'A' );
 340 $c->add_relationship( 'n21', 'n22', { type => 'collated', scope => 'local' } );
 341 $c->flatten_ranks();
 342 ok( $c->reading( 'n21p0' ), "New reading exists" );
 343 is( scalar $c->readings, $rno, "Reading add offset by flatten_ranks" );
 344
 345 # Combine n3 and n4 ( with his )
 346 $c->merge_readings( 'n3', 'n4', 1 );
 347 ok( !$c->reading('n4'), "Reading n4 is gone" );
 348 is( $c->reading('n3')->text, 'with his', "Reading n3 has both words" );
 349
 350 # Collapse n9 and n10 ( rood / root )
 351 $c->merge_readings( 'n9', 'n10' );
 352 ok( !$c->reading('n10'), "Reading n10 is gone" );
 353 is( $c->reading('n9')->text, 'rood', "Reading n9 has an unchanged word" );
 354
 355 # Try to combine n21 and n21p0. This should break.
 356 my $remaining = $c->reading('n21');
 357 $remaining ||= $c->reading('n22');  # one of these should still exist
 358 try {
 359         $c->merge_readings( 'n21p0', $remaining, 1 );
 360         ok( 0, "Bad reading merge changed the graph" );
 361 } catch( Text::Tradition::Error $e ) {
 362         like( $e->message, qr/neither concatenated nor collated/, "Expected exception from bad concatenation" );
 363 } catch {
 364         ok( 0, "Unexpected error on bad reading merge: $@" );
 365 }
 366
 367 try {
 368         $c->calculate_ranks();
 369         ok( 1, "Graph is still evidently whole" );
 370 } catch( Text::Tradition::Error $e ) {
 371         ok( 0, "Caught a rank exception: " . $e->message );
 372 }
 373
 374 =end testing
 375
 376 =cut
 377
 378 sub merge_readings {
 379         my $self = shift;
 380
 381         # Sanity check
 382         my( $kept_obj, $del_obj, $combine, $combine_char ) = $self->_objectify_args( @_ );
 383         my $mergemeta = $kept_obj->is_meta;
 384         throw( "Cannot merge meta and non-meta reading" )
 385                 unless ( $mergemeta && $del_obj->is_meta )
 386                         || ( !$mergemeta && !$del_obj->is_meta );
 387         if( $mergemeta ) {
 388                 throw( "Cannot merge with start or end node" )
 389                         if( $kept_obj eq $self->start || $kept_obj eq $self->end
 390                                 || $del_obj eq $self->start || $del_obj eq $self->end );
 391                 throw( "Cannot combine text of meta readings" ) if $combine;
 392         }
 393         # We can only merge readings in a linear graph if:
 394         # - they are contiguous with only one edge between them, OR
 395         # - they are at equivalent ranks in the graph.
 396         if( $self->linear ) {
 397                 my @delpred = $del_obj->predecessors;
 398                 my @keptsuc = $kept_obj->successors;
 399                 unless ( @delpred == 1 && $delpred[0] eq $kept_obj
 400                         && @keptsuc == 1 && $keptsuc[0] eq $del_obj ) {
 401                         my( $is_ok, $msg ) = $self->relations->relationship_valid(
 402                                 $kept_obj, $del_obj, 'collated' );
 403                         unless( $is_ok ) {
 404                                 throw( "Readings $kept_obj and $del_obj can be neither concatenated nor collated" );
 405                         }
 406                 }
 407         }
 408
 409         # We only need the IDs for adding paths to the graph, not the reading
 410         # objects themselves.
 411         my $kept = $kept_obj->id;
 412         my $deleted = $del_obj->id;
 413         $self->_graphcalc_done(0);
 414
 415     # The kept reading should inherit the paths and the relationships
 416     # of the deleted reading.
 417         foreach my $path ( $self->sequence->edges_at( $deleted ) ) {
 418                 my @vector = ( $kept );
 419                 push( @vector, $path->[1] ) if $path->[0] eq $deleted;
 420                 unshift( @vector, $path->[0] ) if $path->[1] eq $deleted;
 421                 next if $vector[0] eq $vector[1]; # Don't add a self loop
 422                 my %wits = %{$self->sequence->get_edge_attributes( @$path )};
 423                 $self->sequence->add_edge( @vector );
 424                 my $fwits = $self->sequence->get_edge_attributes( @vector );
 425                 @wits{keys %$fwits} = values %$fwits;
 426                 $self->sequence->set_edge_attributes( @vector, \%wits );
 427         }
 428         $self->relations->merge_readings( $kept, $deleted, $combine );
 429
 430         # Do the deletion deed.
 431         if( $combine ) {
 432                 # Combine the text of the readings
 433                 my $joinstr = $combine_char;
 434                 unless( defined $joinstr ) {
 435                         $joinstr = '' if $kept_obj->join_next || $del_obj->join_prior;
 436                         $joinstr = $self->wordsep unless defined $joinstr;
 437                 }
 438                 $kept_obj->_combine( $del_obj, $joinstr );
 439         }
 440         $self->del_reading( $deleted );
 441 }
 442
 443 =head2 merge_related( @relationship_types )
 444
 445 Merge all readings linked with the relationship types given. If any of the selected type(s) is not a colocation, the graph will no longer be linear. The majority/plurality reading in each case will be the one kept.
 446
 447 WARNING: This operation cannot be undone.
 448
 449 =cut
 450
 451 =begin testing
 452
 453 use Test::Warn;
 454 use Text::Tradition;
 455 use TryCatch;
 456
 457 my $t;
 458 warnings_exist {
 459         $t = Text::Tradition->new( 'input' => 'Self', 'file' => 't/data/legendfrag.xml' );
 460 } [qr/Cannot set relationship on a meta reading/],
 461         "Got expected relationship drop warning on parse";
 462
 463 my $c = $t->collation;
 464 # Force the transitive propagation of all existing relationships.
 465 $c->relations->propagate_all_relationships();
 466
 467 my %rdg_ids;
 468 map { $rdg_ids{$_} = 1 } $c->readings;
 469 $c->merge_related( 'orthographic' );
 470 is( scalar( $c->readings ), keys( %rdg_ids ) - 9,
 471         "Successfully collapsed orthographic variation" );
 472 map { $rdg_ids{$_} = undef } qw/ r13.3 r11.4 r8.5 r8.2 r7.7 r7.5 r7.4 r7.3 r7.1 /;
 473 foreach my $rid ( keys %rdg_ids ) {
 474         my $exp = $rdg_ids{$rid};
 475         is( !$c->reading( $rid ), !$exp, "Reading $rid correctly " .
 476                 ( $exp ? "retained" : "removed" ) );
 477 }
 478 ok( $c->linear, "Graph is still linear" );
 479 try {
 480         $c->calculate_ranks; # This should succeed
 481         ok( 1, "Can still calculate ranks on the new graph" );
 482 } catch {
 483         ok( 0, "Rank calculation on merged graph failed: $@" );
 484 }
 485
 486 # Now add some transpositions
 487 $c->add_relationship( 'r8.4', 'r10.4', { type => 'transposition' } );
 488 $c->merge_related( 'transposition' );
 489 is( scalar( $c->readings ), keys( %rdg_ids ) - 10,
 490         "Transposed relationship is merged away" );
 491 ok( !$c->reading('r8.4'), "Correct transposed reading removed" );
 492 ok( !$c->linear, "Graph is no longer linear" );
 493 try {
 494         $c->calculate_ranks; # This should fail
 495         ok( 0, "Rank calculation happened on nonlinear graph?!" );
 496 } catch ( Text::Tradition::Error $e ) {
 497         is( $e->message, 'Cannot calculate ranks on a non-linear graph',
 498                 "Rank calculation on merged graph threw an error" );
 499 }
 500
 501 =end testing
 502
 503 =cut
 504
 505 # TODO: there should be a way to display merged without affecting the underlying data!
 506
 507 sub merge_related {
 508         my $self = shift;
 509         my %reltypehash;
 510         map { $reltypehash{$_} = 1 } @_;
 511
 512         # Set up the filter for finding related readings
 513         my $filter = sub {
 514                 exists $reltypehash{$_[0]->type};
 515         };
 516
 517         # Go through all readings looking for related ones
 518         foreach my $r ( $self->readings ) {
 519                 next unless $self->reading( "$r" ); # might have been deleted meanwhile
 520                 while( my @related = $self->related_readings( $r, $filter ) ) {
 521                         push( @related, $r );
 522                         @related = sort {
 523                                         scalar $b->witnesses <=> scalar $a->witnesses
 524                                 } @related;
 525                         my $keep = shift @related;
 526                         foreach my $delr ( @related ) {
 527                                 $self->linear( 0 )
 528                                         unless( $self->get_relationship( $keep, $delr )->colocated );
 529                                 $self->merge_readings( $keep, $delr );
 530                         }
 531                 }
 532         }
 533 }
 534
 535 =head2 compress_readings
 536
 537 Where possible in the graph, compresses plain sequences of readings into a
 538 single reading. The sequences must consist of readings with no
 539 relationships to other readings, with only a single witness path between
 540 them and no other witness paths from either that would skip the other. The
 541 readings must also not be marked as nonsense or bad grammar.
 542
 543 WARNING: This operation cannot be undone.
 544
 545 =begin testing
 546
 547 use Text::Tradition;
 548
 549 my $t = Text::Tradition->new( input => 'CollateX', file => 't/data/Collatex-16.xml' );
 550 my $c = $t->collation;
 551 my $n = scalar $c->readings;
 552 $c->compress_readings();
 553 is( scalar $c->readings, $n - 6, "Compressing readings seems to work" );
 554
 555 # Now put in a join-word and make sure the thing still works.
 556 my $t2 = Text::Tradition->new( input => 'CollateX', file => 't/data/Collatex-16.xml' );
 557 my $c2 = $t2->collation;
 558 # Split n21 ('unto') for testing purposes
 559 my $new_r = $c2->add_reading( { 'id' => 'n21p0', 'text' => 'un', 'join_next' => 1 } );
 560 my $old_r = $c2->reading( 'n21' );
 561 $old_r->alter_text( 'to' );
 562 $c2->del_path( 'n20', 'n21', 'A' );
 563 $c2->add_path( 'n20', 'n21p0', 'A' );
 564 $c2->add_path( 'n21p0', 'n21', 'A' );
 565 $c2->calculate_ranks();
 566 is( scalar $c2->readings, $n + 1, "We have our extra test reading" );
 567 $c2->compress_readings();
 568 is( scalar $c2->readings, $n - 6, "Compressing readings also works with join_next" );
 569 is( $c2->reading( 'n21p0' )->text, 'unto', "The joined word has no space" );
 570
 571
 572 =end testing
 573
 574 =cut
 575
 576 sub compress_readings {
 577         my $self = shift;
 578         # Sanity check: first save the original text of each witness.
 579         my %origtext;
 580         foreach my $wit ( $self->tradition->witnesses ) {
 581                 $origtext{$wit->sigil} = $self->path_text( $wit->sigil );
 582                 if( $wit->is_layered ) {
 583                         my $acsig = $wit->sigil . $self->ac_label;
 584                         $origtext{$acsig} = $self->path_text( $acsig );
 585                 }
 586         }
 587
 588         # Now do the deed.
 589         # Anywhere in the graph that there is a reading that joins only to a single
 590         # successor, and neither of these have any relationships, just join the two
 591         # readings.
 592         foreach my $rdg ( sort { $a->rank <=> $b->rank } $self->readings ) {
 593                 # Now look for readings that can be joined to their successors.
 594                 next unless $rdg->is_combinable;
 595                 my %seen;
 596                 while( $self->sequence->successors( $rdg ) == 1 ) {
 597                         my( $next ) = $self->reading( $self->sequence->successors( $rdg ) );
 598                         throw( "Infinite loop" ) if $seen{$next->id};
 599                         $seen{$next->id} = 1;
 600                         last if $self->sequence->predecessors( $next ) > 1;
 601                         last unless $next->is_combinable;
 602                         say "Joining readings $rdg and $next";
 603                         $self->merge_readings( $rdg, $next, 1 );
 604                 }
 605         }
 606
 607         # Finally, make sure we haven't screwed anything up.
 608         foreach my $wit ( $self->tradition->witnesses ) {
 609                 my $pathtext = $self->path_text( $wit->sigil );
 610                 throw( "Text differs for witness " . $wit->sigil )
 611                         unless $pathtext eq $origtext{$wit->sigil};
 612                 if( $wit->is_layered ) {
 613                         my $acsig = $wit->sigil . $self->ac_label;
 614                         $pathtext = $self->path_text( $acsig );
 615                         throw( "Layered text differs for witness " . $wit->sigil )
 616                                 unless $pathtext eq $origtext{$acsig};
 617                 }
 618         }
 619
 620         $self->relations->rebuild_equivalence();
 621         $self->calculate_ranks();
 622 }
 623
 624 # Helper function for manipulating the graph.
 625 sub _stringify_args {
 626         my( $self, $first, $second, @args ) = @_;
 627     $first = $first->id
 628         if ref( $first ) eq 'Text::Tradition::Collation::Reading';
 629     $second = $second->id
 630         if ref( $second ) eq 'Text::Tradition::Collation::Reading';
 631     return( $first, $second, @args );
 632 }
 633
 634 # Helper function for manipulating the graph.
 635 sub _objectify_args {
 636         my( $self, $first, $second, $arg ) = @_;
 637     $first = $self->reading( $first )
 638         unless ref( $first ) eq 'Text::Tradition::Collation::Reading';
 639     $second = $self->reading( $second )
 640         unless ref( $second ) eq 'Text::Tradition::Collation::Reading';
 641     return( $first, $second, $arg );
 642 }
 643
 644 =head2 duplicate_reading( $reading, @witlist )
 645
 646 Split the given reading into two, so that the new reading is in the path for
 647 the witnesses given in @witlist. If the result is that certain non-colocated
 648 relationships (e.g. transpositions) are no longer valid, these will be removed.
 649 Returns the newly-created reading.
 650
 651 =begin testing
 652
 653 use Test::More::UTF8;
 654 use Text::Tradition;
 655 use TryCatch;
 656
 657 my $st = Text::Tradition->new( 'input' => 'Self', 'file' => 't/data/collatecorr.xml' );
 658 is( ref( $st ), 'Text::Tradition', "Got a tradition from test file" );
 659 ok( $st->has_witness('Ba96'), "Tradition has the affected witness" );
 660
 661 my $sc = $st->collation;
 662 my $numr = 17;
 663 ok( $sc->reading('n131'), "Tradition has the affected reading" );
 664 is( scalar( $sc->readings ), $numr, "There are $numr readings in the graph" );
 665 is( $sc->end->rank, 14, "There are fourteen ranks in the graph" );
 666
 667 # Detach the erroneously collated reading
 668 my( $newr, @del_rdgs ) = $sc->duplicate_reading( 'n131', 'Ba96' );
 669 ok( $newr, "New reading was created" );
 670 ok( $sc->reading('n131_0'), "Detached the bad collation with a new reading" );
 671 is( scalar( $sc->readings ), $numr + 1, "A reading was added to the graph" );
 672 is( $sc->end->rank, 10, "There are now only ten ranks in the graph" );
 673 my $csucc = $sc->common_successor( 'n131', 'n131_0' );
 674 is( $csucc->id, 'n136', "Found correct common successor to duped reading" );
 675
 676 # Check that the bad transposition is gone
 677 is( scalar @del_rdgs, 1, "Deleted reading was returned by API call" );
 678 is( $sc->get_relationship( 'n130', 'n135' ), undef, "Bad transposition relationship is gone" );
 679
 680 # The collation should not be fixed
 681 my @pairs = $sc->identical_readings();
 682 is( scalar @pairs, 0, "Not re-collated yet" );
 683 # Fix the collation
 684 ok( $sc->merge_readings( 'n124', 'n131_0' ), "Collated the readings correctly" );
 685 @pairs = $sc->identical_readings( start => 'n124', end => $csucc->id );
 686 is( scalar @pairs, 3, "Found three more identical readings" );
 687 is( $sc->end->rank, 11, "The ranks shifted appropriately" );
 688 $sc->flatten_ranks();
 689 is( scalar( $sc->readings ), $numr - 3, "Now we are collated correctly" );
 690
 691 # Check that we can't "duplicate" a reading with no wits or with all wits
 692 try {
 693         my( $badr, @del_rdgs ) = $sc->duplicate_reading( 'n124' );
 694         ok( 0, "Reading duplication without witnesses throws an error" );
 695 } catch( Text::Tradition::Error $e ) {
 696         like( $e->message, qr/Must specify one or more witnesses/,
 697                 "Reading duplication without witnesses throws the expected error" );
 698 } catch {
 699         ok( 0, "Reading duplication without witnesses threw the wrong error" );
 700 }
 701
 702 try {
 703         my( $badr, @del_rdgs ) = $sc->duplicate_reading( 'n124', 'Ba96', 'Mü11475' );
 704         ok( 0, "Reading duplication with all witnesses throws an error" );
 705 } catch( Text::Tradition::Error $e ) {
 706         like( $e->message, qr/Cannot join all witnesses/,
 707                 "Reading duplication with all witnesses throws the expected error" );
 708 } catch {
 709         ok( 0, "Reading duplication with all witnesses threw the wrong error" );
 710 }
 711
 712 try {
 713         $sc->calculate_ranks();
 714         ok( 1, "Graph is still evidently whole" );
 715 } catch( Text::Tradition::Error $e ) {
 716         ok( 0, "Caught a rank exception: " . $e->message );
 717 }
 718
 719 =end testing
 720
 721 =cut
 722
 723 sub duplicate_reading {
 724         my( $self, $r, @wits ) = @_;
 725         # Check that we are not doing anything unwise.
 726         throw( "Must specify one or more witnesses for the duplicated reading" )
 727                 unless @wits;
 728         unless( ref( $r ) eq 'Text::Tradition::Collation::Reading' ) {
 729                 $r = $self->reading( $r );
 730         }
 731         throw( "Cannot duplicate a meta-reading" )
 732                 if $r->is_meta;
 733         throw( "Cannot join all witnesses to the new reading" )
 734                 if scalar( @wits ) == scalar( $r->witnesses );
 735
 736         # Get all the reading attributes and duplicate them.
 737         my $rmeta = Text::Tradition::Collation::Reading->meta;
 738         my %args;
 739     foreach my $attr( $rmeta->get_all_attributes ) {
 740                 next if $attr->name =~ /^_/;
 741                 my $acc = $attr->get_read_method;
 742                 if( !$acc && $attr->has_applied_traits ) {
 743                         my $tr = $attr->applied_traits;
 744                         if( $tr->[0] =~ /::(Array|Hash)$/ ) {
 745                                 my $which = $1;
 746                                 my %methods = reverse %{$attr->handles};
 747                                 $acc = $methods{elements};
 748                                 $args{$attr->name} = $which eq 'Array'
 749                                         ? [ $r->$acc ] : { $r->$acc };
 750                         }
 751                 } elsif( $acc ) {
 752                         my $attrval = $r->$acc;
 753                         if( defined $attrval ) {
 754                                 $args{$attr->name} = $attrval;
 755                         }
 756                 }
 757         }
 758         # By definition the new reading will no longer be common.
 759         $args{is_common} = 0;
 760         # The new reading also needs its own ID.
 761         $args{id} = $self->_generate_dup_id( $r->id );
 762
 763         # Try to make the new reading.
 764         my $newr = $self->add_reading( \%args );
 765         # The old reading is also no longer common.
 766         $r->is_common( 0 );
 767
 768         # For each of the witnesses, dissociate from the old reading and
 769         # associate with the new.
 770         foreach my $wit ( @wits ) {
 771                 my $prior = $self->prior_reading( $r, $wit );
 772                 my $next = $self->next_reading( $r, $wit );
 773                 $self->del_path( $prior, $r, $wit );
 774                 $self->add_path( $prior, $newr, $wit );
 775                 $self->del_path( $r, $next, $wit );
 776                 $self->add_path( $newr, $next, $wit );
 777         }
 778
 779         # If the graph is ranked, we need to look for relationships that are now
 780         # invalid (i.e. 'non-colocation' types that might now be colocated) and
 781         # remove them. If not, we can skip it.
 782         my $succ;
 783         my %rrk;
 784         my @deleted_relations;
 785         if( $self->end->has_rank ) {
 786                 # Find the point where we can stop checking
 787                 $succ = $self->common_successor( $r, $newr );
 788
 789                 # Hash the existing ranks
 790                 foreach my $rdg ( $self->readings ) {
 791                         $rrk{$rdg->id} = $rdg->rank;
 792                 }
 793                 # Calculate the new ranks
 794                 $self->calculate_ranks();
 795
 796                 # Check for invalid non-colocated relationships among changed-rank readings
 797                 # from where the ranks start changing up to $succ
 798                 my $lastrank = $succ->rank;
 799                 foreach my $rdg ( $self->readings ) {
 800                         next if $rdg->rank > $lastrank;
 801                         next if $rdg->rank == $rrk{$rdg->id};
 802                         my @noncolo = $rdg->related_readings( sub { !$_[0]->colocated } );
 803                         next unless @noncolo;
 804                         foreach my $nc ( @noncolo ) {
 805                                 unless( $self->relations->verify_or_delete( $rdg, $nc ) ) {
 806                                         push( @deleted_relations, [ $rdg->id, $nc->id ] );
 807                                 }
 808                         }
 809                 }
 810         }
 811         return ( $newr, @deleted_relations );
 812 }
 813
 814 sub _generate_dup_id {
 815         my( $self, $rid ) = @_;
 816         my $newid;
 817         my $i = 0;
 818         while( !$newid ) {
 819                 $newid = $rid."_$i";
 820                 if( $self->has_reading( $newid ) ) {
 821                         $newid = '';
 822                         $i++;
 823                 }
 824         }
 825         return $newid;
 826 }
 827
 828 ### Path logic
 829
 830 sub add_path {
 831         my $self = shift;
 832
 833         # We only need the IDs for adding paths to the graph, not the reading
 834         # objects themselves.
 835     my( $source, $target, $wit ) = $self->_stringify_args( @_ );
 836
 837         $self->_graphcalc_done(0);
 838         # Connect the readings
 839         unless( $self->sequence->has_edge( $source, $target ) ) {
 840             $self->sequence->add_edge( $source, $target );
 841             $self->relations->add_equivalence_edge( $source, $target );
 842         }
 843     # Note the witness in question
 844     $self->sequence->set_edge_attribute( $source, $target, $wit, 1 );
 845 }
 846
 847 sub del_path {
 848         my $self = shift;
 849         my @args;
 850         if( ref( $_[0] ) eq 'ARRAY' ) {
 851                 my $e = shift @_;
 852                 @args = ( @$e, @_ );
 853         } else {
 854                 @args = @_;
 855         }
 856
 857         # We only need the IDs for removing paths from the graph, not the reading
 858         # objects themselves.
 859     my( $source, $target, $wit ) = $self->_stringify_args( @args );
 860
 861         $self->_graphcalc_done(0);
 862         if( $self->sequence->has_edge_attribute( $source, $target, $wit ) ) {
 863                 $self->sequence->delete_edge_attribute( $source, $target, $wit );
 864         }
 865         unless( $self->sequence->has_edge_attributes( $source, $target ) ) {
 866                 $self->sequence->delete_edge( $source, $target );
 867                 $self->relations->delete_equivalence_edge( $source, $target );
 868         }
 869 }
 870
 871
 872 # Extra graph-alike utility
 873 sub has_path {
 874         my $self = shift;
 875     my( $source, $target, $wit ) = $self->_stringify_args( @_ );
 876         return undef unless $self->sequence->has_edge( $source, $target );
 877         return $self->sequence->has_edge_attribute( $source, $target, $wit );
 878 }
 879
 880 =head2 clear_witness( @sigil_list )
 881
 882 Clear the given witnesses out of the collation entirely, removing references
 883 to them in paths, and removing readings that belong only to them.  Should only
 884 be called via $tradition->del_witness.
 885
 886 =cut
 887
 888 sub clear_witness {
 889         my( $self, @sigils ) = @_;
 890
 891         $self->_graphcalc_done(0);
 892         # Clear the witness(es) out of the paths
 893         foreach my $e ( $self->paths ) {
 894                 foreach my $sig ( @sigils ) {
 895                         $self->del_path( $e, $sig );
 896                 }
 897         }
 898
 899         # Clear out the newly unused readings
 900         foreach my $r ( $self->readings ) {
 901                 unless( $self->reading_witnesses( $r ) ) {
 902                         $self->del_reading( $r );
 903                 }
 904         }
 905 }
 906
 907 sub add_relationship {
 908         my $self = shift;
 909     my( $source, $target, $opts, $altered_readings ) = $self->_stringify_args( @_ );
 910     my( @vectors ) = $self->relations->add_relationship( $source, $target, $opts );
 911     my $did_recalc;
 912     foreach my $v ( @vectors ) {
 913         my $rel = $self->get_relationship( $v );
 914         next unless $rel->colocated;
 915         my $r1 = $self->reading( $v->[0] );
 916         my $r2 =  $self->reading( $v->[1] );
 917         # If it's a spelling or orthographic relationship, and one is marked
 918         # as a lemma, set the normal form on the non-lemma to reflect that.
 919         if( $r1->does( 'Text::Tradition::Morphology' ) ) {
 920                 my @changed = $r1->relationship_added( $r2, $rel );
 921                 if( ref( $altered_readings ) eq 'ARRAY' ) {
 922                         push( @$altered_readings, @changed );
 923                 }
 924         }
 925         next if $did_recalc;
 926         if( $r1->has_rank && $r2->has_rank && $r1->rank ne $r2->rank ) {
 927                         $self->_graphcalc_done(0);
 928                         $self->_clear_cache;
 929                         $did_recalc = 1;
 930         }
 931     }
 932     return @vectors;
 933 }
 934
 935 around qw/ get_relationship del_relationship / => sub {
 936         my $orig = shift;
 937         my $self = shift;
 938         my @args = @_;
 939         if( @args == 1 && ref( $args[0] ) eq 'ARRAY' ) {
 940                 @args = @{$_[0]};
 941         }
 942         my @stringargs = $self->_stringify_args( @args );
 943         $self->$orig( @stringargs );
 944 };
 945
 946 =head2 reading_witnesses( $reading )
 947
 948 Return a list of sigils corresponding to the witnesses in which the reading appears.
 949
 950 =cut
 951
 952 sub reading_witnesses {
 953         my( $self, $reading ) = @_;
 954         # We need only check either the incoming or the outgoing edges; I have
 955         # arbitrarily chosen "incoming".  Thus, special-case the start node.
 956         if( $reading eq $self->start ) {
 957                 return map { $_->sigil } grep { $_->is_collated } $self->tradition->witnesses;
 958         }
 959         my %all_witnesses;
 960         foreach my $e ( $self->sequence->edges_to( $reading ) ) {
 961                 my $wits = $self->sequence->get_edge_attributes( @$e );
 962                 @all_witnesses{ keys %$wits } = 1;
 963         }
 964         my $acstr = $self->ac_label;
 965         foreach my $acwit ( grep { $_ =~ s/^(.*)\Q$acstr\E$/$1/ } keys %all_witnesses ) {
 966                 delete $all_witnesses{$acwit.$acstr} if exists $all_witnesses{$acwit};
 967         }
 968         return keys %all_witnesses;
 969 }
 970
 971 =head1 OUTPUT METHODS
 972
 973 =head2 as_svg( \%options )
 974
 975 Returns an SVG string that represents the graph, via as_dot and graphviz.
 976 See as_dot for a list of options.  Must have GraphViz (dot) installed to run.
 977
 978 =begin testing
 979
 980 use File::Which;
 981 use Text::Tradition;
 982 use XML::LibXML;
 983 use XML::LibXML::XPathContext;
 984
 985
 986 SKIP: {
 987         skip( 'Need Graphviz installed to test graphs', 16 )
 988                 unless File::Which::which( 'dot' );
 989
 990         my $datafile = 't/data/Collatex-16.xml';
 991
 992         my $tradition = Text::Tradition->new(
 993                 'name'  => 'inline',
 994                 'input' => 'CollateX',
 995                 'file'  => $datafile,
 996                 );
 997         my $collation = $tradition->collation;
 998
 999         # Test the svg creation
1000         my $parser = XML::LibXML->new();
1001         $parser->load_ext_dtd( 0 );
1002         my $svg = $parser->parse_string( $collation->as_svg() );
1003         is( $svg->documentElement->nodeName(), 'svg', 'Got an svg document' );
1004
1005         # Test for the correct number of nodes in the SVG
1006         my $svg_xpc = XML::LibXML::XPathContext->new( $svg->documentElement() );
1007         $svg_xpc->registerNs( 'svg', 'http://www.w3.org/2000/svg' );
1008         my @svg_nodes = $svg_xpc->findnodes( '//svg:g[@class="node"]' );
1009         is( scalar @svg_nodes, 26, "Correct number of nodes in the graph" );
1010
1011         # Test for the correct number of edges
1012         my @svg_edges = $svg_xpc->findnodes( '//svg:g[@class="edge"]' );
1013         is( scalar @svg_edges, 32, "Correct number of edges in the graph" );
1014
1015         # Test svg creation for a subgraph
1016         my $part_svg = $parser->parse_string( $collation->as_svg( { from => 15 } ) ); # start, no end
1017         is( $part_svg->documentElement->nodeName(), 'svg', "Got an svg subgraph to end" );
1018         my $part_xpc = XML::LibXML::XPathContext->new( $part_svg->documentElement() );
1019         $part_xpc->registerNs( 'svg', 'http://www.w3.org/2000/svg' );
1020         @svg_nodes = $part_xpc->findnodes( '//svg:g[@class="node"]' );
1021         is( scalar( @svg_nodes ), 9,
1022                 "Correct number of nodes in the subgraph" );
1023         @svg_edges = $part_xpc->findnodes( '//svg:g[@class="edge"]' );
1024         is( scalar( @svg_edges ), 10,
1025                 "Correct number of edges in the subgraph" );
1026
1027         $part_svg = $parser->parse_string( $collation->as_svg( { from => 10, to => 13 } ) ); # start, no end
1028         is( $part_svg->documentElement->nodeName(), 'svg', "Got an svg subgraph in the middle" );
1029         $part_xpc = XML::LibXML::XPathContext->new( $part_svg->documentElement() );
1030         $part_xpc->registerNs( 'svg', 'http://www.w3.org/2000/svg' );
1031         @svg_nodes = $part_xpc->findnodes( '//svg:g[@class="node"]' );
1032         is( scalar( @svg_nodes ), 9,
1033                 "Correct number of nodes in the subgraph" );
1034         @svg_edges = $part_xpc->findnodes( '//svg:g[@class="edge"]' );
1035         is( scalar( @svg_edges ), 11,
1036                 "Correct number of edges in the subgraph" );
1037
1038
1039         $part_svg = $parser->parse_string( $collation->as_svg( { to => 5 } ) ); # start, no end
1040         is( $part_svg->documentElement->nodeName(), 'svg', "Got an svg subgraph from start" );
1041         $part_xpc = XML::LibXML::XPathContext->new( $part_svg->documentElement() );
1042         $part_xpc->registerNs( 'svg', 'http://www.w3.org/2000/svg' );
1043         @svg_nodes = $part_xpc->findnodes( '//svg:g[@class="node"]' );
1044         is( scalar( @svg_nodes ), 7,
1045                 "Correct number of nodes in the subgraph" );
1046         @svg_edges = $part_xpc->findnodes( '//svg:g[@class="edge"]' );
1047         is( scalar( @svg_edges ), 7,
1048                 "Correct number of edges in the subgraph" );
1049
1050         # Test a right-to-left graph
1051         my $arabic = Text::Tradition->new(
1052                 input => 'Tabular',
1053                 sep_char => ',',
1054                 name => 'arabic',
1055                 direction => 'RL',
1056                 file => 't/data/arabic_snippet.csv' );
1057         my $rl_svg = $parser->parse_string( $arabic->collation->as_svg() );
1058         is( $rl_svg->documentElement->nodeName(), 'svg', "Got an svg subgraph from start" );
1059         my $rl_xpc = XML::LibXML::XPathContext->new( $rl_svg->documentElement() );
1060         $rl_xpc->registerNs( 'svg', 'http://www.w3.org/2000/svg' );
1061         my %node_cx;
1062         foreach my $node ( $rl_xpc->findnodes( '//svg:g[@class="node"]' ) ) {
1063                 my $nid = $node->getAttribute('id');
1064                 $node_cx{$nid} = $rl_xpc->findvalue( './svg:ellipse/@cx', $node );
1065         }
1066         my @sorted = sort { $node_cx{$a} <=> $node_cx{$b} } keys( %node_cx );
1067         is( $sorted[0], '__END__', "End node is the leftmost" );
1068         is( $sorted[$#sorted], '__START__', "Start node is the rightmost" );
1069
1070 } #SKIP
1071
1072 =end testing
1073
1074 =cut
1075
1076 sub as_svg {
1077     my( $self, $opts ) = @_;
1078     throw( "Need GraphViz installed to output SVG" )
1079         unless File::Which::which( 'dot' );
1080     my $want_subgraph = exists $opts->{'from'} || exists $opts->{'to'};
1081     $self->calculate_ranks()
1082         unless( $self->_graphcalc_done || $opts->{'nocalc'} || !$self->linear );
1083         my @cmd = qw/dot -Tsvg/;
1084         my( $svg, $err );
1085         my $dotfile = File::Temp->new();
1086         ## USE FOR DEBUGGING
1087         # $dotfile->unlink_on_destroy(0);
1088         binmode $dotfile, ':utf8';
1089         print $dotfile $self->as_dot( $opts );
1090         push( @cmd, $dotfile->filename );
1091         run( \@cmd, ">", binary(), \$svg );
1092         $svg = decode_utf8( $svg );
1093         return $svg;
1094 }
1095
1096
1097 =head2 as_dot( \%options )
1098
1099 Returns a string that is the collation graph expressed in dot
1100 (i.e. GraphViz) format.  Options include:
1101
1102 =over 4
1103
1104 =item * from
1105
1106 =item * to
1107
1108 =item * color_common
1109
1110 =back
1111
1112 =cut
1113
1114 sub as_dot {
1115     my( $self, $opts ) = @_;
1116     my $startrank = $opts->{'from'} if $opts;
1117     my $endrank = $opts->{'to'} if $opts;
1118     my $color_common = $opts->{'color_common'} if $opts;
1119     my $STRAIGHTENHACK = !$startrank && !$endrank && $self->end->rank
1120        && $self->end->rank > 100;
1121     $STRAIGHTENHACK = 1 if $opts->{'straight'}; # even for subgraphs or small graphs
1122
1123     # Check the arguments
1124     if( $startrank ) {
1125         return if $endrank && $startrank > $endrank;
1126         return if $startrank > $self->end->rank;
1127         }
1128         if( defined $endrank ) {
1129                 return if $endrank < 0;
1130                 $endrank = undef if $endrank == $self->end->rank;
1131         }
1132
1133     my $graph_name = $self->tradition->name;
1134     $graph_name =~ s/[^\w\s]//g;
1135     $graph_name = join( '_', split( /\s+/, $graph_name ) );
1136
1137     my %graph_attrs = (
1138         'bgcolor' => 'none',
1139         );
1140     unless( $self->direction eq 'BI' ) {
1141         $graph_attrs{rankdir} = $self->direction;
1142     }
1143     my %node_attrs = (
1144         'fontsize' => 14,
1145         'fillcolor' => 'white',
1146         'style' => 'filled',
1147         'shape' => 'ellipse'
1148         );
1149     my %edge_attrs = (
1150         'arrowhead' => 'open',
1151         'color' => '#000000',
1152         'fontcolor' => '#000000',
1153         );
1154
1155     my $dot = sprintf( "digraph %s {\n", $graph_name );
1156     $dot .= "\tgraph " . _dot_attr_string( \%graph_attrs ) . ";\n";
1157     $dot .= "\tnode " . _dot_attr_string( \%node_attrs ) . ";\n";
1158
1159         # Output substitute start/end readings if necessary
1160         if( $startrank ) {
1161                 $dot .= "\t\"__SUBSTART__\" [ label=\"...\",id=\"__START__\" ];\n";
1162         }
1163         if( $endrank ) {
1164                 $dot .= "\t\"__SUBEND__\" [ label=\"...\",id=\"__END__\" ];\n";
1165         }
1166         if( $STRAIGHTENHACK ) {
1167                 ## HACK part 1
1168                 my $startlabel = $startrank ? '__SUBSTART__' : '__START__';
1169                 $dot .= "\tsubgraph { rank=same \"$startlabel\" \"#SILENT#\" }\n";
1170                 $dot .= "\t\"#SILENT#\" [ shape=diamond,color=white,penwidth=0,label=\"\" ];"
1171         }
1172         my %used;  # Keep track of the readings that actually appear in the graph
1173         # Sort the readings by rank if we have ranks; this speeds layout.
1174         my @all_readings = $self->end->has_rank
1175                 ? sort { $a->rank <=> $b->rank } $self->readings
1176                 : $self->readings;
1177         # TODO Refrain from outputting lacuna nodes - just grey out the edges.
1178     foreach my $reading ( @all_readings ) {
1179         # Only output readings within our rank range.
1180         next if $startrank && $reading->rank < $startrank;
1181         next if $endrank && $reading->rank > $endrank;
1182         $used{$reading->id} = 1;
1183         # Need not output nodes without separate labels
1184         next if $reading->id eq $reading->text;
1185         my $rattrs;
1186         my $label = $reading->text;
1187         unless( $label =~ /^[[:punct:]]+$/ ) {
1188                 $label .= '-' if $reading->join_next;
1189             $label = "-$label" if $reading->join_prior;
1190         }
1191         $label =~ s/\"/\\\"/g;
1192                 $rattrs->{'label'} = $label;
1193                 $rattrs->{'id'} = $reading->id;
1194                 $rattrs->{'fillcolor'} = '#b3f36d' if $reading->is_common && $color_common;
1195         $dot .= sprintf( "\t\"%s\" %s;\n", $reading->id, _dot_attr_string( $rattrs ) );
1196     }
1197
1198         # Add the real edges.
1199     my @edges = $self->paths;
1200         my( %substart, %subend );
1201     foreach my $edge ( @edges ) {
1202         # Do we need to output this edge?
1203         if( $used{$edge->[0]} && $used{$edge->[1]} ) {
1204                 my $label = $self->_path_display_label( $opts,
1205                         $self->path_witnesses( $edge ) );
1206                         my $variables = { %edge_attrs, 'label' => $label };
1207
1208                         # Account for the rank gap if necessary
1209                         my $rank0 = $self->reading( $edge->[0] )->rank
1210                                 if $self->reading( $edge->[0] )->has_rank;
1211                         my $rank1 = $self->reading( $edge->[1] )->rank
1212                                 if $self->reading( $edge->[1] )->has_rank;
1213                         if( defined $rank0 && defined $rank1 && $rank1 - $rank0 > 1 ) {
1214                                 $variables->{'minlen'} = $rank1 - $rank0;
1215                         }
1216
1217                         # EXPERIMENTAL: make edge width reflect no. of witnesses
1218                         my $extrawidth = scalar( $self->path_witnesses( $edge ) ) * 0.2;
1219                         $variables->{'penwidth'} = $extrawidth + 0.8; # gives 1 for a single wit
1220
1221                         my $varopts = _dot_attr_string( $variables );
1222                         $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
1223                                 $edge->[0], $edge->[1], $varopts );
1224         } elsif( $used{$edge->[0]} ) {
1225                 $subend{$edge->[0]} = $edge->[1];
1226         } elsif( $used{$edge->[1]} ) {
1227                 $substart{$edge->[1]} = $edge->[0];
1228         }
1229     }
1230
1231     # If we are asked to, add relationship links
1232     if( exists $opts->{show_relations} ) {
1233         my $filter = $opts->{show_relations}; # can be 'transposition' or 'all'
1234         if( $filter eq 'transposition' ) {
1235                 $filter =~ qr/^transposition$/;
1236         }
1237         my %typecolors;
1238         my @types = sort( map { $_->name } $self->relations->types );
1239         if( exists $opts->{graphcolors} ) {
1240                 foreach my $tdx ( 0 .. $#types ) {
1241                         $typecolors{$types[$tdx]} = $opts->{graphcolors}->[$tdx];
1242                 }
1243         } else {
1244                 map { $typecolors{$_} = '#FFA14F' } @types;
1245         }
1246         foreach my $redge ( $self->relationships ) {
1247                 if( $used{$redge->[0]} && $used{$redge->[1]} ) {
1248                                 my $rel = $self->get_relationship( $redge );
1249                                 next unless $filter eq 'all' || $rel->type =~ /$filter/;
1250                                 my $variables = {
1251                                         arrowhead => 'none',
1252                                         color => $typecolors{$rel->type},
1253                                         constraint => 'false',
1254                                         penwidth => '3',
1255                                 };
1256                                 unless( exists $opts->{graphcolors} ) {
1257                                         $variables->{label} = uc( substr( $rel->type, 0, 4 ) ),
1258                                 }
1259                                 $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
1260                                         $redge->[0], $redge->[1], _dot_attr_string( $variables ) );
1261                 }
1262         }
1263     }
1264
1265     # Add substitute start and end edges if necessary
1266     foreach my $node ( keys %substart ) {
1267         my $witstr = $self->_path_display_label( $opts,
1268                 $self->path_witnesses( $substart{$node}, $node ) );
1269         my $variables = { %edge_attrs, 'label' => $witstr };
1270         my $nrdg = $self->reading( $node );
1271         if( $nrdg->has_rank && $nrdg->rank > $startrank ) {
1272                 # Substart is actually one lower than $startrank
1273                 $variables->{'minlen'} = $nrdg->rank - ( $startrank - 1 );
1274         }
1275         my $varopts = _dot_attr_string( $variables );
1276         $dot .= "\t\"__SUBSTART__\" -> \"$node\" $varopts;\n";
1277         }
1278     foreach my $node ( keys %subend ) {
1279         my $witstr = $self->_path_display_label( $opts,
1280                 $self->path_witnesses( $node, $subend{$node} ) );
1281         my $variables = { %edge_attrs, 'label' => $witstr };
1282         my $varopts = _dot_attr_string( $variables );
1283         $dot .= "\t\"$node\" -> \"__SUBEND__\" $varopts;\n";
1284         }
1285         # HACK part 2
1286         if( $STRAIGHTENHACK ) {
1287                 my $endlabel = $endrank ? '__SUBEND__' : '__END__';
1288                 $dot .= "\t\"$endlabel\" -> \"#SILENT#\" [ color=white,penwidth=0 ];\n";
1289         }
1290
1291     $dot .= "}\n";
1292     return $dot;
1293 }
1294
1295 sub _dot_attr_string {
1296         my( $hash ) = @_;
1297         my @attrs;
1298         foreach my $k ( sort keys %$hash ) {
1299                 my $v = $hash->{$k};
1300                 push( @attrs, $k.'="'.$v.'"' );
1301         }
1302         return( '[ ' . join( ', ', @attrs ) . ' ]' );
1303 }
1304
1305 =head2 path_witnesses( $edge )
1306
1307 Returns the list of sigils whose witnesses are associated with the given edge.
1308 The edge can be passed as either an array or an arrayref of ( $source, $target ).
1309
1310 =cut
1311
1312 sub path_witnesses {
1313         my( $self, @edge ) = @_;
1314         # If edge is an arrayref, cope.
1315         if( @edge == 1 && ref( $edge[0] ) eq 'ARRAY' ) {
1316                 my $e = shift @edge;
1317                 @edge = @$e;
1318         }
1319         my @wits = keys %{$self->sequence->get_edge_attributes( @edge )};
1320         return @wits;
1321 }
1322
1323 # Helper function. Make a display label for the given witnesses, showing a.c.
1324 # witnesses only where the main witness is not also in the list.
1325 sub _path_display_label {
1326         my $self = shift;
1327         my $opts = shift;
1328         my %wits;
1329         map { $wits{$_} = 1 } @_;
1330
1331         # If an a.c. wit is listed, remove it if the main wit is also listed.
1332         # Otherwise keep it for explicit listing.
1333         my $aclabel = $self->ac_label;
1334         my @disp_ac;
1335         foreach my $w ( sort keys %wits ) {
1336                 if( $w =~ /^(.*)\Q$aclabel\E$/ ) {
1337                         if( exists $wits{$1} ) {
1338                                 delete $wits{$w};
1339                         } else {
1340                                 push( @disp_ac, $w );
1341                         }
1342                 }
1343         }
1344
1345         if( $opts->{'explicit_wits'} ) {
1346                 return join( ', ', sort keys %wits );
1347         } else {
1348                 # See if we are in a majority situation.
1349                 my $maj = scalar( $self->tradition->witnesses ) * 0.6;
1350                 $maj = $maj > 5 ? $maj : 5;
1351                 if( scalar keys %wits > $maj ) {
1352                         unshift( @disp_ac, 'majority' );
1353                         return join( ', ', @disp_ac );
1354                 } else {
1355                         return join( ', ', sort keys %wits );
1356                 }
1357         }
1358 }
1359
1360 =head2 as_adjacency_list
1361
1362 Returns a JSON structure that represents the collation sequence graph.
1363
1364 =begin testing
1365
1366 use JSON qw/ from_json /;
1367 use Text::Tradition;
1368
1369 my $t = Text::Tradition->new(
1370         'input' => 'Self',
1371         'file' => 't/data/florilegium_graphml.xml' );
1372 my $c = $t->collation;
1373
1374 # Make a connection so we can test rank preservation
1375 $c->add_relationship( 'w91', 'w92', { type => 'grammatical' } );
1376
1377 # Create an adjacency list of the whole thing; test the output.
1378 my $adj_whole = from_json( $c->as_adjacency_list() );
1379 is( scalar @$adj_whole, scalar $c->readings(),
1380         "Same number of nodes in graph and adjacency list" );
1381 my @adj_whole_edges;
1382 map { push( @adj_whole_edges, @{$_->{adjacent}} ) } @$adj_whole;
1383 is( scalar @adj_whole_edges, scalar $c->sequence->edges,
1384         "Same number of edges in graph and adjacency list" );
1385 # Find the reading whose rank should be preserved
1386 my( $test_rdg ) = grep { $_->{id} eq 'w89' } @$adj_whole;
1387 my( $test_edge ) = grep { $_->{id} eq 'w92' } @{$test_rdg->{adjacent}};
1388 is( $test_edge->{minlen}, 2, "Rank of test reading is preserved" );
1389
1390 # Now create an adjacency list of just a portion. w76 to w122
1391 my $adj_part = from_json( $c->as_adjacency_list(
1392         { from => $c->reading('w76')->rank,
1393           to   => $c->reading('w122')->rank }));
1394 is( scalar @$adj_part, 48, "Correct number of nodes in partial graph" );
1395 my @adj_part_edges;
1396 map { push( @adj_part_edges, @{$_->{adjacent}} ) } @$adj_part;
1397 is( scalar @adj_part_edges, 58,
1398         "Same number of edges in partial graph and adjacency list" );
1399 # Check for consistency
1400 my %part_nodes;
1401 map { $part_nodes{$_->{id}} = 1 } @$adj_part;
1402 foreach my $edge ( @adj_part_edges ) {
1403         my $testid = $edge->{id};
1404         ok( $part_nodes{$testid}, "ID $testid referenced in edge is given as node" );
1405 }
1406
1407 =end testing
1408
1409 =cut
1410
1411 sub as_adjacency_list {
1412         my( $self, $opts ) = @_;
1413         # Make a structure that contains all the nodes, the nodes they point to,
1414         # and the attributes of the edges that connect them.
1415         # [ { id: 'n0', label: 'Gallia', adjacent: [
1416         #               { id: 'n1', label: 'P Q' } ,
1417         #               { id: 'n2', label: 'R S', minlen: 2 } ] },
1418         #   { id: 'n1', label: 'est', adjacent: [ ... ] },
1419         #   ... ]
1420         my $startrank = $opts->{'from'} || 0;
1421         my $endrank = $opts->{'to'} || $self->end->rank;
1422
1423     $self->calculate_ranks()
1424         unless( $self->_graphcalc_done || $opts->{'nocalc'} || !$self->linear );
1425         my $list = [];
1426         foreach my $rdg ( $self->readings ) {
1427                 my @successors;
1428                 my $phony = '';
1429                 # Figure out what the node's successors should be.
1430                 if( $rdg eq $self->start && $startrank > 0 ) {
1431                         # Connect the start node with all the nodes at startrank.
1432                         # Lacunas should be included only if the node really has that rank.
1433                         @successors = $self->readings_at_rank( $startrank, 1 );
1434                         $phony = 'start';
1435                 } elsif( $rdg->rank < $startrank
1436                                  || $rdg->rank > $endrank && $rdg ne $self->end ) {
1437                         next;
1438                 } else {
1439                         @successors = $rdg->successors;
1440                 }
1441                 # Make sure that the end node is at the end of the successors
1442                 # list if it is needed.
1443                 if( grep { $_ eq $self->end } @successors ) {
1444                         my @ts = grep { $_ ne $self->end } @successors;
1445                         @successors = ( @ts, $self->end );
1446                 } elsif ( grep { $_->rank > $endrank } @successors ) {
1447                         push( @successors, $self->end );
1448                 }
1449
1450                 my $listitem = { id => $rdg->id, label => $rdg->text };
1451                 my $adjacent = [];
1452                 my @endwits;
1453                 foreach my $succ ( @successors ) {
1454                         my @edgewits;
1455                         if( $phony eq 'start' ) {
1456                                 @edgewits = $succ->witnesses;
1457                         } elsif( $self->sequence->has_edge( $rdg->id, $succ->id ) ) {
1458                                 @edgewits = $self->path_witnesses( $rdg->id, $succ->id );
1459                         }
1460
1461                         if( $succ eq $self->end ) {
1462                                 @edgewits = @endwits;
1463                         } elsif( $succ->rank > $endrank ) {
1464                                 # These witnesses will point to 'end' instead, not to the
1465                                 # actual successor.
1466                                 push( @endwits, @edgewits );
1467                                 next;
1468                         }
1469                         my $edgelabel = $self->_path_display_label( $opts, @edgewits );
1470                         my $edgedef = { id => $succ->id, label => $edgelabel };
1471                         my $rankoffset = $succ->rank - $rdg->rank;
1472                         if( $rankoffset > 1 and $succ ne $self->end ) {
1473                                 $edgedef->{minlen} = $rankoffset;
1474                         }
1475                         push( @$adjacent, $edgedef );
1476                 }
1477                 $listitem->{adjacent} = $adjacent;
1478                 push( @$list, $listitem );
1479         }
1480         return to_json( $list );
1481 }
1482
1483 =head2 as_graphml
1484
1485 Returns a GraphML representation of the collation.  The GraphML will contain
1486 two graphs. The first expresses the attributes of the readings and the witness
1487 paths that link them; the second expresses the relationships that link the
1488 readings.  This is the native transfer format for a tradition.
1489
1490 =begin testing
1491
1492 use Text::Tradition;
1493 use TryCatch;
1494
1495 my $READINGS = 311;
1496 my $PATHS = 361;
1497
1498 my $datafile = 't/data/florilegium_tei_ps.xml';
1499 my $tradition = Text::Tradition->new( 'input' => 'TEI',
1500                                       'name' => 'test0',
1501                                       'file' => $datafile,
1502                                       'linear' => 1 );
1503
1504 ok( $tradition, "Got a tradition object" );
1505 is( scalar $tradition->witnesses, 13, "Found all witnesses" );
1506 ok( $tradition->collation, "Tradition has a collation" );
1507
1508 my $c = $tradition->collation;
1509 is( scalar $c->readings, $READINGS, "Collation has all readings" );
1510 is( scalar $c->paths, $PATHS, "Collation has all paths" );
1511 is( scalar $c->relationships, 0, "Collation has all relationships" );
1512
1513 # Add a few relationships
1514 $c->add_relationship( 'w123', 'w125', { 'type' => 'collated' } );
1515 $c->add_relationship( 'w193', 'w196', { 'type' => 'collated' } );
1516 $c->add_relationship( 'w257', 'w262', { 'type' => 'transposition',
1517                                           'is_significant' => 'yes' } );
1518
1519 # Now write it to GraphML and parse it again.
1520
1521 my $graphml = $c->as_graphml;
1522 my $st = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml );
1523 is( scalar $st->collation->readings, $READINGS, "Reparsed collation has all readings" );
1524 is( scalar $st->collation->paths, $PATHS, "Reparsed collation has all paths" );
1525 is( scalar $st->collation->relationships, 3, "Reparsed collation has new relationships" );
1526 my $sigrel = $st->collation->get_relationship( 'w257', 'w262' );
1527 is( $sigrel->is_significant, 'yes', "Ternary attribute value was restored" );
1528
1529 # Now add a stemma, write to GraphML, and look at the output.
1530 SKIP: {
1531         skip "Analysis module not present", 3 unless $tradition->can( 'add_stemma' );
1532         my $stemma = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
1533         is( ref( $stemma ), 'Text::Tradition::Stemma', "Parsed dotfile into stemma" );
1534         is( $tradition->stemmata, 1, "Tradition now has the stemma" );
1535         $graphml = $c->as_graphml;
1536         like( $graphml, qr/digraph/, "Digraph declaration exists in GraphML" );
1537 }
1538
1539 =end testing
1540
1541 =cut
1542
1543 ## TODO MOVE this to Tradition.pm and modularize it better
1544 sub as_graphml {
1545     my( $self, $options ) = @_;
1546         $self->calculate_ranks unless $self->_graphcalc_done;
1547
1548         my $start = $options->{'from'}
1549                 ? $self->reading( $options->{'from'} ) : $self->start;
1550         my $end = $options->{'to'}
1551                 ? $self->reading( $options->{'to'} ) : $self->end;
1552         if( $start->has_rank && $end->has_rank && $end->rank < $start->rank ) {
1553                 throw( 'Start node must be before end node' );
1554         }
1555         # The readings need to be ranked for this to work.
1556         $start = $self->start unless $start->has_rank;
1557         $end = $self->end unless $end->has_rank;
1558         my $rankoffset = 0;
1559         unless( $start eq $self->start ) {
1560                 $rankoffset = $start->rank - 1;
1561         }
1562         my %use_readings;
1563
1564     # Some namespaces
1565     my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns';
1566     my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance';
1567     my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' .
1568         'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
1569
1570     # Create the document and root node
1571     require XML::LibXML;
1572     my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
1573     my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
1574     $graphml->setDocumentElement( $root );
1575     $root->setNamespace( $xsi_ns, 'xsi', 0 );
1576     $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
1577
1578     # List of attribute types to save on our objects and their corresponding
1579     # GraphML types
1580     my %save_types = (
1581         'Str' => 'string',
1582         'Int' => 'int',
1583         'Bool' => 'boolean',
1584         'ReadingID' => 'string',
1585         'RelationshipType' => 'string',
1586         'RelationshipScope' => 'string',
1587         'Ternary' => 'string',
1588     );
1589
1590     # Add the data keys for the graph. Include an extra key 'version' for the
1591     # GraphML output version.
1592     my %graph_data_keys;
1593     my $gdi = 0;
1594     my %graph_attributes = ( 'version' => 'string' );
1595         # Graph attributes include those of Tradition and those of Collation.
1596         my %gattr_from;
1597         # TODO Use meta introspection method from duplicate_reading to do this
1598         # instead of naming custom keys.
1599         my $tmeta = $self->tradition->meta;
1600         my $cmeta = $self->meta;
1601         map { $gattr_from{$_->name} = 'Tradition' } $tmeta->get_all_attributes;
1602         map { $gattr_from{$_->name} = 'Collation' } $cmeta->get_all_attributes;
1603         foreach my $attr ( ( $tmeta->get_all_attributes, $cmeta->get_all_attributes ) ) {
1604                 next if $attr->name =~ /^_/;
1605                 next unless $save_types{$attr->type_constraint->name};
1606                 $graph_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1607         }
1608     # Extra custom keys for complex objects that should be saved in some form.
1609     # The subroutine should return a string, or undef/empty.
1610     if( $tmeta->has_method('stemmata') ) {
1611                 $graph_attributes{'stemmata'} = sub {
1612                         my @stemstrs;
1613                         map { push( @stemstrs, $_->editable( {linesep => ''} ) ) }
1614                                 $self->tradition->stemmata;
1615                         join( "\n", @stemstrs );
1616                 };
1617         }
1618
1619         if( $tmeta->has_method('user') ) {
1620                 $graph_attributes{'user'} = sub {
1621                         $self->tradition->user ? $self->tradition->user->id : undef
1622                 };
1623         }
1624
1625     foreach my $datum ( sort keys %graph_attributes ) {
1626         $graph_data_keys{$datum} = 'dg'.$gdi++;
1627         my $key = $root->addNewChild( $graphml_ns, 'key' );
1628         my $dtype = ref( $graph_attributes{$datum} ) ? 'string'
1629                 : $graph_attributes{$datum};
1630         $key->setAttribute( 'attr.name', $datum );
1631         $key->setAttribute( 'attr.type', $dtype );
1632         $key->setAttribute( 'for', 'graph' );
1633         $key->setAttribute( 'id', $graph_data_keys{$datum} );
1634     }
1635
1636     # Add the data keys for reading nodes
1637     my %reading_attributes;
1638     my $rmeta = Text::Tradition::Collation::Reading->meta;
1639     foreach my $attr( $rmeta->get_all_attributes ) {
1640                 next if $attr->name =~ /^_/;
1641                 next unless $save_types{$attr->type_constraint->name};
1642                 $reading_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1643         }
1644         if( $self->start->does('Text::Tradition::Morphology' ) ) {
1645                 # Extra custom key for the reading morphology
1646                 $reading_attributes{'lexemes'} = 'string';
1647         }
1648
1649     my %node_data_keys;
1650     my $ndi = 0;
1651     foreach my $datum ( sort keys %reading_attributes ) {
1652         $node_data_keys{$datum} = 'dn'.$ndi++;
1653         my $key = $root->addNewChild( $graphml_ns, 'key' );
1654         $key->setAttribute( 'attr.name', $datum );
1655         $key->setAttribute( 'attr.type', $reading_attributes{$datum} );
1656         $key->setAttribute( 'for', 'node' );
1657         $key->setAttribute( 'id', $node_data_keys{$datum} );
1658     }
1659
1660     # Add the data keys for edges, that is, paths and relationships. Path
1661     # data does not come from a Moose class so is here manually.
1662     my $edi = 0;
1663     my %edge_data_keys;
1664     my %edge_attributes = (
1665         witness => 'string',                    # ID/label for a path
1666         extra => 'boolean',                             # Path key
1667         );
1668     my @path_attributes = keys %edge_attributes; # track our manual additions
1669     my $pmeta = Text::Tradition::Collation::Relationship->meta;
1670     foreach my $attr( $pmeta->get_all_attributes ) {
1671                 next if $attr->name =~ /^_/;
1672                 next unless $save_types{$attr->type_constraint->name};
1673                 $edge_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1674         }
1675     foreach my $datum ( sort keys %edge_attributes ) {
1676         $edge_data_keys{$datum} = 'de'.$edi++;
1677         my $key = $root->addNewChild( $graphml_ns, 'key' );
1678         $key->setAttribute( 'attr.name', $datum );
1679         $key->setAttribute( 'attr.type', $edge_attributes{$datum} );
1680         $key->setAttribute( 'for', 'edge' );
1681         $key->setAttribute( 'id', $edge_data_keys{$datum} );
1682     }
1683
1684     # Add the collation graph itself. First, sanitize the name to a valid XML ID.
1685     my $xmlidname = $self->tradition->name;
1686     $xmlidname =~ s/(?!$xml10_namechar_rx)./_/g;
1687     if( $xmlidname !~ /^$xml10_namestartchar_rx/ ) {
1688         $xmlidname = '_'.$xmlidname;
1689     }
1690     my $sgraph = $root->addNewChild( $graphml_ns, 'graph' );
1691     $sgraph->setAttribute( 'edgedefault', 'directed' );
1692     $sgraph->setAttribute( 'id', $xmlidname );
1693     $sgraph->setAttribute( 'parse.edgeids', 'canonical' );
1694     $sgraph->setAttribute( 'parse.edges', 0 ); # fill in later
1695     $sgraph->setAttribute( 'parse.nodeids', 'canonical' );
1696     $sgraph->setAttribute( 'parse.nodes', 0 ); # fill in later
1697     $sgraph->setAttribute( 'parse.order', 'nodesfirst' );
1698
1699     # Tradition/collation attribute data
1700     foreach my $datum ( keys %graph_attributes ) {
1701         my $value;
1702         if( $datum eq 'version' ) {
1703                 $value = '3.2';
1704         } elsif( ref( $graph_attributes{$datum} ) ) {
1705                 my $sub = $graph_attributes{$datum};
1706                 $value = &$sub();
1707         } elsif( $gattr_from{$datum} eq 'Tradition' ) {
1708                 $value = $self->tradition->$datum;
1709         } else {
1710                 $value = $self->$datum;
1711         }
1712                 _add_graphml_data( $sgraph, $graph_data_keys{$datum}, $value );
1713         }
1714
1715     my $node_ctr = 0;
1716     my %node_hash;
1717     # Add our readings to the graph
1718     foreach my $n ( sort { $a->id cmp $b->id } $self->readings ) {
1719         next if $n->has_rank && $n ne $self->start && $n ne $self->end &&
1720                 ( $n->rank < $start->rank || $n->rank > $end->rank );
1721         $use_readings{$n->id} = 1;
1722         # Add to the main graph
1723         my $node_el = $sgraph->addNewChild( $graphml_ns, 'node' );
1724         my $node_xmlid = 'n' . $node_ctr++;
1725         $node_hash{ $n->id } = $node_xmlid;
1726         $node_el->setAttribute( 'id', $node_xmlid );
1727         foreach my $d ( keys %reading_attributes ) {
1728                 my $nval = $n->$d;
1729                 # Custom serialization
1730                 if( $d eq 'lexemes' ) {
1731                                 # If nval is a true value, we have lexemes so we need to
1732                                 # serialize them. Otherwise set nval to undef so that the
1733                                 # key is excluded from this reading.
1734                         $nval = $nval ? $n->_serialize_lexemes : undef;
1735                 } elsif( $d eq 'normal_form' && $n->normal_form eq $n->text ) {
1736                         $nval = undef;
1737                 }
1738                 if( $rankoffset && $d eq 'rank' && $n ne $self->start ) {
1739                         # Adjust the ranks within the subgraph.
1740                         $nval = $n eq $self->end ? $end->rank - $rankoffset + 1
1741                                 : $nval - $rankoffset;
1742                 }
1743                 _add_graphml_data( $node_el, $node_data_keys{$d}, $nval )
1744                         if defined $nval;
1745         }
1746     }
1747
1748     # Add the path edges to the sequence graph
1749     my $edge_ctr = 0;
1750     foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->sequence->edges() ) {
1751         # We add an edge in the graphml for every witness in $e.
1752         next unless( $use_readings{$e->[0]} || $use_readings{$e->[1]} );
1753         my @edge_wits = sort $self->path_witnesses( $e );
1754         $e->[0] = $self->start->id unless $use_readings{$e->[0]};
1755         $e->[1] = $self->end->id unless $use_readings{$e->[1]};
1756         # Skip any path from start to end; that witness is not in the subgraph.
1757         next if ( $e->[0] eq $self->start->id && $e->[1] eq $self->end->id );
1758         foreach my $wit ( @edge_wits ) {
1759                         my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
1760                                                                                 $node_hash{ $e->[0] },
1761                                                                                 $node_hash{ $e->[1] } );
1762                         my $edge_el = $sgraph->addNewChild( $graphml_ns, 'edge' );
1763                         $edge_el->setAttribute( 'source', $from );
1764                         $edge_el->setAttribute( 'target', $to );
1765                         $edge_el->setAttribute( 'id', $id );
1766
1767                         # It's a witness path, so add the witness
1768                         my $base = $wit;
1769                         my $key = $edge_data_keys{'witness'};
1770                         # Is this an ante-corr witness?
1771                         my $aclabel = $self->ac_label;
1772                         if( $wit =~ /^(.*)\Q$aclabel\E$/ ) {
1773                                 # Keep the base witness
1774                                 $base = $1;
1775                                 # ...and record that this is an 'extra' reading path
1776                                 _add_graphml_data( $edge_el, $edge_data_keys{'extra'}, $aclabel );
1777                         }
1778                         _add_graphml_data( $edge_el, $edge_data_keys{'witness'}, $base );
1779                 }
1780         }
1781
1782         # Report the actual number of nodes and edges that went in
1783         $sgraph->setAttribute( 'parse.edges', $edge_ctr );
1784         $sgraph->setAttribute( 'parse.nodes', $node_ctr );
1785
1786         # Add the relationship graph to the XML
1787         map { delete $edge_data_keys{$_} } @path_attributes;
1788         $self->relations->_as_graphml( $graphml_ns, $root, \%node_hash,
1789                 $node_data_keys{'id'}, \%edge_data_keys );
1790
1791     # Save and return the thing
1792     my $result = decode_utf8( $graphml->toString(1) );
1793     return $result;
1794 }
1795
1796 sub _add_graphml_data {
1797     my( $el, $key, $value ) = @_;
1798     return unless defined $value;
1799     my $data_el = $el->addNewChild( $el->namespaceURI, 'data' );
1800     $data_el->setAttribute( 'key', $key );
1801     $data_el->appendText( $value );
1802 }
1803
1804 =head2 as_csv
1805
1806 Returns a CSV alignment table representation of the collation graph, one
1807 row per witness (or witness uncorrected.)
1808
1809 =head2 as_tsv
1810
1811 Returns a tab-separated alignment table representation of the collation graph,
1812 one row per witness (or witness uncorrected.)
1813
1814 =begin testing
1815
1816 use Text::Tradition;
1817 use Text::CSV;
1818
1819 my $READINGS = 311;
1820 my $PATHS = 361;
1821 my $WITS = 13;
1822 my $WITAC = 4;
1823
1824 my $datafile = 't/data/florilegium_tei_ps.xml';
1825 my $tradition = Text::Tradition->new( 'input' => 'TEI',
1826                                       'name' => 'test0',
1827                                       'file' => $datafile,
1828                                       'linear' => 1 );
1829
1830 my $c = $tradition->collation;
1831 # Export the thing to CSV
1832 my $csvstr = $c->as_csv();
1833 # Count the columns
1834 my $csv = Text::CSV->new({ sep_char => ',', binary => 1 });
1835 my @lines = split(/\n/, $csvstr );
1836 ok( $csv->parse( $lines[0] ), "Successfully parsed first line of CSV" );
1837 is( scalar( $csv->fields ), $WITS + $WITAC, "CSV has correct number of witness columns" );
1838 my @q_ac = grep { $_ eq 'Q'.$c->ac_label } $csv->fields;
1839 ok( @q_ac, "Found a layered witness" );
1840
1841 my $t2 = Text::Tradition->new( input => 'Tabular',
1842                                                            name => 'test2',
1843                                                            string => $csvstr,
1844                                                            sep_char => ',' );
1845 is( scalar $t2->collation->readings, $READINGS, "Reparsed CSV collation has all readings" );
1846 is( scalar $t2->collation->paths, $PATHS, "Reparsed CSV collation has all paths" );
1847
1848 # Now do it with TSV
1849 my $tsvstr = $c->as_tsv();
1850 my $t3 = Text::Tradition->new( input => 'Tabular',
1851                                                            name => 'test3',
1852                                                            string => $tsvstr,
1853                                                            sep_char => "\t" );
1854 is( scalar $t3->collation->readings, $READINGS, "Reparsed TSV collation has all readings" );
1855 is( scalar $t3->collation->paths, $PATHS, "Reparsed TSV collation has all paths" );
1856
1857 my $table = $c->alignment_table;
1858 my $noaccsv = $c->as_csv({ noac => 1 });
1859 my @noaclines = split(/\n/, $noaccsv );
1860 ok( $csv->parse( $noaclines[0] ), "Successfully parsed first line of no-ac CSV" );
1861 is( scalar( $csv->fields ), $WITS, "CSV has correct number of witness columns" );
1862 is( $c->alignment_table, $table, "Request for CSV did not alter the alignment table" );
1863
1864 my $safecsv = $c->as_csv({ safe_ac => 1});
1865 my @safelines = split(/\n/, $safecsv );
1866 ok( $csv->parse( $safelines[0] ), "Successfully parsed first line of safe CSV" );
1867 is( scalar( $csv->fields ), $WITS + $WITAC, "CSV has correct number of witness columns" );
1868 @q_ac = grep { $_ eq 'Q__L' } $csv->fields;
1869 ok( @q_ac, "Found a sanitized layered witness" );
1870 is( $c->alignment_table, $table, "Request for CSV did not alter the alignment table" );
1871
1872 # Test relationship collapse
1873 $c->add_relationship( $c->readings_at_rank( 37 ), { type => 'spelling' } );
1874 $c->add_relationship( $c->readings_at_rank( 60 ), { type => 'spelling' } );
1875
1876 my $mergedtsv = $c->as_tsv({mergetypes => [ 'spelling', 'orthographic' ] });
1877 my $t4 = Text::Tradition->new( input => 'Tabular',
1878                                                            name => 'test4',
1879                                                            string => $mergedtsv,
1880                                                            sep_char => "\t" );
1881 is( scalar $t4->collation->readings, $READINGS - 2, "Reparsed TSV merge collation has fewer readings" );
1882 is( scalar $t4->collation->paths, $PATHS - 4, "Reparsed TSV merge collation has fewer paths" );
1883
1884 # Test non-ASCII sigla
1885 my $t5 = Text::Tradition->new( input => 'Tabular',
1886                                                            name => 'nonascii',
1887                                                            file => 't/data/armexample.xlsx',
1888                                                            excel => 'xlsx' );
1889 my $awittsv = $t5->collation->as_tsv({ noac => 1, ascii => 1 });
1890 my @awitlines = split( /\n/, $awittsv );
1891 like( $awitlines[0], qr/_A_5315622/, "Found ASCII sigil variant in TSV" );
1892
1893 =end testing
1894
1895 =cut
1896
1897 sub _tabular {
1898     my( $self, $opts ) = @_;
1899     my $table = $self->alignment_table( $opts );
1900         my $csv_options = { binary => 1, quote_null => 0 };
1901         $csv_options->{'sep_char'} = $opts->{fieldsep};
1902         if( $opts->{fieldsep} eq "\t" ) {
1903                 # If it is really tab separated, nothing is an escape char.
1904                 $csv_options->{'quote_char'} = undef;
1905                 $csv_options->{'escape_char'} = '';
1906         }
1907     my $csv = Text::CSV->new( $csv_options );
1908     my @result;
1909
1910     # Make the header row
1911     my @witnesses = map { $_->{'witness'} } @{$table->{'alignment'}};
1912     if( $opts->{ascii} ) {
1913         # TODO think of a fix for this
1914         throw( "Cannot currently produce ASCII sigla with witness layers" )
1915                 unless $opts->{noac};
1916         my @awits = map { $self->tradition->witness( $_ )->ascii_sigil } @witnesses;
1917         @witnesses = @awits;
1918     }
1919     $csv->combine( @witnesses );
1920         push( @result, $csv->string );
1921
1922     # Make the rest of the rows
1923     foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
1924         my @rowobjs = map { $_->{'tokens'}->[$idx] } @{$table->{'alignment'}};
1925         my @row = map { $_ ? $_->{'t'}->text : $_ } @rowobjs;
1926         # Quick and dirty collapse of requested relationship types
1927         if( ref( $opts->{mergetypes} ) eq 'ARRAY' ) {
1928                 # Now substitute the reading in the relevant index of @row
1929                 # for its merge-related reading
1930                 my %substitutes;
1931                 while( @rowobjs ) {
1932                         my $thisr = shift @rowobjs;
1933                         next unless $thisr;
1934                                 next if exists $substitutes{$thisr->{t}->text};
1935                                 # Make sure we don't have A <-> B substitutions.
1936                                 $substitutes{$thisr->{t}->text} = $thisr->{t}->text;
1937                         foreach my $thatr ( @rowobjs ) {
1938                                 next unless $thatr;
1939                                 next if exists $substitutes{$thatr->{t}->text};
1940                                 my $ttrel = $self->get_relationship( $thisr->{t}, $thatr->{t} );
1941                                 next unless $ttrel;
1942                                 next unless grep { $ttrel->type eq $_ } @{$opts->{mergetypes}};
1943                                 # If we have got this far then we need to merge them.
1944                                 $substitutes{$thatr->{t}->text} = $thisr->{t}->text;
1945                         }
1946                 }
1947                 @row = map { $_ && exists $substitutes{$_} ? $substitutes{$_} : $_ } @row;
1948         }
1949         $csv->combine( @row );
1950         push( @result, $csv->string );
1951     }
1952     return join( "\n", @result );
1953 }
1954
1955 sub as_csv {
1956         my $self = shift;
1957         my $opts = shift || {};
1958         $opts->{fieldsep} = ',';
1959         return $self->_tabular( $opts );
1960 }
1961
1962 sub as_tsv {
1963         my $self = shift;
1964         my $opts = shift || {};
1965         $opts->{fieldsep} = "\t";
1966         return $self->_tabular( $opts );
1967 }
1968
1969 =head2 alignment_table
1970
1971 Return a reference to an alignment table, in a slightly enhanced CollateX
1972 format which looks like this:
1973
1974  $table = { alignment => [ { witness => "SIGIL",
1975                              tokens => [ { t => "TEXT" }, ... ] },
1976                            { witness => "SIG2",
1977                              tokens => [ { t => "TEXT" }, ... ] },
1978                            ... ],
1979             length => TEXTLEN };
1980
1981 =cut
1982
1983 sub alignment_table {
1984     my( $self, $opts ) = @_;
1985     if( $self->has_cached_table ) {
1986                 return $self->cached_table
1987                         unless $opts->{noac} || $opts->{safe_ac};
1988     }
1989
1990     # Make sure we can do this
1991         throw( "Need a linear graph in order to make an alignment table" )
1992                 unless $self->linear;
1993     $self->calculate_ranks()
1994         unless $self->_graphcalc_done && $self->end->has_rank;
1995
1996     my $table = { 'alignment' => [], 'length' => $self->end->rank - 1 };
1997     my @all_pos = ( 1 .. $self->end->rank - 1 );
1998     foreach my $wit ( sort { $a->sigil cmp $b->sigil } $self->tradition->witnesses ) {
1999         # say STDERR "Making witness row(s) for " . $wit->sigil;
2000         my @wit_path = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
2001         my @row = _make_witness_row( \@wit_path, \@all_pos );
2002         my $witobj = { 'witness' => $wit->sigil, 'tokens' => \@row };
2003         $witobj->{'identifier'} = $wit->identifier if $wit->identifier;
2004         push( @{$table->{'alignment'}}, $witobj );
2005         if( $wit->is_layered && !$opts->{noac} ) {
2006                 my @wit_ac_path = $self->reading_sequence( $self->start, $self->end,
2007                         $wit->sigil.$self->ac_label );
2008             my @ac_row = _make_witness_row( \@wit_ac_path, \@all_pos );
2009             my $witlabel = $opts->{safe_ac}
2010                 ? $wit->sigil . '__L' : $wit->sigil.$self->ac_label;
2011             my $witacobj = { 'witness' => $witlabel,
2012                 'tokens' => \@ac_row };
2013             $witacobj->{'identifier'} = $wit->identifier if $wit->identifier;
2014                         push( @{$table->{'alignment'}}, $witacobj );
2015         }
2016     }
2017     unless( $opts->{noac} || $opts->{safe_ac} ) {
2018             $self->cached_table( $table );
2019         }
2020     return $table;
2021 }
2022
2023 sub _make_witness_row {
2024     my( $path, $positions ) = @_;
2025     my %char_hash;
2026     map { $char_hash{$_} = undef } @$positions;
2027     my $debug = 0;
2028     foreach my $rdg ( @$path ) {
2029         say STDERR "rank " . $rdg->rank if $debug;
2030         # say STDERR "No rank for " . $rdg->id unless defined $rdg->rank;
2031         $char_hash{$rdg->rank} = { 't' => $rdg };
2032     }
2033     my @row = map { $char_hash{$_} } @$positions;
2034     # Fill in lacuna markers for undef spots in the row
2035     my $last_el = shift @row;
2036     my @filled_row = ( $last_el );
2037     foreach my $el ( @row ) {
2038         # If we are using node reference, make the lacuna node appear many times
2039         # in the table.  If not, use the lacuna tag.
2040         if( $last_el && $last_el->{'t'}->is_lacuna && !defined $el ) {
2041             $el = $last_el;
2042         }
2043         push( @filled_row, $el );
2044         $last_el = $el;
2045     }
2046     return @filled_row;
2047 }
2048
2049
2050 =head1 NAVIGATION METHODS
2051
2052 =head2 reading_sequence( $first, $last, $sigil, $backup )
2053
2054 Returns the ordered list of readings, starting with $first and ending
2055 with $last, for the witness given in $sigil. If a $backup sigil is
2056 specified (e.g. when walking a layered witness), it will be used wherever
2057 no $sigil path exists.  If there is a base text reading, that will be
2058 used wherever no path exists for $sigil or $backup.
2059
2060 =cut
2061
2062 # TODO Think about returning some lazy-eval iterator.
2063 # TODO Get rid of backup; we should know from what witness is whether we need it.
2064
2065 sub reading_sequence {
2066     my( $self, $start, $end, $witness ) = @_;
2067
2068     $witness = $self->baselabel unless $witness;
2069     my @readings = ( $start );
2070     my %seen;
2071     my $n = $start;
2072     while( $n && $n->id ne $end->id ) {
2073         if( exists( $seen{$n->id} ) ) {
2074             throw( "Detected loop for $witness at " . $n->id );
2075         }
2076         $seen{$n->id} = 1;
2077
2078         my $next = $self->next_reading( $n, $witness );
2079         unless( $next ) {
2080             throw( "Did not find any path for $witness from reading " . $n->id );
2081         }
2082         push( @readings, $next );
2083         $n = $next;
2084     }
2085     # Check that the last reading is our end reading.
2086     my $last = $readings[$#readings];
2087     throw( "Last reading found from " . $start->text .
2088         " for witness $witness is not the end!" ) # TODO do we get this far?
2089         unless $last->id eq $end->id;
2090
2091     return @readings;
2092 }
2093
2094 =head2 readings_at_rank( $rank )
2095
2096 Returns a list of readings at a given rank, taken from the alignment table.
2097
2098 =cut
2099
2100 sub readings_at_rank {
2101         my( $self, $rank, $nolacuna ) = @_;
2102         my $table = $self->alignment_table;
2103         # Table rank is real rank - 1.
2104         my @elements = map { $_->{'tokens'}->[$rank-1] } @{$table->{'alignment'}};
2105         my %readings;
2106         foreach my $e ( @elements ) {
2107                 next unless ref( $e ) eq 'HASH';
2108                 next unless exists $e->{'t'};
2109                 my $rdg = $e->{'t'};
2110                 next if $nolacuna && $rdg->is_lacuna && $rdg->rank ne $rank;
2111                 $readings{$e->{'t'}->id} = $e->{'t'};
2112         }
2113         return values %readings;
2114 }
2115
2116 =head2 next_reading( $reading, $sigil );
2117
2118 Returns the reading that follows the given reading along the given witness
2119 path.
2120
2121 =cut
2122
2123 sub next_reading {
2124     # Return the successor via the corresponding path.
2125     my $self = shift;
2126     my $answer = $self->_find_linked_reading( 'next', @_ );
2127         return undef unless $answer;
2128     return $self->reading( $answer );
2129 }
2130
2131 =head2 prior_reading( $reading, $sigil )
2132
2133 Returns the reading that precedes the given reading along the given witness
2134 path.
2135
2136 =cut
2137
2138 sub prior_reading {
2139     # Return the predecessor via the corresponding path.
2140     my $self = shift;
2141     my $answer = $self->_find_linked_reading( 'prior', @_ );
2142     return $self->reading( $answer );
2143 }
2144
2145 sub _find_linked_reading {
2146     my( $self, $direction, $node, $path ) = @_;
2147
2148     # Get a backup if we are dealing with a layered witness
2149     my $alt_path;
2150     my $aclabel = $self->ac_label;
2151     if( $path && $path =~ /^(.*)\Q$aclabel\E$/ ) {
2152         $alt_path = $1;
2153     }
2154
2155     my @linked_paths = $direction eq 'next'
2156         ? $self->sequence->edges_from( $node )
2157         : $self->sequence->edges_to( $node );
2158     return undef unless scalar( @linked_paths );
2159
2160     # We have to find the linked path that contains all of the
2161     # witnesses supplied in $path.
2162     my( @path_wits, @alt_path_wits );
2163     @path_wits = sort( $self->_witnesses_of_label( $path ) ) if $path;
2164     @alt_path_wits = sort( $self->_witnesses_of_label( $alt_path ) ) if $alt_path;
2165     my $base_le;
2166     my $alt_le;
2167     foreach my $le ( @linked_paths ) {
2168         if( $self->sequence->has_edge_attribute( @$le, $self->baselabel ) ) {
2169             $base_le = $le;
2170         }
2171                 my @le_wits = sort $self->path_witnesses( $le );
2172                 if( _is_within( \@path_wits, \@le_wits ) ) {
2173                         # This is the right path.
2174                         return $direction eq 'next' ? $le->[1] : $le->[0];
2175                 } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) {
2176                         $alt_le = $le;
2177                 }
2178     }
2179     # Got this far? Return the alternate path if it exists.
2180     return $direction eq 'next' ? $alt_le->[1] : $alt_le->[0]
2181         if $alt_le;
2182
2183     # Got this far? Return the base path if it exists.
2184     return $direction eq 'next' ? $base_le->[1] : $base_le->[0]
2185         if $base_le;
2186
2187     # Got this far? We have no appropriate path.
2188     warn "Could not find $direction node from " . $node->id
2189         . " along path $path";
2190     return undef;
2191 }
2192
2193 # Some set logic.
2194 sub _is_within {
2195     my( $set1, $set2 ) = @_;
2196     my $ret = @$set1; # will be 0, i.e. false, if set1 is empty
2197     foreach my $el ( @$set1 ) {
2198         $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
2199     }
2200     return $ret;
2201 }
2202
2203 # Return the string that joins together a list of witnesses for
2204 # display on a single path.
2205 sub _witnesses_of_label {
2206     my( $self, $label ) = @_;
2207     my $regex = $self->wit_list_separator;
2208     my @answer = split( /\Q$regex\E/, $label );
2209     return @answer;
2210 }
2211
2212 =head2 common_readings
2213
2214 Returns the list of common readings in the graph (i.e. those readings that are
2215 shared by all non-lacunose witnesses.)
2216
2217 =cut
2218
2219 sub common_readings {
2220         my $self = shift;
2221         my @common = grep { $_->is_common } $self->readings;
2222         return @common;
2223 }
2224
2225 =head2 path_text( $sigil, [, $start, $end ] )
2226
2227 Returns the text of a witness (plus its backup, if we are using a layer)
2228 as stored in the collation.  The text is returned as a string, where the
2229 individual readings are joined with spaces and the meta-readings (e.g.
2230 lacunae) are omitted.  Optional specification of $start and $end allows
2231 the generation of a subset of the witness text.
2232
2233 =cut
2234
2235 sub path_text {
2236         my( $self, $wit, $start, $end ) = @_;
2237         $start = $self->start unless $start;
2238         $end = $self->end unless $end;
2239         my @path = grep { !$_->is_meta } $self->reading_sequence( $start, $end, $wit );
2240         my $pathtext = '';
2241         my $last;
2242         foreach my $r ( @path ) {
2243                 unless ( $r->join_prior || !$last || $last->join_next ) {
2244                         $pathtext .= ' ';
2245                 }
2246                 $pathtext .= $r->text;
2247                 $last = $r;
2248         }
2249         return $pathtext;
2250 }
2251
2252 =head1 INITIALIZATION METHODS
2253
2254 These are mostly for use by parsers.
2255
2256 =head2 make_witness_path( $witness )
2257
2258 Link the array of readings contained in $witness->path (and in
2259 $witness->uncorrected_path if it exists) into collation paths.
2260 Clear out the arrays when finished.
2261
2262 =head2 make_witness_paths
2263
2264 Call make_witness_path for all witnesses in the tradition.
2265
2266 =cut
2267
2268 # For use when a collation is constructed from a base text and an apparatus.
2269 # We have the sequences of readings and just need to add path edges.
2270 # When we are done, clear out the witness path attributes, as they are no
2271 # longer needed.
2272 # TODO Find a way to replace the witness path attributes with encapsulated functions?
2273
2274 sub make_witness_paths {
2275     my( $self ) = @_;
2276     foreach my $wit ( $self->tradition->witnesses ) {
2277         # say STDERR "Making path for " . $wit->sigil;
2278         $self->make_witness_path( $wit );
2279     }
2280 }
2281
2282 sub make_witness_path {
2283     my( $self, $wit ) = @_;
2284     my @chain = @{$wit->path};
2285     my $sig = $wit->sigil;
2286     # Add start and end if necessary
2287     unshift( @chain, $self->start ) unless $chain[0] eq $self->start;
2288     push( @chain, $self->end ) unless $chain[-1] eq $self->end;
2289     foreach my $idx ( 0 .. $#chain-1 ) {
2290         $self->add_path( $chain[$idx], $chain[$idx+1], $sig );
2291     }
2292     if( $wit->is_layered ) {
2293         @chain = @{$wit->uncorrected_path};
2294                 unshift( @chain, $self->start ) unless $chain[0] eq $self->start;
2295                 push( @chain, $self->end ) unless $chain[-1] eq $self->end;
2296         foreach my $idx( 0 .. $#chain-1 ) {
2297             my $source = $chain[$idx];
2298             my $target = $chain[$idx+1];
2299             $self->add_path( $source, $target, $sig.$self->ac_label )
2300                 unless $self->has_path( $source, $target, $sig );
2301         }
2302     }
2303     $wit->clear_path;
2304     $wit->clear_uncorrected_path;
2305 }
2306
2307 =head2 calculate_ranks
2308
2309 Calculate the reading ranks (that is, their aligned positions relative
2310 to each other) for the graph.  This can only be called on linear collations.
2311
2312 =begin testing
2313
2314 use Text::Tradition;
2315
2316 my $cxfile = 't/data/Collatex-16.xml';
2317 my $t = Text::Tradition->new(
2318     'name'  => 'inline',
2319     'input' => 'CollateX',
2320     'file'  => $cxfile,
2321     );
2322 my $c = $t->collation;
2323
2324 # Make an svg
2325 my $table = $c->alignment_table;
2326 ok( $c->has_cached_table, "Alignment table was cached" );
2327 is( $c->alignment_table, $table, "Cached table returned upon second call" );
2328 $c->calculate_ranks;
2329 is( $c->alignment_table, $table, "Cached table retained with no rank change" );
2330 $c->add_relationship( 'n13', 'n23', { type => 'repetition' } );
2331 is( $c->alignment_table, $table, "Alignment table unchanged after non-colo relationship add" );
2332 $c->add_relationship( 'n24', 'n23', { type => 'spelling' } );
2333 isnt( $c->alignment_table, $table, "Alignment table changed after colo relationship add" );
2334
2335 =end testing
2336
2337 =cut
2338
2339 sub calculate_ranks {
2340     my $self = shift;
2341     # Save the existing ranks, in case we need to invalidate the cached SVG.
2342     throw( "Cannot calculate ranks on a non-linear graph" )
2343         unless $self->linear;
2344     my %existing_ranks;
2345     map { $existing_ranks{$_} = $_->rank } $self->readings;
2346
2347     # Do the rankings based on the relationship equivalence graph, starting
2348     # with the start node.
2349     my ( $node_ranks, $rank_nodes ) = $self->relations->equivalence_ranks();
2350
2351     # Transfer our rankings from the topological graph to the real one.
2352     foreach my $r ( $self->readings ) {
2353         if( defined $node_ranks->{$self->equivalence( $r->id )} ) {
2354             $r->rank( $node_ranks->{$self->equivalence( $r->id )} );
2355         } else {
2356                 # Die. Find the last rank we calculated.
2357                 my @all_defined = sort { ( $node_ranks->{$self->equivalence( $a->id )}||-1 )
2358                                  <=> ( $node_ranks->{$self->equivalence( $b->id )}||-1 ) }
2359                         $self->readings;
2360                 my $last = pop @all_defined;
2361             throw( "Ranks not calculated after $last - do you have a cycle in the graph?" );
2362         }
2363     }
2364     # Do we need to invalidate the cached data?
2365     if( $self->has_cached_table ) {
2366         foreach my $r ( $self->readings ) {
2367                 next if defined( $existing_ranks{$r} )
2368                         && $existing_ranks{$r} == $r->rank;
2369                 # Something has changed, so clear the cache
2370                 $self->_clear_cache;
2371                         # ...and recalculate the common readings.
2372                         $self->calculate_common_readings();
2373                 last;
2374         }
2375     }
2376         # The graph calculation information is now up to date.
2377         $self->_graphcalc_done(1);
2378 }
2379
2380 sub _clear_cache {
2381         my $self = shift;
2382         $self->wipe_table if $self->has_cached_table;
2383 }
2384
2385
2386 =head2 flatten_ranks
2387
2388 A convenience method for parsing collation data.  Searches the graph for readings
2389 with the same text at the same rank, and merges any that are found.
2390
2391 =cut
2392
2393 sub flatten_ranks {
2394     my ( $self, %args ) = shift;
2395     my %unique_rank_rdg;
2396     my $changed;
2397     foreach my $p ( $self->identical_readings( %args ) ) {
2398                 # say STDERR "Combining readings at same rank: @$p";
2399                 $changed = 1;
2400                 $self->merge_readings( @$p );
2401                 # TODO see if this now makes a common point.
2402     }
2403     # If we merged readings, the ranks are still fine but the alignment
2404     # table is wrong. Wipe it.
2405     $self->wipe_table() if $changed;
2406 }
2407
2408 =head2 identical_readings
2409 =head2 identical_readings( start => $startnode, end => $endnode )
2410 =head2 identical_readings( startrank => $startrank, endrank => $endrank )
2411
2412 Goes through the graph identifying all pairs of readings that appear to be
2413 identical, and therefore able to be merged into a single reading. Returns the
2414 relevant identical pairs. Can be restricted to run over only a part of the
2415 graph, specified either by node or by rank.
2416
2417 =cut
2418
2419 sub identical_readings {
2420         my ( $self, %args ) = @_;
2421     # Find where we should start and end.
2422     my $startrank = $args{startrank} || 0;
2423     if( $args{start} ) {
2424         throw( "Starting reading has no rank" ) unless $self->reading( $args{start} )
2425                 && $self->reading( $args{start} )->has_rank;
2426         $startrank = $self->reading( $args{start} )->rank;
2427     }
2428     my $endrank = $args{endrank} || $self->end->rank;
2429     if( $args{end} ) {
2430         throw( "Ending reading has no rank" ) unless $self->reading( $args{end} )
2431                 && $self->reading( $args{end} )->has_rank;
2432         $endrank = $self->reading( $args{end} )->rank;
2433     }
2434
2435     # Make sure the ranks are correct.
2436     unless( $self->_graphcalc_done ) {
2437         $self->calculate_ranks;
2438     }
2439     # Go through the readings looking for duplicates.
2440     my %unique_rank_rdg;
2441     my @pairs;
2442     foreach my $rdg ( $self->readings ) {
2443         next unless $rdg->has_rank;
2444         my $rk = $rdg->rank;
2445         next if $rk > $endrank || $rk < $startrank;
2446         my $key = $rk . "||" . $rdg->text;
2447         if( exists $unique_rank_rdg{$key} ) {
2448                 # Make sure they don't have different grammatical forms
2449                         my $ur = $unique_rank_rdg{$key};
2450                 if( $rdg->is_identical( $ur ) ) {
2451                                 push( @pairs, [ $ur, $rdg ] );
2452                         }
2453         } else {
2454             $unique_rank_rdg{$key} = $rdg;
2455         }
2456     }
2457
2458     return @pairs;
2459 }
2460
2461
2462 =head2 calculate_common_readings
2463
2464 Goes through the graph identifying the readings that appear in every witness
2465 (apart from those with lacunae at that spot.) Marks them as common and returns
2466 the list.
2467
2468 =begin testing
2469
2470 use Text::Tradition;
2471
2472 my $cxfile = 't/data/Collatex-16.xml';
2473 my $t = Text::Tradition->new(
2474     'name'  => 'inline',
2475     'input' => 'CollateX',
2476     'file'  => $cxfile,
2477     );
2478 my $c = $t->collation;
2479
2480 my @common = $c->calculate_common_readings();
2481 is( scalar @common, 8, "Found correct number of common readings" );
2482 my @marked = sort $c->common_readings();
2483 is( scalar @common, 8, "All common readings got marked as such" );
2484 my @expected = qw/ n1 n11 n16 n19 n20 n5 n6 n7 /;
2485 is_deeply( \@marked, \@expected, "Found correct list of common readings" );
2486
2487 =end testing
2488
2489 =cut
2490
2491 sub calculate_common_readings {
2492         my $self = shift;
2493         my @common;
2494         map { $_->is_common( 0 ) } $self->readings;
2495         # Implicitly calls calculate_ranks
2496         my $table = $self->alignment_table;
2497         foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
2498                 my @row = map { $_->{'tokens'}->[$idx]
2499                                                         ? $_->{'tokens'}->[$idx]->{'t'} : '' }
2500                                         @{$table->{'alignment'}};
2501                 my %hash;
2502                 foreach my $r ( @row ) {
2503                         if( $r ) {
2504                                 $hash{$r->id} = $r unless $r->is_meta;
2505                         } else {
2506                                 $hash{'UNDEF'} = $r;
2507                         }
2508                 }
2509                 if( keys %hash == 1 && !exists $hash{'UNDEF'} ) {
2510                         my( $r ) = values %hash;
2511                         $r->is_common( 1 );
2512                         push( @common, $r );
2513                 }
2514         }
2515         return @common;
2516 }
2517
2518 =head2 text_from_paths
2519
2520 Calculate the text array for all witnesses from the path, for later consistency
2521 checking.  Only to be used if there is no non-graph-based way to know the
2522 original texts.
2523
2524 =cut
2525
2526 sub text_from_paths {
2527         my $self = shift;
2528     foreach my $wit ( $self->tradition->witnesses ) {
2529         my @readings = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
2530         my @text;
2531         foreach my $r ( @readings ) {
2532                 next if $r->is_meta;
2533                 push( @text, $r->text );
2534         }
2535         $wit->text( \@text );
2536         if( $wit->is_layered ) {
2537                         my @ucrdgs = $self->reading_sequence( $self->start, $self->end,
2538                                                                                                   $wit->sigil.$self->ac_label );
2539                         my @uctext;
2540                         foreach my $r ( @ucrdgs ) {
2541                                 next if $r->is_meta;
2542                                 push( @uctext, $r->text );
2543                         }
2544                         $wit->layertext( \@uctext );
2545         }
2546     }
2547 }
2548
2549 =head1 UTILITY FUNCTIONS
2550
2551 =head2 common_predecessor( $reading_a, $reading_b )
2552
2553 Find the last reading that occurs in sequence before both the given readings.
2554 At the very least this should be $self->start.
2555
2556 =head2 common_successor( $reading_a, $reading_b )
2557
2558 Find the first reading that occurs in sequence after both the given readings.
2559 At the very least this should be $self->end.
2560
2561 =begin testing
2562
2563 use Text::Tradition;
2564
2565 my $cxfile = 't/data/Collatex-16.xml';
2566 my $t = Text::Tradition->new(
2567     'name'  => 'inline',
2568     'input' => 'CollateX',
2569     'file'  => $cxfile,
2570     );
2571 my $c = $t->collation;
2572
2573 is( $c->common_predecessor( 'n24', 'n23' )->id,
2574     'n20', "Found correct common predecessor" );
2575 is( $c->common_successor( 'n24', 'n23' )->id,
2576     '__END__', "Found correct common successor" );
2577
2578 is( $c->common_predecessor( 'n19', 'n17' )->id,
2579     'n16', "Found correct common predecessor for readings on same path" );
2580 is( $c->common_successor( 'n21', 'n10' )->id,
2581     '__END__', "Found correct common successor for readings on same path" );
2582
2583 =end testing
2584
2585 =cut
2586
2587 ## Return the closest reading that is a predecessor of both the given readings.
2588 sub common_predecessor {
2589         my $self = shift;
2590         my( $r1, $r2 ) = $self->_objectify_args( @_ );
2591         return $self->_common_in_path( $r1, $r2, 'predecessors' );
2592 }
2593
2594 sub common_successor {
2595         my $self = shift;
2596         my( $r1, $r2 ) = $self->_objectify_args( @_ );
2597         return $self->_common_in_path( $r1, $r2, 'successors' );
2598 }
2599
2600
2601 # TODO think about how to do this without ranks...
2602 sub _common_in_path {
2603         my( $self, $r1, $r2, $dir ) = @_;
2604         my $iter = $self->end->rank;
2605         my @candidates;
2606         my @last_r1 = ( $r1 );
2607         my @last_r2 = ( $r2 );
2608         # my %all_seen = ( $r1 => 'r1', $r2 => 'r2' );
2609         my %all_seen;
2610         # say STDERR "Finding common $dir for $r1, $r2";
2611         while( !@candidates ) {
2612                 last unless $iter--;  # Avoid looping infinitely
2613                 # Iterate separately down the graph from r1 and r2
2614                 my( @new_lc1, @new_lc2 );
2615                 foreach my $lc ( @last_r1 ) {
2616                         foreach my $p ( $lc->$dir ) {
2617                                 if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r1' ) {
2618                                         # say STDERR "Path candidate $p from $lc";
2619                                         push( @candidates, $p );
2620                                 } elsif( !$all_seen{$p->id} ) {
2621                                         $all_seen{$p->id} = 'r1';
2622                                         push( @new_lc1, $p );
2623                                 }
2624                         }
2625                 }
2626                 foreach my $lc ( @last_r2 ) {
2627                         foreach my $p ( $lc->$dir ) {
2628                                 if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r2' ) {
2629                                         # say STDERR "Path candidate $p from $lc";
2630                                         push( @candidates, $p );
2631                                 } elsif( !$all_seen{$p->id} ) {
2632                                         $all_seen{$p->id} = 'r2';
2633                                         push( @new_lc2, $p );
2634                                 }
2635                         }
2636                 }
2637                 @last_r1 = @new_lc1;
2638                 @last_r2 = @new_lc2;
2639         }
2640         my @answer = sort { $a->rank <=> $b->rank } @candidates;
2641         return $dir eq 'predecessors' ? pop( @answer ) : shift ( @answer );
2642 }
2643
2644 sub throw {
2645         Text::Tradition::Error->throw(
2646                 'ident' => 'Collation error',
2647                 'message' => $_[0],
2648                 );
2649 }
2650
2651 no Moose;
2652 __PACKAGE__->meta->make_immutable;
2653
2654 =head1 BUGS/TODO
2655
2656 =over
2657
2658 =item * Rework XML serialization in a more modular way
2659
2660 =back
2661
2662 =head1 LICENSE
2663
2664 This package is free software and is provided "as is" without express
2665 or implied warranty.  You can redistribute it and/or modify it under
2666 the same terms as Perl itself.
2667
2668 =head1 AUTHOR
2669
2670 Tara L Andrews E<lt>aurum@cpan.orgE<gt>