lib/Text/Tradition/Collation.pm

   1 package Text::Tradition::Collation;
   2
   3 use Encode qw( decode_utf8 );
   4 use File::Temp;
   5 use File::Which;
   6 use Graph;
   7 use IPC::Run qw( run binary );
   8 use Text::CSV;
   9 use Text::Tradition::Collation::Reading;
  10 use Text::Tradition::Collation::RelationshipStore;
  11 use Text::Tradition::Error;
  12 use XML::Easy::Syntax qw( $xml10_namestartchar_rx $xml10_namechar_rx );
  13 use XML::LibXML;
  14 use XML::LibXML::XPathContext;
  15 use Moose;
  16
  17 has 'sequence' => (
  18     is => 'ro',
  19     isa => 'Graph',
  20     default => sub { Graph->new() },
  21     handles => {
  22         paths => 'edges',
  23     },
  24     );
  25
  26 has 'relations' => (
  27         is => 'ro',
  28         isa => 'Text::Tradition::Collation::RelationshipStore',
  29         handles => {
  30                 relationships => 'relationships',
  31                 related_readings => 'related_readings',
  32                 get_relationship => 'get_relationship',
  33                 del_relationship => 'del_relationship',
  34                 equivalence => 'equivalence',
  35                 equivalence_graph => 'equivalence_graph',
  36         },
  37         writer => '_set_relations',
  38         );
  39
  40 has 'tradition' => (
  41     is => 'ro',
  42     isa => 'Text::Tradition',
  43     writer => '_set_tradition',
  44     weak_ref => 1,
  45     );
  46
  47 has 'readings' => (
  48         isa => 'HashRef[Text::Tradition::Collation::Reading]',
  49         traits => ['Hash'],
  50     handles => {
  51         reading     => 'get',
  52         _add_reading => 'set',
  53         del_reading => 'delete',
  54         has_reading => 'exists',
  55         readings   => 'values',
  56     },
  57     default => sub { {} },
  58         );
  59
  60 has 'wit_list_separator' => (
  61     is => 'rw',
  62     isa => 'Str',
  63     default => ', ',
  64     );
  65
  66 has 'baselabel' => (
  67     is => 'rw',
  68     isa => 'Str',
  69     default => 'base text',
  70     );
  71
  72 has 'linear' => (
  73     is => 'rw',
  74     isa => 'Bool',
  75     default => 1,
  76     );
  77
  78 has 'ac_label' => (
  79     is => 'rw',
  80     isa => 'Str',
  81     default => ' (a.c.)',
  82     );
  83
  84 has 'wordsep' => (
  85         is => 'rw',
  86         isa => 'Str',
  87         default => ' ',
  88         );
  89
  90 has 'start' => (
  91         is => 'ro',
  92         isa => 'Text::Tradition::Collation::Reading',
  93         writer => '_set_start',
  94         weak_ref => 1,
  95         );
  96
  97 has 'end' => (
  98         is => 'ro',
  99         isa => 'Text::Tradition::Collation::Reading',
 100         writer => '_set_end',
 101         weak_ref => 1,
 102         );
 103
 104 has 'cached_svg' => (
 105         is => 'rw',
 106         isa => 'Str',
 107         predicate => 'has_cached_svg',
 108         clearer => 'wipe_svg',
 109         );
 110
 111 has 'cached_table' => (
 112         is => 'rw',
 113         isa => 'HashRef',
 114         predicate => 'has_cached_table',
 115         clearer => 'wipe_table',
 116         );
 117
 118 has '_graphcalc_done' => (
 119         is => 'rw',
 120         isa => 'Bool',
 121         default => undef,
 122         );
 123
 124 =head1 NAME
 125
 126 Text::Tradition::Collation - a software model for a text collation
 127
 128 =head1 SYNOPSIS
 129
 130   use Text::Tradition;
 131   my $t = Text::Tradition->new(
 132     'name' => 'this is a text',
 133     'input' => 'TEI',
 134     'file' => '/path/to/tei_parallel_seg_file.xml' );
 135
 136   my $c = $t->collation;
 137   my @readings = $c->readings;
 138   my @paths = $c->paths;
 139   my @relationships = $c->relationships;
 140
 141   my $svg_variant_graph = $t->collation->as_svg();
 142
 143 =head1 DESCRIPTION
 144
 145 Text::Tradition is a library for representation and analysis of collated
 146 texts, particularly medieval ones.  The Collation is the central feature of
 147 a Tradition, where the text, its sequence of readings, and its relationships
 148 between readings are actually kept.
 149
 150 =head1 CONSTRUCTOR
 151
 152 =head2 new
 153
 154 The constructor.  Takes a hash or hashref of the following arguments:
 155
 156 =over
 157
 158 =item * tradition - The Text::Tradition object to which the collation
 159 belongs. Required.
 160
 161 =item * linear - Whether the collation should be linear; that is, whether
 162 transposed readings should be treated as two linked readings rather than one,
 163 and therefore whether the collation graph is acyclic.  Defaults to true.
 164
 165 =item * baselabel - The default label for the path taken by a base text
 166 (if any). Defaults to 'base text'.
 167
 168 =item * wit_list_separator - The string to join a list of witnesses for
 169 purposes of making labels in display graphs.  Defaults to ', '.
 170
 171 =item * ac_label - The extra label to tack onto a witness sigil when
 172 representing another layer of path for the given witness - that is, when
 173 a text has more than one possible reading due to scribal corrections or
 174 the like.  Defaults to ' (a.c.)'.
 175
 176 =item * wordsep - The string used to separate words in the original text.
 177 Defaults to ' '.
 178
 179 =back
 180
 181 =head1 ACCESSORS
 182
 183 =head2 tradition
 184
 185 =head2 linear
 186
 187 =head2 wit_list_separator
 188
 189 =head2 baselabel
 190
 191 =head2 ac_label
 192
 193 =head2 wordsep
 194
 195 Simple accessors for collation attributes.
 196
 197 =head2 start
 198
 199 The meta-reading at the start of every witness path.
 200
 201 =head2 end
 202
 203 The meta-reading at the end of every witness path.
 204
 205 =head2 readings
 206
 207 Returns all Reading objects in the graph.
 208
 209 =head2 reading( $id )
 210
 211 Returns the Reading object corresponding to the given ID.
 212
 213 =head2 add_reading( $reading_args )
 214
 215 Adds a new reading object to the collation.
 216 See L<Text::Tradition::Collation::Reading> for the available arguments.
 217
 218 =head2 del_reading( $object_or_id )
 219
 220 Removes the given reading from the collation, implicitly removing its
 221 paths and relationships.
 222
 223 =head2 merge_readings( $main, $second, $concatenate, $with_str )
 224
 225 Merges the $second reading into the $main one. If $concatenate is true, then
 226 the merged node will carry the text of both readings, concatenated with either
 227 $with_str (if specified) or a sensible default (the empty string if the
 228 appropriate 'join_*' flag is set on either reading, or else $self->wordsep.)
 229
 230 The first two arguments may be either readings or reading IDs.
 231
 232 =head2 has_reading( $id )
 233
 234 Predicate to see whether a given reading ID is in the graph.
 235
 236 =head2 reading_witnesses( $object_or_id )
 237
 238 Returns a list of sigils whose witnesses contain the reading.
 239
 240 =head2 paths
 241
 242 Returns all reading paths within the document - that is, all edges in the
 243 collation graph.  Each path is an arrayref of [ $source, $target ] reading IDs.
 244
 245 =head2 add_path( $source, $target, $sigil )
 246
 247 Links the given readings in the collation in sequence, under the given witness
 248 sigil.  The readings may be specified by object or ID.
 249
 250 =head2 del_path( $source, $target, $sigil )
 251
 252 Links the given readings in the collation in sequence, under the given witness
 253 sigil.  The readings may be specified by object or ID.
 254
 255 =head2 has_path( $source, $target );
 256
 257 Returns true if the two readings are linked in sequence in any witness.
 258 The readings may be specified by object or ID.
 259
 260 =head2 relationships
 261
 262 Returns all Relationship objects in the collation.
 263
 264 =head2 add_relationship( $reading, $other_reading, $options )
 265
 266 Adds a new relationship of the type given in $options between the two readings,
 267 which may be specified by object or ID.  Returns a value of ( $status, @vectors)
 268 where $status is true on success, and @vectors is a list of relationship edges
 269 that were ultimately added.
 270 See L<Text::Tradition::Collation::Relationship> for the available options.
 271
 272 =cut
 273
 274 sub BUILD {
 275     my $self = shift;
 276     $self->_set_relations( Text::Tradition::Collation::RelationshipStore->new( 'collation' => $self ) );
 277     $self->_set_start( $self->add_reading(
 278         { 'collation' => $self, 'is_start' => 1, 'init' => 1 } ) );
 279     $self->_set_end( $self->add_reading(
 280         { 'collation' => $self, 'is_end' => 1, 'init' => 1 } ) );
 281 }
 282
 283 ### Reading construct/destruct functions
 284
 285 sub add_reading {
 286         my( $self, $reading ) = @_;
 287         unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
 288                 my %args = %$reading;
 289                 if( $args{'init'} ) {
 290                         # If we are initializing an empty collation, don't assume that we
 291                         # have set a tradition.
 292                         delete $args{'init'};
 293                 } elsif( $self->tradition->has_language && !exists $args{'language'} ) {
 294                         $args{'language'} = $self->tradition->language;
 295                 }
 296                 $reading = Text::Tradition::Collation::Reading->new(
 297                         'collation' => $self,
 298                         %args );
 299         }
 300         # First check to see if a reading with this ID exists.
 301         if( $self->reading( $reading->id ) ) {
 302                 throw( "Collation already has a reading with id " . $reading->id );
 303         }
 304         $self->_graphcalc_done(0);
 305         $self->_add_reading( $reading->id => $reading );
 306         # Once the reading has been added, put it in both graphs.
 307         $self->sequence->add_vertex( $reading->id );
 308         $self->relations->add_reading( $reading->id );
 309         return $reading;
 310 };
 311
 312 around del_reading => sub {
 313         my $orig = shift;
 314         my $self = shift;
 315         my $arg = shift;
 316
 317         if( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
 318                 $arg = $arg->id;
 319         }
 320         # Remove the reading from the graphs.
 321         $self->_graphcalc_done(0);
 322         $self->_clear_cache; # Explicitly clear caches to GC the reading
 323         $self->sequence->delete_vertex( $arg );
 324         $self->relations->delete_reading( $arg );
 325
 326         # Carry on.
 327         $self->$orig( $arg );
 328 };
 329
 330 =begin testing
 331
 332 use Text::Tradition;
 333
 334 my $cxfile = 't/data/Collatex-16.xml';
 335 my $t = Text::Tradition->new(
 336     'name'  => 'inline',
 337     'input' => 'CollateX',
 338     'file'  => $cxfile,
 339     );
 340 my $c = $t->collation;
 341
 342 my $rno = scalar $c->readings;
 343 # Split n21 for testing purposes
 344 my $new_r = $c->add_reading( { 'id' => 'n21p0', 'text' => 'un', 'join_next' => 1 } );
 345 my $old_r = $c->reading( 'n21' );
 346 $old_r->alter_text( 'to' );
 347 $c->del_path( 'n20', 'n21', 'A' );
 348 $c->add_path( 'n20', 'n21p0', 'A' );
 349 $c->add_path( 'n21p0', 'n21', 'A' );
 350 $c->flatten_ranks();
 351 ok( $c->reading( 'n21p0' ), "New reading exists" );
 352 is( scalar $c->readings, $rno, "Reading add offset by flatten_ranks" );
 353
 354 # Combine n3 and n4 ( with his )
 355 $c->merge_readings( 'n3', 'n4', 1 );
 356 ok( !$c->reading('n4'), "Reading n4 is gone" );
 357 is( $c->reading('n3')->text, 'with his', "Reading n3 has both words" );
 358
 359 # Collapse n9 and n10 ( rood / root )
 360 $c->merge_readings( 'n9', 'n10' );
 361 ok( !$c->reading('n10'), "Reading n10 is gone" );
 362 is( $c->reading('n9')->text, 'rood', "Reading n9 has an unchanged word" );
 363
 364 # Combine n21 and n21p0
 365 my $remaining = $c->reading('n21');
 366 $remaining ||= $c->reading('n22');  # one of these should still exist
 367 $c->merge_readings( 'n21p0', $remaining, 1 );
 368 ok( !$c->reading('n21'), "Reading $remaining is gone" );
 369 is( $c->reading('n21p0')->text, 'unto', "Reading n21p0 merged correctly" );
 370
 371 =end testing
 372
 373 =cut
 374
 375 sub merge_readings {
 376         my $self = shift;
 377
 378         # Sanity check
 379         my( $kept_obj, $del_obj, $combine, $combine_char ) = $self->_objectify_args( @_ );
 380         my $mergemeta = $kept_obj->is_meta;
 381         throw( "Cannot merge meta and non-meta reading" )
 382                 unless ( $mergemeta && $del_obj->is_meta )
 383                         || ( !$mergemeta && !$del_obj->is_meta );
 384         if( $mergemeta ) {
 385                 throw( "Cannot merge with start or end node" )
 386                         if( $kept_obj eq $self->start || $kept_obj eq $self->end
 387                                 || $del_obj eq $self->start || $del_obj eq $self->end );
 388         }
 389         # We only need the IDs for adding paths to the graph, not the reading
 390         # objects themselves.
 391         my $kept = $kept_obj->id;
 392         my $deleted = $del_obj->id;
 393         $self->_graphcalc_done(0);
 394
 395     # The kept reading should inherit the paths and the relationships
 396     # of the deleted reading.
 397         foreach my $path ( $self->sequence->edges_at( $deleted ) ) {
 398                 my @vector = ( $kept );
 399                 push( @vector, $path->[1] ) if $path->[0] eq $deleted;
 400                 unshift( @vector, $path->[0] ) if $path->[1] eq $deleted;
 401                 next if $vector[0] eq $vector[1]; # Don't add a self loop
 402                 my %wits = %{$self->sequence->get_edge_attributes( @$path )};
 403                 $self->sequence->add_edge( @vector );
 404                 my $fwits = $self->sequence->get_edge_attributes( @vector );
 405                 @wits{keys %$fwits} = values %$fwits;
 406                 $self->sequence->set_edge_attributes( @vector, \%wits );
 407         }
 408         $self->relations->merge_readings( $kept, $deleted, $combine );
 409
 410         # Do the deletion deed.
 411         if( $combine ) {
 412                 my $joinstr = $combine_char;
 413                 unless( defined $joinstr ) {
 414                         $joinstr = '' if $kept_obj->join_next || $del_obj->join_prior;
 415                         $joinstr = $self->wordsep unless defined $joinstr;
 416                 }
 417                 $kept_obj->alter_text( join( $joinstr, $kept_obj->text, $del_obj->text ) );
 418         }
 419         $self->del_reading( $deleted );
 420 }
 421
 422
 423 # Helper function for manipulating the graph.
 424 sub _stringify_args {
 425         my( $self, $first, $second, @args ) = @_;
 426     $first = $first->id
 427         if ref( $first ) eq 'Text::Tradition::Collation::Reading';
 428     $second = $second->id
 429         if ref( $second ) eq 'Text::Tradition::Collation::Reading';
 430     return( $first, $second, @args );
 431 }
 432
 433 # Helper function for manipulating the graph.
 434 sub _objectify_args {
 435         my( $self, $first, $second, $arg ) = @_;
 436     $first = $self->reading( $first )
 437         unless ref( $first ) eq 'Text::Tradition::Collation::Reading';
 438     $second = $self->reading( $second )
 439         unless ref( $second ) eq 'Text::Tradition::Collation::Reading';
 440     return( $first, $second, $arg );
 441 }
 442 ### Path logic
 443
 444 sub add_path {
 445         my $self = shift;
 446
 447         # We only need the IDs for adding paths to the graph, not the reading
 448         # objects themselves.
 449     my( $source, $target, $wit ) = $self->_stringify_args( @_ );
 450
 451         $self->_graphcalc_done(0);
 452         # Connect the readings
 453         unless( $self->sequence->has_edge( $source, $target ) ) {
 454             $self->sequence->add_edge( $source, $target );
 455             $self->relations->add_equivalence_edge( $source, $target );
 456         }
 457     # Note the witness in question
 458     $self->sequence->set_edge_attribute( $source, $target, $wit, 1 );
 459 }
 460
 461 sub del_path {
 462         my $self = shift;
 463         my @args;
 464         if( ref( $_[0] ) eq 'ARRAY' ) {
 465                 my $e = shift @_;
 466                 @args = ( @$e, @_ );
 467         } else {
 468                 @args = @_;
 469         }
 470
 471         # We only need the IDs for adding paths to the graph, not the reading
 472         # objects themselves.
 473     my( $source, $target, $wit ) = $self->_stringify_args( @args );
 474
 475         $self->_graphcalc_done(0);
 476         if( $self->sequence->has_edge_attribute( $source, $target, $wit ) ) {
 477                 $self->sequence->delete_edge_attribute( $source, $target, $wit );
 478         }
 479         unless( keys %{$self->sequence->get_edge_attributes( $source, $target )} ) {
 480                 $self->sequence->delete_edge( $source, $target );
 481                 $self->relations->delete_equivalence_edge( $source, $target );
 482         }
 483 }
 484
 485
 486 # Extra graph-alike utility
 487 sub has_path {
 488         my $self = shift;
 489     my( $source, $target, $wit ) = $self->_stringify_args( @_ );
 490         return undef unless $self->sequence->has_edge( $source, $target );
 491         return $self->sequence->has_edge_attribute( $source, $target, $wit );
 492 }
 493
 494 =head2 clear_witness( @sigil_list )
 495
 496 Clear the given witnesses out of the collation entirely, removing references
 497 to them in paths, and removing readings that belong only to them.  Should only
 498 be called via $tradition->del_witness.
 499
 500 =cut
 501
 502 sub clear_witness {
 503         my( $self, @sigils ) = @_;
 504
 505         $self->_graphcalc_done(0);
 506         # Clear the witness(es) out of the paths
 507         foreach my $e ( $self->paths ) {
 508                 foreach my $sig ( @sigils ) {
 509                         $self->del_path( $e, $sig );
 510                 }
 511         }
 512
 513         # Clear out the newly unused readings
 514         foreach my $r ( $self->readings ) {
 515                 unless( $self->reading_witnesses( $r ) ) {
 516                         $self->del_reading( $r );
 517                 }
 518         }
 519 }
 520
 521 sub add_relationship {
 522         my $self = shift;
 523     my( $source, $target, $opts ) = $self->_stringify_args( @_ );
 524     my( @vectors ) = $self->relations->add_relationship( $source, $target, $opts );
 525         $self->_graphcalc_done(0);
 526     return @vectors;
 527 }
 528
 529 around qw/ get_relationship del_relationship / => sub {
 530         my $orig = shift;
 531         my $self = shift;
 532         my @args = @_;
 533         if( @args == 1 && ref( $args[0] ) eq 'ARRAY' ) {
 534                 @args = @{$_[0]};
 535         }
 536         my( $source, $target ) = $self->_stringify_args( @args );
 537         $self->$orig( $source, $target );
 538 };
 539
 540 =head2 reading_witnesses( $reading )
 541
 542 Return a list of sigils corresponding to the witnesses in which the reading appears.
 543
 544 =cut
 545
 546 sub reading_witnesses {
 547         my( $self, $reading ) = @_;
 548         # We need only check either the incoming or the outgoing edges; I have
 549         # arbitrarily chosen "incoming".  Thus, special-case the start node.
 550         if( $reading eq $self->start ) {
 551                 return map { $_->sigil } $self->tradition->witnesses;
 552         }
 553         my %all_witnesses;
 554         foreach my $e ( $self->sequence->edges_to( $reading ) ) {
 555                 my $wits = $self->sequence->get_edge_attributes( @$e );
 556                 @all_witnesses{ keys %$wits } = 1;
 557         }
 558         my $acstr = $self->ac_label;
 559         foreach my $acwit ( grep { $_ =~ s/^(.*)\Q$acstr\E$/$1/ } keys %all_witnesses ) {
 560                 delete $all_witnesses{$acwit.$acstr} if exists $all_witnesses{$acwit};
 561         }
 562         return keys %all_witnesses;
 563 }
 564
 565 =head1 OUTPUT METHODS
 566
 567 =head2 as_svg( \%options )
 568
 569 Returns an SVG string that represents the graph, via as_dot and graphviz.
 570 See as_dot for a list of options.  Must have GraphViz (dot) installed to run.
 571
 572 =cut
 573
 574 sub as_svg {
 575     my( $self, $opts ) = @_;
 576     throw( "Need GraphViz installed to output SVG" )
 577         unless File::Which::which( 'dot' );
 578     my $want_subgraph = exists $opts->{'from'} || exists $opts->{'to'};
 579     $self->calculate_ranks()
 580         unless( $self->_graphcalc_done || $opts->{'nocalc'} || !$self->linear );
 581     if( !$self->has_cached_svg || $opts->{'recalc'}     || $want_subgraph ) {
 582                 my @cmd = qw/dot -Tsvg/;
 583                 my( $svg, $err );
 584                 my $dotfile = File::Temp->new();
 585                 ## USE FOR DEBUGGING
 586                 # $dotfile->unlink_on_destroy(0);
 587                 binmode $dotfile, ':utf8';
 588                 print $dotfile $self->as_dot( $opts );
 589                 push( @cmd, $dotfile->filename );
 590                 run( \@cmd, ">", binary(), \$svg );
 591                 $svg = decode_utf8( $svg );
 592                 $self->cached_svg( $svg ) unless $want_subgraph;
 593                 return $svg;
 594         } else {
 595                 return $self->cached_svg;
 596         }
 597 }
 598
 599
 600 =head2 as_dot( \%options )
 601
 602 Returns a string that is the collation graph expressed in dot
 603 (i.e. GraphViz) format.  Options include:
 604
 605 =over 4
 606
 607 =item * from
 608
 609 =item * to
 610
 611 =item * color_common
 612
 613 =back
 614
 615 =cut
 616
 617 sub as_dot {
 618     my( $self, $opts ) = @_;
 619     my $startrank = $opts->{'from'} if $opts;
 620     my $endrank = $opts->{'to'} if $opts;
 621     my $color_common = $opts->{'color_common'} if $opts;
 622     my $STRAIGHTENHACK = !$startrank && !$endrank && $self->end->rank
 623        && $self->end->rank > 100;
 624     $STRAIGHTENHACK = 1 if $opts->{'straight'}; # even for subgraphs or small graphs
 625
 626     # Check the arguments
 627     if( $startrank ) {
 628         return if $endrank && $startrank > $endrank;
 629         return if $startrank > $self->end->rank;
 630         }
 631         if( defined $endrank ) {
 632                 return if $endrank < 0;
 633                 $endrank = undef if $endrank == $self->end->rank;
 634         }
 635
 636     my $graph_name = $self->tradition->name;
 637     $graph_name =~ s/[^\w\s]//g;
 638     $graph_name = join( '_', split( /\s+/, $graph_name ) );
 639
 640     my %graph_attrs = (
 641         'rankdir' => 'LR',
 642         'bgcolor' => 'none',
 643         );
 644     my %node_attrs = (
 645         'fontsize' => 14,
 646         'fillcolor' => 'white',
 647         'style' => 'filled',
 648         'shape' => 'ellipse'
 649         );
 650     my %edge_attrs = (
 651         'arrowhead' => 'open',
 652         'color' => '#000000',
 653         'fontcolor' => '#000000',
 654         );
 655
 656     my $dot = sprintf( "digraph %s {\n", $graph_name );
 657     $dot .= "\tgraph " . _dot_attr_string( \%graph_attrs ) . ";\n";
 658     $dot .= "\tnode " . _dot_attr_string( \%node_attrs ) . ";\n";
 659
 660         # Output substitute start/end readings if necessary
 661         if( $startrank ) {
 662                 $dot .= "\t\"__SUBSTART__\" [ label=\"...\",id=\"__START__\" ];\n";
 663         }
 664         if( $endrank ) {
 665                 $dot .= "\t\"__SUBEND__\" [ label=\"...\",id=\"__END__\" ];\n";
 666         }
 667         if( $STRAIGHTENHACK ) {
 668                 ## HACK part 1
 669                 my $startlabel = $startrank ? '__SUBSTART__' : '__START__';
 670                 $dot .= "\tsubgraph { rank=same \"$startlabel\" \"#SILENT#\" }\n";
 671                 $dot .= "\t\"#SILENT#\" [ shape=diamond,color=white,penwidth=0,label=\"\" ];"
 672         }
 673         my %used;  # Keep track of the readings that actually appear in the graph
 674         # Sort the readings by rank if we have ranks; this speeds layout.
 675         my @all_readings = $self->end->has_rank
 676                 ? sort { $a->rank <=> $b->rank } $self->readings
 677                 : $self->readings;
 678         # TODO Refrain from outputting lacuna nodes - just grey out the edges.
 679     foreach my $reading ( @all_readings ) {
 680         # Only output readings within our rank range.
 681         next if $startrank && $reading->rank < $startrank;
 682         next if $endrank && $reading->rank > $endrank;
 683         $used{$reading->id} = 1;
 684         # Need not output nodes without separate labels
 685         next if $reading->id eq $reading->text;
 686         my $rattrs;
 687         my $label = $reading->text;
 688         $label .= '-' if $reading->join_next;
 689         $label = "-$label" if $reading->join_prior;
 690         $label =~ s/\"/\\\"/g;
 691                 $rattrs->{'label'} = $label;
 692                 $rattrs->{'id'} = $reading->id;
 693                 $rattrs->{'fillcolor'} = '#b3f36d' if $reading->is_common && $color_common;
 694         $dot .= sprintf( "\t\"%s\" %s;\n", $reading->id, _dot_attr_string( $rattrs ) );
 695     }
 696
 697         # Add the real edges. Need to weight one edge per rank jump, in a
 698         # continuous line.
 699         # my $weighted = $self->_add_edge_weights;
 700     my @edges = $self->paths;
 701         my( %substart, %subend );
 702     foreach my $edge ( @edges ) {
 703         # Do we need to output this edge?
 704         if( $used{$edge->[0]} && $used{$edge->[1]} ) {
 705                 my $label = $self->_path_display_label( $self->path_witnesses( $edge ) );
 706                         my $variables = { %edge_attrs, 'label' => $label };
 707
 708                         # Account for the rank gap if necessary
 709                         my $rank0 = $self->reading( $edge->[0] )->rank
 710                                 if $self->reading( $edge->[0] )->has_rank;
 711                         my $rank1 = $self->reading( $edge->[1] )->rank
 712                                 if $self->reading( $edge->[1] )->has_rank;
 713                         if( defined $rank0 && defined $rank1 && $rank1 - $rank0 > 1 ) {
 714                                 $variables->{'minlen'} = $rank1 - $rank0;
 715                         }
 716
 717                         # Add the calculated edge weights
 718                         # if( exists $weighted->{$edge->[0]}
 719                         #       && $weighted->{$edge->[0]} eq $edge->[1] ) {
 720                         #       # $variables->{'color'} = 'red';
 721                         #       $variables->{'weight'} = 3.0;
 722                         # }
 723
 724                         # EXPERIMENTAL: make edge width reflect no. of witnesses
 725                         my $extrawidth = scalar( $self->path_witnesses( $edge ) ) * 0.2;
 726                         $variables->{'penwidth'} = $extrawidth + 0.8; # gives 1 for a single wit
 727
 728                         my $varopts = _dot_attr_string( $variables );
 729                         $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
 730                                 $edge->[0], $edge->[1], $varopts );
 731         } elsif( $used{$edge->[0]} ) {
 732                 $subend{$edge->[0]} = 1;
 733         } elsif( $used{$edge->[1]} ) {
 734                 $substart{$edge->[1]} = 1;
 735         }
 736     }
 737     # Add substitute start and end edges if necessary
 738     foreach my $node ( keys %substart ) {
 739         my $witstr = $self->_path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) );
 740         my $variables = { %edge_attrs, 'label' => $witstr };
 741         my $varopts = _dot_attr_string( $variables );
 742         $dot .= "\t\"__SUBSTART__\" -> \"$node\" $varopts;";
 743         }
 744     foreach my $node ( keys %subend ) {
 745         my $witstr = $self->_path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) );
 746         my $variables = { %edge_attrs, 'label' => $witstr };
 747         my $varopts = _dot_attr_string( $variables );
 748         $dot .= "\t\"$node\" -> \"__SUBEND__\" $varopts;";
 749         }
 750         # HACK part 2
 751         if( $STRAIGHTENHACK ) {
 752                 my $endlabel = $endrank ? '__SUBEND__' : '__END__';
 753                 $dot .= "\t\"$endlabel\" -> \"#SILENT#\" [ color=white,penwidth=0 ];\n";
 754         }
 755
 756     $dot .= "}\n";
 757     return $dot;
 758 }
 759
 760 sub _dot_attr_string {
 761         my( $hash ) = @_;
 762         my @attrs;
 763         foreach my $k ( sort keys %$hash ) {
 764                 my $v = $hash->{$k};
 765                 push( @attrs, $k.'="'.$v.'"' );
 766         }
 767         return( '[ ' . join( ', ', @attrs ) . ' ]' );
 768 }
 769
 770 sub _add_edge_weights {
 771         my $self = shift;
 772         # Walk the graph from START to END, choosing the successor node with
 773         # the largest number of witness paths each time.
 774         my $weighted = {};
 775         my $curr = $self->start->id;
 776         my $ranked = $self->end->has_rank;
 777         while( $curr ne $self->end->id ) {
 778                 my $rank = $ranked ? $self->reading( $curr )->rank : 0;
 779                 my @succ = sort { $self->path_witnesses( $curr, $a )
 780                                                         <=> $self->path_witnesses( $curr, $b ) }
 781                         $self->sequence->successors( $curr );
 782                 my $next = pop @succ;
 783                 my $nextrank = $ranked ? $self->reading( $next )->rank : 0;
 784                 # Try to avoid lacunae in the weighted path.
 785                 while( @succ &&
 786                            ( $self->reading( $next )->is_lacuna ||
 787                                  $nextrank - $rank > 1 ) ){
 788                         $next = pop @succ;
 789                 }
 790                 $weighted->{$curr} = $next;
 791                 $curr = $next;
 792         }
 793         return $weighted;
 794 }
 795
 796 =head2 path_witnesses( $edge )
 797
 798 Returns the list of sigils whose witnesses are associated with the given edge.
 799 The edge can be passed as either an array or an arrayref of ( $source, $target ).
 800
 801 =cut
 802
 803 sub path_witnesses {
 804         my( $self, @edge ) = @_;
 805         # If edge is an arrayref, cope.
 806         if( @edge == 1 && ref( $edge[0] ) eq 'ARRAY' ) {
 807                 my $e = shift @edge;
 808                 @edge = @$e;
 809         }
 810         my @wits = keys %{$self->sequence->get_edge_attributes( @edge )};
 811         return @wits;
 812 }
 813
 814 # Helper function. Make a display label for the given witnesses, showing a.c.
 815 # witnesses only where the main witness is not also in the list.
 816 sub _path_display_label {
 817         my $self = shift;
 818         my %wits;
 819         map { $wits{$_} = 1 } @_;
 820
 821         # If an a.c. wit is listed, remove it if the main wit is also listed.
 822         # Otherwise keep it for explicit listing.
 823         my $aclabel = $self->ac_label;
 824         my @disp_ac;
 825         foreach my $w ( sort keys %wits ) {
 826                 if( $w =~ /^(.*)\Q$aclabel\E$/ ) {
 827                         if( exists $wits{$1} ) {
 828                                 delete $wits{$w};
 829                         } else {
 830                                 push( @disp_ac, $w );
 831                         }
 832                 }
 833         }
 834
 835         # See if we are in a majority situation.
 836         my $maj = scalar( $self->tradition->witnesses ) * 0.6;
 837         $maj = $maj > 5 ? $maj : 5;
 838         if( scalar keys %wits > $maj ) {
 839                 unshift( @disp_ac, 'majority' );
 840                 return join( ', ', @disp_ac );
 841         } else {
 842                 return join( ', ', sort keys %wits );
 843         }
 844 }
 845
 846 =head2 readings_at_rank( $rank )
 847
 848 Returns a list of readings at a given rank, taken from the alignment table.
 849
 850 =cut
 851
 852 sub readings_at_rank {
 853         my( $self, $rank ) = @_;
 854         my $table = $self->alignment_table;
 855         # Table rank is real rank - 1.
 856         my @elements = map { $_->{'tokens'}->[$rank-1] } @{$table->{'alignment'}};
 857         my %readings;
 858         foreach my $e ( @elements ) {
 859                 next unless ref( $e ) eq 'HASH';
 860                 next unless exists $e->{'t'};
 861                 $readings{$e->{'t'}->id} = $e->{'t'};
 862         }
 863         return values %readings;
 864 }
 865
 866 =head2 as_graphml
 867
 868 Returns a GraphML representation of the collation.  The GraphML will contain
 869 two graphs. The first expresses the attributes of the readings and the witness
 870 paths that link them; the second expresses the relationships that link the
 871 readings.  This is the native transfer format for a tradition.
 872
 873 =begin testing
 874
 875 use Text::Tradition;
 876
 877 my $READINGS = 311;
 878 my $PATHS = 361;
 879
 880 my $datafile = 't/data/florilegium_tei_ps.xml';
 881 my $tradition = Text::Tradition->new( 'input' => 'TEI',
 882                                       'name' => 'test0',
 883                                       'file' => $datafile,
 884                                       'linear' => 1 );
 885
 886 ok( $tradition, "Got a tradition object" );
 887 is( scalar $tradition->witnesses, 13, "Found all witnesses" );
 888 ok( $tradition->collation, "Tradition has a collation" );
 889
 890 my $c = $tradition->collation;
 891 is( scalar $c->readings, $READINGS, "Collation has all readings" );
 892 is( scalar $c->paths, $PATHS, "Collation has all paths" );
 893 is( scalar $c->relationships, 0, "Collation has all relationships" );
 894
 895 # Add a few relationships
 896 $c->add_relationship( 'w123', 'w125', { 'type' => 'collated' } );
 897 $c->add_relationship( 'w193', 'w196', { 'type' => 'collated' } );
 898 $c->add_relationship( 'w257', 'w262', { 'type' => 'transposition' } );
 899
 900 # Now write it to GraphML and parse it again.
 901
 902 my $graphml = $c->as_graphml;
 903 my $st = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml );
 904 is( scalar $st->collation->readings, $READINGS, "Reparsed collation has all readings" );
 905 is( scalar $st->collation->paths, $PATHS, "Reparsed collation has all paths" );
 906 is( scalar $st->collation->relationships, 3, "Reparsed collation has new relationships" );
 907
 908 # Now add a stemma, write to GraphML, and parse again.
 909 my $stemma = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
 910 is( ref( $stemma ), 'Text::Tradition::Stemma', "Parsed dotfile into stemma" );
 911 is( $tradition->stemmata, 1, "Tradition now has the stemma" );
 912 $graphml = $c->as_graphml;
 913 like( $graphml, qr/digraph/, "Digraph declaration exists in GraphML" );
 914
 915 =end testing
 916
 917 =cut
 918
 919 sub as_graphml {
 920     my( $self, $options ) = @_;
 921         $self->calculate_ranks unless $self->_graphcalc_done;
 922
 923         my $start = $options->{'from'}
 924                 ? $self->reading( $options->{'from'} ) : $self->start;
 925         my $end = $options->{'to'}
 926                 ? $self->reading( $options->{'to'} ) : $self->end;
 927         if( $start->has_rank && $end->has_rank && $end->rank < $start->rank ) {
 928                 throw( 'Start node must be before end node' );
 929         }
 930         # The readings need to be ranked for this to work.
 931         $start = $self->start unless $start->has_rank;
 932         $end = $self->end unless $end->has_rank;
 933         my $rankoffset = 0;
 934         unless( $start eq $self->start ) {
 935                 $rankoffset = $start->rank - 1;
 936         }
 937         my %use_readings;
 938
 939     # Some namespaces
 940     my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns';
 941     my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance';
 942     my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' .
 943         'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
 944
 945     # Create the document and root node
 946     my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
 947     my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
 948     $graphml->setDocumentElement( $root );
 949     $root->setNamespace( $xsi_ns, 'xsi', 0 );
 950     $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
 951
 952     # List of attribute types to save on our objects and their corresponding
 953     # GraphML types
 954     my %save_types = (
 955         'Str' => 'string',
 956         'Int' => 'int',
 957         'Bool' => 'boolean',
 958         'ReadingID' => 'string',
 959         'RelationshipType' => 'string',
 960         'RelationshipScope' => 'string',
 961     );
 962
 963     # List of attribute names *not* to save on our objects.
 964     # We will also not save any attribute beginning with _.
 965     my %skipsave;
 966     map { $skipsave{$_} = 1 } qw/ cached_svg /;
 967
 968     # Add the data keys for the graph. Include an extra key 'version' for the
 969     # GraphML output version.
 970     my %graph_data_keys;
 971     my $gdi = 0;
 972     my %graph_attributes = ( 'version' => 'string' );
 973         # Graph attributes include those of Tradition and those of Collation.
 974         my %gattr_from;
 975         my $tmeta = $self->tradition->meta;
 976         my $cmeta = $self->meta;
 977         map { $gattr_from{$_->name} = 'Tradition' } $tmeta->get_all_attributes;
 978         map { $gattr_from{$_->name} = 'Collation' } $cmeta->get_all_attributes;
 979         foreach my $attr ( ( $tmeta->get_all_attributes, $cmeta->get_all_attributes ) ) {
 980                 next if $attr->name =~ /^_/;
 981                 next if $skipsave{$attr->name};
 982                 next unless $save_types{$attr->type_constraint->name};
 983                 $graph_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
 984         }
 985     # Extra custom key for the tradition stemma(ta)
 986     $graph_attributes{'stemmata'} = 'string';
 987
 988     foreach my $datum ( sort keys %graph_attributes ) {
 989         $graph_data_keys{$datum} = 'dg'.$gdi++;
 990         my $key = $root->addNewChild( $graphml_ns, 'key' );
 991         $key->setAttribute( 'attr.name', $datum );
 992         $key->setAttribute( 'attr.type', $graph_attributes{$datum} );
 993         $key->setAttribute( 'for', 'graph' );
 994         $key->setAttribute( 'id', $graph_data_keys{$datum} );
 995     }
 996
 997     # Add the data keys for reading nodes
 998     my %reading_attributes;
 999     my $rmeta = Text::Tradition::Collation::Reading->meta;
1000     foreach my $attr( $rmeta->get_all_attributes ) {
1001                 next if $attr->name =~ /^_/;
1002                 next if $skipsave{$attr->name};
1003                 next unless $save_types{$attr->type_constraint->name};
1004                 $reading_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1005         }
1006         # Extra custom key for the reading morphology
1007         $reading_attributes{'lexemes'} = 'string';
1008
1009     my %node_data_keys;
1010     my $ndi = 0;
1011     foreach my $datum ( sort keys %reading_attributes ) {
1012         $node_data_keys{$datum} = 'dn'.$ndi++;
1013         my $key = $root->addNewChild( $graphml_ns, 'key' );
1014         $key->setAttribute( 'attr.name', $datum );
1015         $key->setAttribute( 'attr.type', $reading_attributes{$datum} );
1016         $key->setAttribute( 'for', 'node' );
1017         $key->setAttribute( 'id', $node_data_keys{$datum} );
1018     }
1019
1020     # Add the data keys for edges, that is, paths and relationships. Path
1021     # data does not come from a Moose class so is here manually.
1022     my $edi = 0;
1023     my %edge_data_keys;
1024     my %edge_attributes = (
1025         witness => 'string',                    # ID/label for a path
1026         extra => 'boolean',                             # Path key
1027         );
1028     my @path_attributes = keys %edge_attributes; # track our manual additions
1029     my $pmeta = Text::Tradition::Collation::Relationship->meta;
1030     foreach my $attr( $pmeta->get_all_attributes ) {
1031                 next if $attr->name =~ /^_/;
1032                 next if $skipsave{$attr->name};
1033                 next unless $save_types{$attr->type_constraint->name};
1034                 $edge_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1035         }
1036     foreach my $datum ( sort keys %edge_attributes ) {
1037         $edge_data_keys{$datum} = 'de'.$edi++;
1038         my $key = $root->addNewChild( $graphml_ns, 'key' );
1039         $key->setAttribute( 'attr.name', $datum );
1040         $key->setAttribute( 'attr.type', $edge_attributes{$datum} );
1041         $key->setAttribute( 'for', 'edge' );
1042         $key->setAttribute( 'id', $edge_data_keys{$datum} );
1043     }
1044
1045     # Add the collation graph itself. First, sanitize the name to a valid XML ID.
1046     my $xmlidname = $self->tradition->name;
1047     $xmlidname =~ s/(?!$xml10_namechar_rx)./_/g;
1048     if( $xmlidname !~ /^$xml10_namestartchar_rx/ ) {
1049         $xmlidname = '_'.$xmlidname;
1050     }
1051     my $sgraph = $root->addNewChild( $graphml_ns, 'graph' );
1052     $sgraph->setAttribute( 'edgedefault', 'directed' );
1053     $sgraph->setAttribute( 'id', $xmlidname );
1054     $sgraph->setAttribute( 'parse.edgeids', 'canonical' );
1055     $sgraph->setAttribute( 'parse.edges', 0 ); # fill in later
1056     $sgraph->setAttribute( 'parse.nodeids', 'canonical' );
1057     $sgraph->setAttribute( 'parse.nodes', 0 ); # fill in later
1058     $sgraph->setAttribute( 'parse.order', 'nodesfirst' );
1059
1060     # Tradition/collation attribute data
1061     foreach my $datum ( keys %graph_attributes ) {
1062         my $value;
1063         if( $datum eq 'version' ) {
1064                 $value = '3.2';
1065         } elsif( $datum eq 'stemmata' ) {
1066                 my @stemstrs;
1067                 map { push( @stemstrs, $_->editable( {linesep => ''} ) ) }
1068                         $self->tradition->stemmata;
1069                 $value = join( "\n", @stemstrs );
1070         } elsif( $gattr_from{$datum} eq 'Tradition' ) {
1071                 $value = $self->tradition->$datum;
1072         } else {
1073                 $value = $self->$datum;
1074         }
1075                 _add_graphml_data( $sgraph, $graph_data_keys{$datum}, $value );
1076         }
1077
1078     my $node_ctr = 0;
1079     my %node_hash;
1080     # Add our readings to the graph
1081     foreach my $n ( sort { $a->id cmp $b->id } $self->readings ) {
1082         next if $n->has_rank && $n ne $self->start && $n ne $self->end &&
1083                 ( $n->rank < $start->rank || $n->rank > $end->rank );
1084         $use_readings{$n->id} = 1;
1085         # Add to the main graph
1086         my $node_el = $sgraph->addNewChild( $graphml_ns, 'node' );
1087         my $node_xmlid = 'n' . $node_ctr++;
1088         $node_hash{ $n->id } = $node_xmlid;
1089         $node_el->setAttribute( 'id', $node_xmlid );
1090         foreach my $d ( keys %reading_attributes ) {
1091                 my $nval = $n->$d;
1092                 # Custom serialization
1093                 if( $d eq 'lexemes' ) {
1094                                 # If nval is a true value, we have lexemes so we need to
1095                                 # serialize them. Otherwise set nval to undef so that the
1096                                 # key is excluded from this reading.
1097                         $nval = $nval ? $n->_serialize_lexemes : undef;
1098                 } elsif( $d eq 'normal_form' && $n->normal_form eq $n->text ) {
1099                         $nval = undef;
1100                 }
1101                 if( $rankoffset && $d eq 'rank' && $n ne $self->start ) {
1102                         # Adjust the ranks within the subgraph.
1103                         $nval = $n eq $self->end ? $end->rank - $rankoffset + 1
1104                                 : $nval - $rankoffset;
1105                 }
1106                 _add_graphml_data( $node_el, $node_data_keys{$d}, $nval )
1107                         if defined $nval;
1108         }
1109     }
1110
1111     # Add the path edges to the sequence graph
1112     my $edge_ctr = 0;
1113     foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->sequence->edges() ) {
1114         # We add an edge in the graphml for every witness in $e.
1115         next unless( $use_readings{$e->[0]} || $use_readings{$e->[1]} );
1116         my @edge_wits = sort $self->path_witnesses( $e );
1117         $e->[0] = $self->start->id unless $use_readings{$e->[0]};
1118         $e->[1] = $self->end->id unless $use_readings{$e->[1]};
1119         # Skip any path from start to end; that witness is not in the subgraph.
1120         next if ( $e->[0] eq $self->start->id && $e->[1] eq $self->end->id );
1121         foreach my $wit ( @edge_wits ) {
1122                         my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
1123                                                                                 $node_hash{ $e->[0] },
1124                                                                                 $node_hash{ $e->[1] } );
1125                         my $edge_el = $sgraph->addNewChild( $graphml_ns, 'edge' );
1126                         $edge_el->setAttribute( 'source', $from );
1127                         $edge_el->setAttribute( 'target', $to );
1128                         $edge_el->setAttribute( 'id', $id );
1129
1130                         # It's a witness path, so add the witness
1131                         my $base = $wit;
1132                         my $key = $edge_data_keys{'witness'};
1133                         # Is this an ante-corr witness?
1134                         my $aclabel = $self->ac_label;
1135                         if( $wit =~ /^(.*)\Q$aclabel\E$/ ) {
1136                                 # Keep the base witness
1137                                 $base = $1;
1138                                 # ...and record that this is an 'extra' reading path
1139                                 _add_graphml_data( $edge_el, $edge_data_keys{'extra'}, $aclabel );
1140                         }
1141                         _add_graphml_data( $edge_el, $edge_data_keys{'witness'}, $base );
1142                 }
1143         }
1144
1145         # Report the actual number of nodes and edges that went in
1146         $sgraph->setAttribute( 'parse.edges', $edge_ctr );
1147         $sgraph->setAttribute( 'parse.nodes', $node_ctr );
1148
1149         # Add the relationship graph to the XML
1150         map { delete $edge_data_keys{$_} } @path_attributes;
1151         $self->relations->_as_graphml( $graphml_ns, $root, \%node_hash,
1152                 $node_data_keys{'id'}, \%edge_data_keys );
1153
1154     # Save and return the thing
1155     my $result = decode_utf8( $graphml->toString(1) );
1156     return $result;
1157 }
1158
1159 sub _add_graphml_data {
1160     my( $el, $key, $value ) = @_;
1161     return unless defined $value;
1162     my $data_el = $el->addNewChild( $el->namespaceURI, 'data' );
1163     $data_el->setAttribute( 'key', $key );
1164     $data_el->appendText( $value );
1165 }
1166
1167 =head2 as_csv
1168
1169 Returns a CSV alignment table representation of the collation graph, one
1170 row per witness (or witness uncorrected.)
1171
1172 =cut
1173
1174 sub as_csv {
1175     my( $self ) = @_;
1176     my $table = $self->alignment_table;
1177     my $csv = Text::CSV->new( { binary => 1, quote_null => 0 } );
1178     my @result;
1179     # Make the header row
1180     $csv->combine( map { $_->{'witness'} } @{$table->{'alignment'}} );
1181         push( @result, decode_utf8( $csv->string ) );
1182     # Make the rest of the rows
1183     foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
1184         my @rowobjs = map { $_->{'tokens'}->[$idx] } @{$table->{'alignment'}};
1185         my @row = map { $_ ? $_->{'t'}->text : $_ } @rowobjs;
1186         $csv->combine( @row );
1187         push( @result, decode_utf8( $csv->string ) );
1188     }
1189     return join( "\n", @result );
1190 }
1191
1192 =head2 alignment_table( $use_refs, $include_witnesses )
1193
1194 Return a reference to an alignment table, in a slightly enhanced CollateX
1195 format which looks like this:
1196
1197  $table = { alignment => [ { witness => "SIGIL",
1198                              tokens => [ { t => "TEXT" }, ... ] },
1199                            { witness => "SIG2",
1200                              tokens => [ { t => "TEXT" }, ... ] },
1201                            ... ],
1202             length => TEXTLEN };
1203
1204 If $use_refs is set to 1, the reading object is returned in the table
1205 instead of READINGTEXT; if not, the text of the reading is returned.
1206
1207 If $include_witnesses is set to a hashref, only the witnesses whose sigil
1208 keys have a true hash value will be included.
1209
1210 =cut
1211
1212 sub alignment_table {
1213     my( $self ) = @_;
1214     $self->calculate_ranks() unless $self->_graphcalc_done;
1215     return $self->cached_table if $self->has_cached_table;
1216
1217     # Make sure we can do this
1218         throw( "Need a linear graph in order to make an alignment table" )
1219                 unless $self->linear;
1220         $self->calculate_ranks unless $self->end->has_rank;
1221
1222     my $table = { 'alignment' => [], 'length' => $self->end->rank - 1 };
1223     my @all_pos = ( 1 .. $self->end->rank - 1 );
1224     foreach my $wit ( sort { $a->sigil cmp $b->sigil } $self->tradition->witnesses ) {
1225         # print STDERR "Making witness row(s) for " . $wit->sigil . "\n";
1226         my @wit_path = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
1227         my @row = _make_witness_row( \@wit_path, \@all_pos );
1228         push( @{$table->{'alignment'}},
1229                 { 'witness' => $wit->sigil, 'tokens' => \@row } );
1230         if( $wit->is_layered ) {
1231                 my @wit_ac_path = $self->reading_sequence( $self->start, $self->end,
1232                         $wit->sigil.$self->ac_label );
1233             my @ac_row = _make_witness_row( \@wit_ac_path, \@all_pos );
1234                         push( @{$table->{'alignment'}},
1235                                 { 'witness' => $wit->sigil.$self->ac_label, 'tokens' => \@ac_row } );
1236         }
1237     }
1238     $self->cached_table( $table );
1239     return $table;
1240 }
1241
1242 sub _make_witness_row {
1243     my( $path, $positions ) = @_;
1244     my %char_hash;
1245     map { $char_hash{$_} = undef } @$positions;
1246     my $debug = 0;
1247     foreach my $rdg ( @$path ) {
1248         my $rtext = $rdg->text;
1249         $rtext = '#LACUNA#' if $rdg->is_lacuna;
1250         print STDERR "rank " . $rdg->rank . "\n" if $debug;
1251         # print STDERR "No rank for " . $rdg->id . "\n" unless defined $rdg->rank;
1252         $char_hash{$rdg->rank} = { 't' => $rdg };
1253     }
1254     my @row = map { $char_hash{$_} } @$positions;
1255     # Fill in lacuna markers for undef spots in the row
1256     my $last_el = shift @row;
1257     my @filled_row = ( $last_el );
1258     foreach my $el ( @row ) {
1259         # If we are using node reference, make the lacuna node appear many times
1260         # in the table.  If not, use the lacuna tag.
1261         if( $last_el && $last_el->{'t'}->is_lacuna && !defined $el ) {
1262             $el = $last_el;
1263         }
1264         push( @filled_row, $el );
1265         $last_el = $el;
1266     }
1267     return @filled_row;
1268 }
1269
1270 =head1 NAVIGATION METHODS
1271
1272 =head2 reading_sequence( $first, $last, $sigil, $backup )
1273
1274 Returns the ordered list of readings, starting with $first and ending
1275 with $last, for the witness given in $sigil. If a $backup sigil is
1276 specified (e.g. when walking a layered witness), it will be used wherever
1277 no $sigil path exists.  If there is a base text reading, that will be
1278 used wherever no path exists for $sigil or $backup.
1279
1280 =cut
1281
1282 # TODO Think about returning some lazy-eval iterator.
1283 # TODO Get rid of backup; we should know from what witness is whether we need it.
1284
1285 sub reading_sequence {
1286     my( $self, $start, $end, $witness ) = @_;
1287
1288     $witness = $self->baselabel unless $witness;
1289     my @readings = ( $start );
1290     my %seen;
1291     my $n = $start;
1292     while( $n && $n->id ne $end->id ) {
1293         if( exists( $seen{$n->id} ) ) {
1294             throw( "Detected loop for $witness at " . $n->id );
1295         }
1296         $seen{$n->id} = 1;
1297
1298         my $next = $self->next_reading( $n, $witness );
1299         unless( $next ) {
1300             throw( "Did not find any path for $witness from reading " . $n->id );
1301         }
1302         push( @readings, $next );
1303         $n = $next;
1304     }
1305     # Check that the last reading is our end reading.
1306     my $last = $readings[$#readings];
1307     throw( "Last reading found from " . $start->text .
1308         " for witness $witness is not the end!" ) # TODO do we get this far?
1309         unless $last->id eq $end->id;
1310
1311     return @readings;
1312 }
1313
1314 =head2 next_reading( $reading, $sigil );
1315
1316 Returns the reading that follows the given reading along the given witness
1317 path.
1318
1319 =cut
1320
1321 sub next_reading {
1322     # Return the successor via the corresponding path.
1323     my $self = shift;
1324     my $answer = $self->_find_linked_reading( 'next', @_ );
1325         return undef unless $answer;
1326     return $self->reading( $answer );
1327 }
1328
1329 =head2 prior_reading( $reading, $sigil )
1330
1331 Returns the reading that precedes the given reading along the given witness
1332 path.
1333
1334 =cut
1335
1336 sub prior_reading {
1337     # Return the predecessor via the corresponding path.
1338     my $self = shift;
1339     my $answer = $self->_find_linked_reading( 'prior', @_ );
1340     return $self->reading( $answer );
1341 }
1342
1343 sub _find_linked_reading {
1344     my( $self, $direction, $node, $path ) = @_;
1345
1346     # Get a backup if we are dealing with a layered witness
1347     my $alt_path;
1348     my $aclabel = $self->ac_label;
1349     if( $path && $path =~ /^(.*)\Q$aclabel\E$/ ) {
1350         $alt_path = $1;
1351     }
1352
1353     my @linked_paths = $direction eq 'next'
1354         ? $self->sequence->edges_from( $node )
1355         : $self->sequence->edges_to( $node );
1356     return undef unless scalar( @linked_paths );
1357
1358     # We have to find the linked path that contains all of the
1359     # witnesses supplied in $path.
1360     my( @path_wits, @alt_path_wits );
1361     @path_wits = sort( $self->_witnesses_of_label( $path ) ) if $path;
1362     @alt_path_wits = sort( $self->_witnesses_of_label( $alt_path ) ) if $alt_path;
1363     my $base_le;
1364     my $alt_le;
1365     foreach my $le ( @linked_paths ) {
1366         if( $self->sequence->has_edge_attribute( @$le, $self->baselabel ) ) {
1367             $base_le = $le;
1368         }
1369                 my @le_wits = sort $self->path_witnesses( $le );
1370                 if( _is_within( \@path_wits, \@le_wits ) ) {
1371                         # This is the right path.
1372                         return $direction eq 'next' ? $le->[1] : $le->[0];
1373                 } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) {
1374                         $alt_le = $le;
1375                 }
1376     }
1377     # Got this far? Return the alternate path if it exists.
1378     return $direction eq 'next' ? $alt_le->[1] : $alt_le->[0]
1379         if $alt_le;
1380
1381     # Got this far? Return the base path if it exists.
1382     return $direction eq 'next' ? $base_le->[1] : $base_le->[0]
1383         if $base_le;
1384
1385     # Got this far? We have no appropriate path.
1386     warn "Could not find $direction node from " . $node->id
1387         . " along path $path";
1388     return undef;
1389 }
1390
1391 # Some set logic.
1392 sub _is_within {
1393     my( $set1, $set2 ) = @_;
1394     my $ret = @$set1; # will be 0, i.e. false, if set1 is empty
1395     foreach my $el ( @$set1 ) {
1396         $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
1397     }
1398     return $ret;
1399 }
1400
1401 # Return the string that joins together a list of witnesses for
1402 # display on a single path.
1403 sub _witnesses_of_label {
1404     my( $self, $label ) = @_;
1405     my $regex = $self->wit_list_separator;
1406     my @answer = split( /\Q$regex\E/, $label );
1407     return @answer;
1408 }
1409
1410 =head2 common_readings
1411
1412 Returns the list of common readings in the graph (i.e. those readings that are
1413 shared by all non-lacunose witnesses.)
1414
1415 =cut
1416
1417 sub common_readings {
1418         my $self = shift;
1419         my @common = grep { $_->is_common } $self->readings;
1420         return @common;
1421 }
1422
1423 =head2 path_text( $sigil, [, $start, $end ] )
1424
1425 Returns the text of a witness (plus its backup, if we are using a layer)
1426 as stored in the collation.  The text is returned as a string, where the
1427 individual readings are joined with spaces and the meta-readings (e.g.
1428 lacunae) are omitted.  Optional specification of $start and $end allows
1429 the generation of a subset of the witness text.
1430
1431 =cut
1432
1433 sub path_text {
1434         my( $self, $wit, $start, $end ) = @_;
1435         $start = $self->start unless $start;
1436         $end = $self->end unless $end;
1437         my @path = grep { !$_->is_meta } $self->reading_sequence( $start, $end, $wit );
1438         my $pathtext = '';
1439         my $last;
1440         foreach my $r ( @path ) {
1441                 unless ( $r->join_prior || !$last || $last->join_next ) {
1442                         $pathtext .= ' ';
1443                 }
1444                 $pathtext .= $r->text;
1445                 $last = $r;
1446         }
1447         return $pathtext;
1448 }
1449
1450 =head1 INITIALIZATION METHODS
1451
1452 These are mostly for use by parsers.
1453
1454 =head2 make_witness_path( $witness )
1455
1456 Link the array of readings contained in $witness->path (and in
1457 $witness->uncorrected_path if it exists) into collation paths.
1458 Clear out the arrays when finished.
1459
1460 =head2 make_witness_paths
1461
1462 Call make_witness_path for all witnesses in the tradition.
1463
1464 =cut
1465
1466 # For use when a collation is constructed from a base text and an apparatus.
1467 # We have the sequences of readings and just need to add path edges.
1468 # When we are done, clear out the witness path attributes, as they are no
1469 # longer needed.
1470 # TODO Find a way to replace the witness path attributes with encapsulated functions?
1471
1472 sub make_witness_paths {
1473     my( $self ) = @_;
1474     foreach my $wit ( $self->tradition->witnesses ) {
1475         # print STDERR "Making path for " . $wit->sigil . "\n";
1476         $self->make_witness_path( $wit );
1477     }
1478 }
1479
1480 sub make_witness_path {
1481     my( $self, $wit ) = @_;
1482     my @chain = @{$wit->path};
1483     my $sig = $wit->sigil;
1484     # Add start and end if necessary
1485     unshift( @chain, $self->start ) unless $chain[0] eq $self->start;
1486     push( @chain, $self->end ) unless $chain[-1] eq $self->end;
1487     foreach my $idx ( 0 .. $#chain-1 ) {
1488         $self->add_path( $chain[$idx], $chain[$idx+1], $sig );
1489     }
1490     if( $wit->is_layered ) {
1491         @chain = @{$wit->uncorrected_path};
1492                 unshift( @chain, $self->start ) unless $chain[0] eq $self->start;
1493                 push( @chain, $self->end ) unless $chain[-1] eq $self->end;
1494         foreach my $idx( 0 .. $#chain-1 ) {
1495             my $source = $chain[$idx];
1496             my $target = $chain[$idx+1];
1497             $self->add_path( $source, $target, $sig.$self->ac_label )
1498                 unless $self->has_path( $source, $target, $sig );
1499         }
1500     }
1501     $wit->clear_path;
1502     $wit->clear_uncorrected_path;
1503 }
1504
1505 =head2 calculate_ranks
1506
1507 Calculate the reading ranks (that is, their aligned positions relative
1508 to each other) for the graph.  This can only be called on linear collations.
1509
1510 =begin testing
1511
1512 use Text::Tradition;
1513
1514 my $cxfile = 't/data/Collatex-16.xml';
1515 my $t = Text::Tradition->new(
1516     'name'  => 'inline',
1517     'input' => 'CollateX',
1518     'file'  => $cxfile,
1519     );
1520 my $c = $t->collation;
1521
1522 # Make an svg
1523 my $table = $c->alignment_table;
1524 ok( $c->has_cached_table, "Alignment table was cached" );
1525 is( $c->alignment_table, $table, "Cached table returned upon second call" );
1526 $c->calculate_ranks;
1527 is( $c->alignment_table, $table, "Cached table retained with no rank change" );
1528 $c->add_relationship( 'n24', 'n23', { 'type' => 'spelling' } );
1529 isnt( $c->alignment_table, $table, "Alignment table changed after relationship add" );
1530
1531 =end testing
1532
1533 =cut
1534
1535 sub calculate_ranks {
1536     my $self = shift;
1537     # Save the existing ranks, in case we need to invalidate the cached SVG.
1538     my %existing_ranks;
1539     map { $existing_ranks{$_} = $_->rank } $self->readings;
1540
1541     # Do the rankings based on the relationship equivalence graph, starting
1542     # with the start node.
1543     my ( $node_ranks, $rank_nodes ) = $self->relations->equivalence_ranks();
1544
1545     # Transfer our rankings from the topological graph to the real one.
1546     foreach my $r ( $self->readings ) {
1547         if( defined $node_ranks->{$self->equivalence( $r->id )} ) {
1548             $r->rank( $node_ranks->{$self->equivalence( $r->id )} );
1549         } else {
1550                 # Die. Find the last rank we calculated.
1551                 my @all_defined = sort { ( $node_ranks->{$self->equivalence( $a->id )}||-1 )
1552                                  <=> ( $node_ranks->{$self->equivalence( $b->id )}||-1 ) }
1553                         $self->readings;
1554                 my $last = pop @all_defined;
1555             throw( "Ranks not calculated after $last - do you have a cycle in the graph?" );
1556         }
1557     }
1558     # Do we need to invalidate the cached data?
1559     if( $self->has_cached_svg || $self->has_cached_table ) {
1560         foreach my $r ( $self->readings ) {
1561                 next if defined( $existing_ranks{$r} )
1562                         && $existing_ranks{$r} == $r->rank;
1563                 # Something has changed, so clear the cache
1564                 $self->_clear_cache;
1565                         # ...and recalculate the common readings.
1566                         $self->calculate_common_readings();
1567                 last;
1568         }
1569     }
1570         # The graph calculation information is now up to date.
1571         $self->_graphcalc_done(1);
1572 }
1573
1574 sub _clear_cache {
1575         my $self = shift;
1576         $self->wipe_svg if $self->has_cached_svg;
1577         $self->wipe_table if $self->has_cached_table;
1578 }
1579
1580
1581 =head2 flatten_ranks
1582
1583 A convenience method for parsing collation data.  Searches the graph for readings
1584 with the same text at the same rank, and merges any that are found.
1585
1586 =cut
1587
1588 sub flatten_ranks {
1589     my $self = shift;
1590     my %unique_rank_rdg;
1591     my $changed;
1592     foreach my $rdg ( $self->readings ) {
1593         next unless $rdg->has_rank;
1594         my $key = $rdg->rank . "||" . $rdg->text;
1595         if( exists $unique_rank_rdg{$key} ) {
1596                 # Make sure they don't have different grammatical forms
1597                         my $ur = $unique_rank_rdg{$key};
1598                         if( $rdg->disambiguated && $ur->disambiguated ) {
1599                                 my $rform = join( '//', map { $_->form->to_string } $rdg->lexemes );
1600                                 my $uform = join( '//', map { $_->form->to_string } $ur->lexemes );
1601                                 next unless $rform eq $uform;
1602                         } elsif( $rdg->disambiguated xor $ur->disambiguated ) {
1603                                 next;
1604                         }
1605             # Combine!
1606                 #print STDERR "Combining readings at same rank: $key\n";
1607                 $changed = 1;
1608             $self->merge_readings( $unique_rank_rdg{$key}, $rdg );
1609             # TODO see if this now makes a common point.
1610         } else {
1611             $unique_rank_rdg{$key} = $rdg;
1612         }
1613     }
1614     # If we merged readings, the ranks are still fine but the alignment
1615     # table is wrong. Wipe it.
1616     $self->wipe_table() if $changed;
1617 }
1618
1619
1620 =head2 calculate_common_readings
1621
1622 Goes through the graph identifying the readings that appear in every witness
1623 (apart from those with lacunae at that spot.) Marks them as common and returns
1624 the list.
1625
1626 =begin testing
1627
1628 use Text::Tradition;
1629
1630 my $cxfile = 't/data/Collatex-16.xml';
1631 my $t = Text::Tradition->new(
1632     'name'  => 'inline',
1633     'input' => 'CollateX',
1634     'file'  => $cxfile,
1635     );
1636 my $c = $t->collation;
1637
1638 my @common = $c->calculate_common_readings();
1639 is( scalar @common, 8, "Found correct number of common readings" );
1640 my @marked = sort $c->common_readings();
1641 is( scalar @common, 8, "All common readings got marked as such" );
1642 my @expected = qw/ n1 n11 n16 n19 n20 n5 n6 n7 /;
1643 is_deeply( \@marked, \@expected, "Found correct list of common readings" );
1644
1645 =end testing
1646
1647 =cut
1648
1649 sub calculate_common_readings {
1650         my $self = shift;
1651         my @common;
1652         map { $_->is_common( 0 ) } $self->readings;
1653         # Implicitly calls calculate_ranks
1654         my $table = $self->alignment_table;
1655         foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
1656                 my @row = map { $_->{'tokens'}->[$idx]
1657                                                         ? $_->{'tokens'}->[$idx]->{'t'} : '' }
1658                                         @{$table->{'alignment'}};
1659                 my %hash;
1660                 foreach my $r ( @row ) {
1661                         if( $r ) {
1662                                 $hash{$r->id} = $r unless $r->is_meta;
1663                         } else {
1664                                 $hash{'UNDEF'} = $r;
1665                         }
1666                 }
1667                 if( keys %hash == 1 && !exists $hash{'UNDEF'} ) {
1668                         my( $r ) = values %hash;
1669                         $r->is_common( 1 );
1670                         push( @common, $r );
1671                 }
1672         }
1673         return @common;
1674 }
1675
1676 =head2 text_from_paths
1677
1678 Calculate the text array for all witnesses from the path, for later consistency
1679 checking.  Only to be used if there is no non-graph-based way to know the
1680 original texts.
1681
1682 =cut
1683
1684 sub text_from_paths {
1685         my $self = shift;
1686     foreach my $wit ( $self->tradition->witnesses ) {
1687         my @readings = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
1688         my @text;
1689         foreach my $r ( @readings ) {
1690                 next if $r->is_meta;
1691                 push( @text, $r->text );
1692         }
1693         $wit->text( \@text );
1694         if( $wit->is_layered ) {
1695                         my @ucrdgs = $self->reading_sequence( $self->start, $self->end,
1696                                                                                                   $wit->sigil.$self->ac_label );
1697                         my @uctext;
1698                         foreach my $r ( @ucrdgs ) {
1699                                 next if $r->is_meta;
1700                                 push( @uctext, $r->text );
1701                         }
1702                         $wit->layertext( \@uctext );
1703         }
1704     }
1705 }
1706
1707 =head1 UTILITY FUNCTIONS
1708
1709 =head2 common_predecessor( $reading_a, $reading_b )
1710
1711 Find the last reading that occurs in sequence before both the given readings.
1712 At the very least this should be $self->start.
1713
1714 =head2 common_successor( $reading_a, $reading_b )
1715
1716 Find the first reading that occurs in sequence after both the given readings.
1717 At the very least this should be $self->end.
1718
1719 =begin testing
1720
1721 use Text::Tradition;
1722
1723 my $cxfile = 't/data/Collatex-16.xml';
1724 my $t = Text::Tradition->new(
1725     'name'  => 'inline',
1726     'input' => 'CollateX',
1727     'file'  => $cxfile,
1728     );
1729 my $c = $t->collation;
1730
1731 is( $c->common_predecessor( 'n24', 'n23' )->id,
1732     'n20', "Found correct common predecessor" );
1733 is( $c->common_successor( 'n24', 'n23' )->id,
1734     '__END__', "Found correct common successor" );
1735
1736 is( $c->common_predecessor( 'n19', 'n17' )->id,
1737     'n16', "Found correct common predecessor for readings on same path" );
1738 is( $c->common_successor( 'n21', 'n10' )->id,
1739     '__END__', "Found correct common successor for readings on same path" );
1740
1741 =end testing
1742
1743 =cut
1744
1745 ## Return the closest reading that is a predecessor of both the given readings.
1746 sub common_predecessor {
1747         my $self = shift;
1748         my( $r1, $r2 ) = $self->_objectify_args( @_ );
1749         return $self->_common_in_path( $r1, $r2, 'predecessors' );
1750 }
1751
1752 sub common_successor {
1753         my $self = shift;
1754         my( $r1, $r2 ) = $self->_objectify_args( @_ );
1755         return $self->_common_in_path( $r1, $r2, 'successors' );
1756 }
1757
1758
1759 # TODO think about how to do this without ranks...
1760 sub _common_in_path {
1761         my( $self, $r1, $r2, $dir ) = @_;
1762         my $iter = $self->end->rank;
1763         my @candidates;
1764         my @last_r1 = ( $r1 );
1765         my @last_r2 = ( $r2 );
1766         # my %all_seen = ( $r1 => 'r1', $r2 => 'r2' );
1767         my %all_seen;
1768         # print STDERR "Finding common $dir for $r1, $r2\n";
1769         while( !@candidates ) {
1770                 last unless $iter--;  # Avoid looping infinitely
1771                 # Iterate separately down the graph from r1 and r2
1772                 my( @new_lc1, @new_lc2 );
1773                 foreach my $lc ( @last_r1 ) {
1774                         foreach my $p ( $lc->$dir ) {
1775                                 if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r1' ) {
1776                                         # print STDERR "Path candidate $p from $lc\n";
1777                                         push( @candidates, $p );
1778                                 } elsif( !$all_seen{$p->id} ) {
1779                                         $all_seen{$p->id} = 'r1';
1780                                         push( @new_lc1, $p );
1781                                 }
1782                         }
1783                 }
1784                 foreach my $lc ( @last_r2 ) {
1785                         foreach my $p ( $lc->$dir ) {
1786                                 if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r2' ) {
1787                                         # print STDERR "Path candidate $p from $lc\n";
1788                                         push( @candidates, $p );
1789                                 } elsif( !$all_seen{$p->id} ) {
1790                                         $all_seen{$p->id} = 'r2';
1791                                         push( @new_lc2, $p );
1792                                 }
1793                         }
1794                 }
1795                 @last_r1 = @new_lc1;
1796                 @last_r2 = @new_lc2;
1797         }
1798         my @answer = sort { $a->rank <=> $b->rank } @candidates;
1799         return $dir eq 'predecessors' ? pop( @answer ) : shift ( @answer );
1800 }
1801
1802 sub throw {
1803         Text::Tradition::Error->throw(
1804                 'ident' => 'Collation error',
1805                 'message' => $_[0],
1806                 );
1807 }
1808
1809 no Moose;
1810 __PACKAGE__->meta->make_immutable;
1811
1812 =head1 LICENSE
1813
1814 This package is free software and is provided "as is" without express
1815 or implied warranty.  You can redistribute it and/or modify it under
1816 the same terms as Perl itself.
1817
1818 =head1 AUTHOR
1819
1820 Tara L Andrews E<lt>aurum@cpan.orgE<gt>