lib/Text/Tradition/Collation.pm

   1 package Text::Tradition::Collation;
   2
   3 use Encode qw( decode_utf8 );
   4 use File::Temp;
   5 use File::Which;
   6 use Graph;
   7 use IPC::Run qw( run binary );
   8 use Text::CSV;
   9 use Text::Tradition::Collation::Reading;
  10 use Text::Tradition::Collation::RelationshipStore;
  11 use Text::Tradition::Error;
  12 use XML::Easy::Syntax qw( $xml10_namestartchar_rx $xml10_namechar_rx );
  13 use XML::LibXML;
  14 use XML::LibXML::XPathContext;
  15 use Moose;
  16
  17 has 'sequence' => (
  18     is => 'ro',
  19     isa => 'Graph',
  20     default => sub { Graph->new() },
  21     handles => {
  22         paths => 'edges',
  23     },
  24     );
  25
  26 has 'relations' => (
  27         is => 'ro',
  28         isa => 'Text::Tradition::Collation::RelationshipStore',
  29         handles => {
  30                 relationships => 'relationships',
  31                 related_readings => 'related_readings',
  32                 get_relationship => 'get_relationship',
  33                 del_relationship => 'del_relationship',
  34                 equivalence => 'equivalence',
  35                 equivalence_graph => 'equivalence_graph',
  36         },
  37         writer => '_set_relations',
  38         );
  39
  40 has 'tradition' => (
  41     is => 'ro',
  42     isa => 'Text::Tradition',
  43     writer => '_set_tradition',
  44     weak_ref => 1,
  45     );
  46
  47 has 'readings' => (
  48         isa => 'HashRef[Text::Tradition::Collation::Reading]',
  49         traits => ['Hash'],
  50     handles => {
  51         reading     => 'get',
  52         _add_reading => 'set',
  53         del_reading => 'delete',
  54         has_reading => 'exists',
  55         readings   => 'values',
  56     },
  57     default => sub { {} },
  58         );
  59
  60 has 'wit_list_separator' => (
  61     is => 'rw',
  62     isa => 'Str',
  63     default => ', ',
  64     );
  65
  66 has 'baselabel' => (
  67     is => 'rw',
  68     isa => 'Str',
  69     default => 'base text',
  70     );
  71
  72 has 'linear' => (
  73     is => 'rw',
  74     isa => 'Bool',
  75     default => 1,
  76     );
  77
  78 has 'ac_label' => (
  79     is => 'rw',
  80     isa => 'Str',
  81     default => ' (a.c.)',
  82     );
  83
  84 has 'wordsep' => (
  85         is => 'rw',
  86         isa => 'Str',
  87         default => ' ',
  88         );
  89
  90 has 'start' => (
  91         is => 'ro',
  92         isa => 'Text::Tradition::Collation::Reading',
  93         writer => '_set_start',
  94         weak_ref => 1,
  95         );
  96
  97 has 'end' => (
  98         is => 'ro',
  99         isa => 'Text::Tradition::Collation::Reading',
 100         writer => '_set_end',
 101         weak_ref => 1,
 102         );
 103
 104 has 'cached_svg' => (
 105         is => 'rw',
 106         isa => 'Str',
 107         predicate => 'has_cached_svg',
 108         clearer => 'wipe_svg',
 109         );
 110
 111 has 'cached_table' => (
 112         is => 'rw',
 113         isa => 'HashRef',
 114         predicate => 'has_cached_table',
 115         clearer => 'wipe_table',
 116         );
 117
 118 has '_graphcalc_done' => (
 119         is => 'rw',
 120         isa => 'Bool',
 121         default => undef,
 122         );
 123
 124 =head1 NAME
 125
 126 Text::Tradition::Collation - a software model for a text collation
 127
 128 =head1 SYNOPSIS
 129
 130   use Text::Tradition;
 131   my $t = Text::Tradition->new(
 132     'name' => 'this is a text',
 133     'input' => 'TEI',
 134     'file' => '/path/to/tei_parallel_seg_file.xml' );
 135
 136   my $c = $t->collation;
 137   my @readings = $c->readings;
 138   my @paths = $c->paths;
 139   my @relationships = $c->relationships;
 140
 141   my $svg_variant_graph = $t->collation->as_svg();
 142
 143 =head1 DESCRIPTION
 144
 145 Text::Tradition is a library for representation and analysis of collated
 146 texts, particularly medieval ones.  The Collation is the central feature of
 147 a Tradition, where the text, its sequence of readings, and its relationships
 148 between readings are actually kept.
 149
 150 =head1 CONSTRUCTOR
 151
 152 =head2 new
 153
 154 The constructor.  Takes a hash or hashref of the following arguments:
 155
 156 =over
 157
 158 =item * tradition - The Text::Tradition object to which the collation
 159 belongs. Required.
 160
 161 =item * linear - Whether the collation should be linear; that is, whether
 162 transposed readings should be treated as two linked readings rather than one,
 163 and therefore whether the collation graph is acyclic.  Defaults to true.
 164
 165 =item * baselabel - The default label for the path taken by a base text
 166 (if any). Defaults to 'base text'.
 167
 168 =item * wit_list_separator - The string to join a list of witnesses for
 169 purposes of making labels in display graphs.  Defaults to ', '.
 170
 171 =item * ac_label - The extra label to tack onto a witness sigil when
 172 representing another layer of path for the given witness - that is, when
 173 a text has more than one possible reading due to scribal corrections or
 174 the like.  Defaults to ' (a.c.)'.
 175
 176 =item * wordsep - The string used to separate words in the original text.
 177 Defaults to ' '.
 178
 179 =back
 180
 181 =head1 ACCESSORS
 182
 183 =head2 tradition
 184
 185 =head2 linear
 186
 187 =head2 wit_list_separator
 188
 189 =head2 baselabel
 190
 191 =head2 ac_label
 192
 193 =head2 wordsep
 194
 195 Simple accessors for collation attributes.
 196
 197 =head2 start
 198
 199 The meta-reading at the start of every witness path.
 200
 201 =head2 end
 202
 203 The meta-reading at the end of every witness path.
 204
 205 =head2 readings
 206
 207 Returns all Reading objects in the graph.
 208
 209 =head2 reading( $id )
 210
 211 Returns the Reading object corresponding to the given ID.
 212
 213 =head2 add_reading( $reading_args )
 214
 215 Adds a new reading object to the collation.
 216 See L<Text::Tradition::Collation::Reading> for the available arguments.
 217
 218 =head2 del_reading( $object_or_id )
 219
 220 Removes the given reading from the collation, implicitly removing its
 221 paths and relationships.
 222
 223 =head2 merge_readings( $main, $second, $concatenate, $with_str )
 224
 225 Merges the $second reading into the $main one. If $concatenate is true, then
 226 the merged node will carry the text of both readings, concatenated with either
 227 $with_str (if specified) or a sensible default (the empty string if the
 228 appropriate 'join_*' flag is set on either reading, or else $self->wordsep.)
 229
 230 The first two arguments may be either readings or reading IDs.
 231
 232 =head2 has_reading( $id )
 233
 234 Predicate to see whether a given reading ID is in the graph.
 235
 236 =head2 reading_witnesses( $object_or_id )
 237
 238 Returns a list of sigils whose witnesses contain the reading.
 239
 240 =head2 paths
 241
 242 Returns all reading paths within the document - that is, all edges in the
 243 collation graph.  Each path is an arrayref of [ $source, $target ] reading IDs.
 244
 245 =head2 add_path( $source, $target, $sigil )
 246
 247 Links the given readings in the collation in sequence, under the given witness
 248 sigil.  The readings may be specified by object or ID.
 249
 250 =head2 del_path( $source, $target, $sigil )
 251
 252 Links the given readings in the collation in sequence, under the given witness
 253 sigil.  The readings may be specified by object or ID.
 254
 255 =head2 has_path( $source, $target );
 256
 257 Returns true if the two readings are linked in sequence in any witness.
 258 The readings may be specified by object or ID.
 259
 260 =head2 relationships
 261
 262 Returns all Relationship objects in the collation.
 263
 264 =head2 add_relationship( $reading, $other_reading, $options )
 265
 266 Adds a new relationship of the type given in $options between the two readings,
 267 which may be specified by object or ID.  Returns a value of ( $status, @vectors)
 268 where $status is true on success, and @vectors is a list of relationship edges
 269 that were ultimately added.
 270 See L<Text::Tradition::Collation::Relationship> for the available options.
 271
 272 =cut
 273
 274 sub BUILD {
 275     my $self = shift;
 276     $self->_set_relations( Text::Tradition::Collation::RelationshipStore->new( 'collation' => $self ) );
 277     $self->_set_start( $self->add_reading(
 278         { 'collation' => $self, 'is_start' => 1, 'init' => 1 } ) );
 279     $self->_set_end( $self->add_reading(
 280         { 'collation' => $self, 'is_end' => 1, 'init' => 1 } ) );
 281 }
 282
 283 ### Reading construct/destruct functions
 284
 285 sub add_reading {
 286         my( $self, $reading ) = @_;
 287         unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
 288                 my %args = %$reading;
 289                 if( $args{'init'} ) {
 290                         # If we are initializing an empty collation, don't assume that we
 291                         # have set a tradition.
 292                         delete $args{'init'};
 293                 } elsif( $self->tradition->has_language && !exists $args{'language'} ) {
 294                         $args{'language'} = $self->tradition->language;
 295                 }
 296                 $reading = Text::Tradition::Collation::Reading->new(
 297                         'collation' => $self,
 298                         %args );
 299         }
 300         # First check to see if a reading with this ID exists.
 301         if( $self->reading( $reading->id ) ) {
 302                 throw( "Collation already has a reading with id " . $reading->id );
 303         }
 304         $self->_graphcalc_done(0);
 305         $self->_add_reading( $reading->id => $reading );
 306         # Once the reading has been added, put it in both graphs.
 307         $self->sequence->add_vertex( $reading->id );
 308         $self->relations->add_reading( $reading->id );
 309         return $reading;
 310 };
 311
 312 around del_reading => sub {
 313         my $orig = shift;
 314         my $self = shift;
 315         my $arg = shift;
 316
 317         if( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
 318                 $arg = $arg->id;
 319         }
 320         # Remove the reading from the graphs.
 321         $self->_graphcalc_done(0);
 322         $self->_clear_cache; # Explicitly clear caches to GC the reading
 323         $self->sequence->delete_vertex( $arg );
 324         $self->relations->delete_reading( $arg );
 325
 326         # Carry on.
 327         $self->$orig( $arg );
 328 };
 329
 330 =begin testing
 331
 332 use Text::Tradition;
 333
 334 my $cxfile = 't/data/Collatex-16.xml';
 335 my $t = Text::Tradition->new(
 336     'name'  => 'inline',
 337     'input' => 'CollateX',
 338     'file'  => $cxfile,
 339     );
 340 my $c = $t->collation;
 341
 342 my $rno = scalar $c->readings;
 343 # Split n21 for testing purposes
 344 my $new_r = $c->add_reading( { 'id' => 'n21p0', 'text' => 'un', 'join_next' => 1 } );
 345 my $old_r = $c->reading( 'n21' );
 346 $old_r->alter_text( 'to' );
 347 $c->del_path( 'n20', 'n21', 'A' );
 348 $c->add_path( 'n20', 'n21p0', 'A' );
 349 $c->add_path( 'n21p0', 'n21', 'A' );
 350 $c->flatten_ranks();
 351 ok( $c->reading( 'n21p0' ), "New reading exists" );
 352 is( scalar $c->readings, $rno, "Reading add offset by flatten_ranks" );
 353
 354 # Combine n3 and n4 ( with his )
 355 $c->merge_readings( 'n3', 'n4', 1 );
 356 ok( !$c->reading('n4'), "Reading n4 is gone" );
 357 is( $c->reading('n3')->text, 'with his', "Reading n3 has both words" );
 358
 359 # Collapse n9 and n10 ( rood / root )
 360 $c->merge_readings( 'n9', 'n10' );
 361 ok( !$c->reading('n10'), "Reading n10 is gone" );
 362 is( $c->reading('n9')->text, 'rood', "Reading n9 has an unchanged word" );
 363
 364 # Combine n21 and n21p0
 365 my $remaining = $c->reading('n21');
 366 $remaining ||= $c->reading('n22');  # one of these should still exist
 367 $c->merge_readings( 'n21p0', $remaining, 1 );
 368 ok( !$c->reading('n21'), "Reading $remaining is gone" );
 369 is( $c->reading('n21p0')->text, 'unto', "Reading n21p0 merged correctly" );
 370
 371 =end testing
 372
 373 =cut
 374
 375 sub merge_readings {
 376         my $self = shift;
 377
 378         # Sanity check
 379         my( $kept_obj, $del_obj, $combine, $combine_char ) = $self->_objectify_args( @_ );
 380         my $mergemeta = $kept_obj->is_meta;
 381         throw( "Cannot merge meta and non-meta reading" )
 382                 unless ( $mergemeta && $del_obj->is_meta )
 383                         || ( !$mergemeta && !$del_obj->is_meta );
 384         if( $mergemeta ) {
 385                 throw( "Cannot merge with start or end node" )
 386                         if( $kept_obj eq $self->start || $kept_obj eq $self->end
 387                                 || $del_obj eq $self->start || $del_obj eq $self->end );
 388         }
 389         # We only need the IDs for adding paths to the graph, not the reading
 390         # objects themselves.
 391         my $kept = $kept_obj->id;
 392         my $deleted = $del_obj->id;
 393         $self->_graphcalc_done(0);
 394
 395     # The kept reading should inherit the paths and the relationships
 396     # of the deleted reading.
 397         foreach my $path ( $self->sequence->edges_at( $deleted ) ) {
 398                 my @vector = ( $kept );
 399                 push( @vector, $path->[1] ) if $path->[0] eq $deleted;
 400                 unshift( @vector, $path->[0] ) if $path->[1] eq $deleted;
 401                 next if $vector[0] eq $vector[1]; # Don't add a self loop
 402                 my %wits = %{$self->sequence->get_edge_attributes( @$path )};
 403                 $self->sequence->add_edge( @vector );
 404                 my $fwits = $self->sequence->get_edge_attributes( @vector );
 405                 @wits{keys %$fwits} = values %$fwits;
 406                 $self->sequence->set_edge_attributes( @vector, \%wits );
 407         }
 408         $self->relations->merge_readings( $kept, $deleted, $combine );
 409
 410         # Do the deletion deed.
 411         if( $combine ) {
 412                 my $joinstr = $combine_char;
 413                 unless( defined $joinstr ) {
 414                         $joinstr = '' if $kept_obj->join_next || $del_obj->join_prior;
 415                         $joinstr = $self->wordsep unless defined $joinstr;
 416                 }
 417                 $kept_obj->alter_text( join( $joinstr, $kept_obj->text, $del_obj->text ) );
 418         }
 419         $self->del_reading( $deleted );
 420 }
 421
 422
 423 # Helper function for manipulating the graph.
 424 sub _stringify_args {
 425         my( $self, $first, $second, @args ) = @_;
 426     $first = $first->id
 427         if ref( $first ) eq 'Text::Tradition::Collation::Reading';
 428     $second = $second->id
 429         if ref( $second ) eq 'Text::Tradition::Collation::Reading';
 430     return( $first, $second, @args );
 431 }
 432
 433 # Helper function for manipulating the graph.
 434 sub _objectify_args {
 435         my( $self, $first, $second, $arg ) = @_;
 436     $first = $self->reading( $first )
 437         unless ref( $first ) eq 'Text::Tradition::Collation::Reading';
 438     $second = $self->reading( $second )
 439         unless ref( $second ) eq 'Text::Tradition::Collation::Reading';
 440     return( $first, $second, $arg );
 441 }
 442 ### Path logic
 443
 444 sub add_path {
 445         my $self = shift;
 446
 447         # We only need the IDs for adding paths to the graph, not the reading
 448         # objects themselves.
 449     my( $source, $target, $wit ) = $self->_stringify_args( @_ );
 450
 451         $self->_graphcalc_done(0);
 452         # Connect the readings
 453         unless( $self->sequence->has_edge( $source, $target ) ) {
 454             $self->sequence->add_edge( $source, $target );
 455             $self->relations->add_equivalence_edge( $source, $target );
 456         }
 457     # Note the witness in question
 458     $self->sequence->set_edge_attribute( $source, $target, $wit, 1 );
 459 }
 460
 461 sub del_path {
 462         my $self = shift;
 463         my @args;
 464         if( ref( $_[0] ) eq 'ARRAY' ) {
 465                 my $e = shift @_;
 466                 @args = ( @$e, @_ );
 467         } else {
 468                 @args = @_;
 469         }
 470
 471         # We only need the IDs for adding paths to the graph, not the reading
 472         # objects themselves.
 473     my( $source, $target, $wit ) = $self->_stringify_args( @args );
 474
 475         $self->_graphcalc_done(0);
 476         if( $self->sequence->has_edge_attribute( $source, $target, $wit ) ) {
 477                 $self->sequence->delete_edge_attribute( $source, $target, $wit );
 478         }
 479         unless( keys %{$self->sequence->get_edge_attributes( $source, $target )} ) {
 480                 $self->sequence->delete_edge( $source, $target );
 481                 $self->relations->delete_equivalence_edge( $source, $target );
 482         }
 483 }
 484
 485
 486 # Extra graph-alike utility
 487 sub has_path {
 488         my $self = shift;
 489     my( $source, $target, $wit ) = $self->_stringify_args( @_ );
 490         return undef unless $self->sequence->has_edge( $source, $target );
 491         return $self->sequence->has_edge_attribute( $source, $target, $wit );
 492 }
 493
 494 =head2 clear_witness( @sigil_list )
 495
 496 Clear the given witnesses out of the collation entirely, removing references
 497 to them in paths, and removing readings that belong only to them.  Should only
 498 be called via $tradition->del_witness.
 499
 500 =cut
 501
 502 sub clear_witness {
 503         my( $self, @sigils ) = @_;
 504
 505         $self->_graphcalc_done(0);
 506         # Clear the witness(es) out of the paths
 507         foreach my $e ( $self->paths ) {
 508                 foreach my $sig ( @sigils ) {
 509                         $self->del_path( $e, $sig );
 510                 }
 511         }
 512
 513         # Clear out the newly unused readings
 514         foreach my $r ( $self->readings ) {
 515                 unless( $self->reading_witnesses( $r ) ) {
 516                         $self->del_reading( $r );
 517                 }
 518         }
 519 }
 520
 521 sub add_relationship {
 522         my $self = shift;
 523     my( $source, $target, $opts ) = $self->_stringify_args( @_ );
 524     my( @vectors ) = $self->relations->add_relationship( $source, $target, $opts );
 525         $self->_graphcalc_done(0);
 526     return @vectors;
 527 }
 528
 529 around qw/ get_relationship del_relationship / => sub {
 530         my $orig = shift;
 531         my $self = shift;
 532         my @args = @_;
 533         if( @args == 1 && ref( $args[0] ) eq 'ARRAY' ) {
 534                 @args = @{$_[0]};
 535         }
 536         my( $source, $target ) = $self->_stringify_args( @args );
 537         $self->$orig( $source, $target );
 538 };
 539
 540 =head2 reading_witnesses( $reading )
 541
 542 Return a list of sigils corresponding to the witnesses in which the reading appears.
 543
 544 =cut
 545
 546 sub reading_witnesses {
 547         my( $self, $reading ) = @_;
 548         # We need only check either the incoming or the outgoing edges; I have
 549         # arbitrarily chosen "incoming".  Thus, special-case the start node.
 550         if( $reading eq $self->start ) {
 551                 return map { $_->sigil } $self->tradition->witnesses;
 552         }
 553         my %all_witnesses;
 554         foreach my $e ( $self->sequence->edges_to( $reading ) ) {
 555                 my $wits = $self->sequence->get_edge_attributes( @$e );
 556                 @all_witnesses{ keys %$wits } = 1;
 557         }
 558         my $acstr = $self->ac_label;
 559         foreach my $acwit ( grep { $_ =~ s/^(.*)\Q$acstr\E$/$1/ } keys %all_witnesses ) {
 560                 delete $all_witnesses{$acwit.$acstr} if exists $all_witnesses{$acwit};
 561         }
 562         return keys %all_witnesses;
 563 }
 564
 565 =head1 OUTPUT METHODS
 566
 567 =head2 as_svg( \%options )
 568
 569 Returns an SVG string that represents the graph, via as_dot and graphviz.
 570 See as_dot for a list of options.  Must have GraphViz (dot) installed to run.
 571
 572 =cut
 573
 574 sub as_svg {
 575     my( $self, $opts ) = @_;
 576     throw( "Need GraphViz installed to output SVG" )
 577         unless File::Which::which( 'dot' );
 578     my $want_subgraph = exists $opts->{'from'} || exists $opts->{'to'};
 579     $self->calculate_ranks()
 580         unless( $self->_graphcalc_done || $opts->{'nocalc'} || !$self->linear );
 581     if( !$self->has_cached_svg || $opts->{'recalc'}     || $want_subgraph ) {
 582                 my @cmd = qw/dot -Tsvg/;
 583                 my( $svg, $err );
 584                 my $dotfile = File::Temp->new();
 585                 ## USE FOR DEBUGGING
 586                 # $dotfile->unlink_on_destroy(0);
 587                 binmode $dotfile, ':utf8';
 588                 print $dotfile $self->as_dot( $opts );
 589                 push( @cmd, $dotfile->filename );
 590                 run( \@cmd, ">", binary(), \$svg );
 591                 $svg = decode_utf8( $svg );
 592                 $self->cached_svg( $svg ) unless $want_subgraph;
 593                 return $svg;
 594         } else {
 595                 return $self->cached_svg;
 596         }
 597 }
 598
 599
 600 =head2 as_dot( \%options )
 601
 602 Returns a string that is the collation graph expressed in dot
 603 (i.e. GraphViz) format.  Options include:
 604
 605 =over 4
 606
 607 =item * from
 608
 609 =item * to
 610
 611 =item * color_common
 612
 613 =back
 614
 615 =cut
 616
 617 sub as_dot {
 618     my( $self, $opts ) = @_;
 619     my $startrank = $opts->{'from'} if $opts;
 620     my $endrank = $opts->{'to'} if $opts;
 621     my $color_common = $opts->{'color_common'} if $opts;
 622     my $STRAIGHTENHACK = !$startrank && !$endrank && $self->end->rank
 623        && $self->end->rank > 100;
 624     $STRAIGHTENHACK = 1 if $opts->{'straight'}; # even for subgraphs or small graphs
 625
 626     # Check the arguments
 627     if( $startrank ) {
 628         return if $endrank && $startrank > $endrank;
 629         return if $startrank > $self->end->rank;
 630         }
 631         if( defined $endrank ) {
 632                 return if $endrank < 0;
 633                 $endrank = undef if $endrank == $self->end->rank;
 634         }
 635
 636     my $graph_name = $self->tradition->name;
 637     $graph_name =~ s/[^\w\s]//g;
 638     $graph_name = join( '_', split( /\s+/, $graph_name ) );
 639
 640     my %graph_attrs = (
 641         'rankdir' => 'LR',
 642         'bgcolor' => 'none',
 643         );
 644     my %node_attrs = (
 645         'fontsize' => 14,
 646         'fillcolor' => 'white',
 647         'style' => 'filled',
 648         'shape' => 'ellipse'
 649         );
 650     my %edge_attrs = (
 651         'arrowhead' => 'open',
 652         'color' => '#000000',
 653         'fontcolor' => '#000000',
 654         );
 655
 656     my $dot = sprintf( "digraph %s {\n", $graph_name );
 657     $dot .= "\tgraph " . _dot_attr_string( \%graph_attrs ) . ";\n";
 658     $dot .= "\tnode " . _dot_attr_string( \%node_attrs ) . ";\n";
 659
 660         # Output substitute start/end readings if necessary
 661         if( $startrank ) {
 662                 $dot .= "\t\"__SUBSTART__\" [ label=\"...\",id=\"__START__\" ];\n";
 663         }
 664         if( $endrank ) {
 665                 $dot .= "\t\"__SUBEND__\" [ label=\"...\",id=\"__END__\" ];\n";
 666         }
 667         if( $STRAIGHTENHACK ) {
 668                 ## HACK part 1
 669                 my $startlabel = $startrank ? '__SUBSTART__' : '__START__';
 670                 $dot .= "\tsubgraph { rank=same \"$startlabel\" \"#SILENT#\" }\n";
 671                 $dot .= "\t\"#SILENT#\" [ shape=diamond,color=white,penwidth=0,label=\"\" ];"
 672         }
 673         my %used;  # Keep track of the readings that actually appear in the graph
 674         # Sort the readings by rank if we have ranks; this speeds layout.
 675         my @all_readings = $self->end->has_rank
 676                 ? sort { $a->rank <=> $b->rank } $self->readings
 677                 : $self->readings;
 678         # TODO Refrain from outputting lacuna nodes - just grey out the edges.
 679     foreach my $reading ( @all_readings ) {
 680         # Only output readings within our rank range.
 681         next if $startrank && $reading->rank < $startrank;
 682         next if $endrank && $reading->rank > $endrank;
 683         $used{$reading->id} = 1;
 684         # Need not output nodes without separate labels
 685         next if $reading->id eq $reading->text;
 686         my $rattrs;
 687         my $label = $reading->text;
 688         $label .= '-' if $reading->join_next;
 689         $label = "-$label" if $reading->join_prior;
 690         $label =~ s/\"/\\\"/g;
 691                 $rattrs->{'label'} = $label;
 692                 $rattrs->{'id'} = $reading->id;
 693                 $rattrs->{'fillcolor'} = '#b3f36d' if $reading->is_common && $color_common;
 694         $dot .= sprintf( "\t\"%s\" %s;\n", $reading->id, _dot_attr_string( $rattrs ) );
 695     }
 696
 697         # Add the real edges. Need to weight one edge per rank jump, in a
 698         # continuous line.
 699         # my $weighted = $self->_add_edge_weights;
 700     my @edges = $self->paths;
 701         my( %substart, %subend );
 702     foreach my $edge ( @edges ) {
 703         # Do we need to output this edge?
 704         if( $used{$edge->[0]} && $used{$edge->[1]} ) {
 705                 my $label = $self->_path_display_label( $self->path_witnesses( $edge ) );
 706                         my $variables = { %edge_attrs, 'label' => $label };
 707
 708                         # Account for the rank gap if necessary
 709                         my $rank0 = $self->reading( $edge->[0] )->rank
 710                                 if $self->reading( $edge->[0] )->has_rank;
 711                         my $rank1 = $self->reading( $edge->[1] )->rank
 712                                 if $self->reading( $edge->[1] )->has_rank;
 713                         if( defined $rank0 && defined $rank1 && $rank1 - $rank0 > 1 ) {
 714                                 $variables->{'minlen'} = $rank1 - $rank0;
 715                         }
 716
 717                         # Add the calculated edge weights
 718                         # if( exists $weighted->{$edge->[0]}
 719                         #       && $weighted->{$edge->[0]} eq $edge->[1] ) {
 720                         #       # $variables->{'color'} = 'red';
 721                         #       $variables->{'weight'} = 3.0;
 722                         # }
 723
 724                         # EXPERIMENTAL: make edge width reflect no. of witnesses
 725                         my $extrawidth = scalar( $self->path_witnesses( $edge ) ) * 0.2;
 726                         $variables->{'penwidth'} = $extrawidth + 0.8; # gives 1 for a single wit
 727
 728                         my $varopts = _dot_attr_string( $variables );
 729                         $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
 730                                 $edge->[0], $edge->[1], $varopts );
 731         } elsif( $used{$edge->[0]} ) {
 732                 $subend{$edge->[0]} = $edge->[1];
 733         } elsif( $used{$edge->[1]} ) {
 734                 $substart{$edge->[1]} = $edge->[0];
 735         }
 736     }
 737     # Add substitute start and end edges if necessary
 738     foreach my $node ( keys %substart ) {
 739         my $witstr = $self->_path_display_label ( $self->path_witnesses( $substart{$node}, $node ) );
 740         my $variables = { %edge_attrs, 'label' => $witstr };
 741         my $nrdg = $self->reading( $node );
 742         if( $nrdg->has_rank && $nrdg->rank > $startrank ) {
 743                 # Substart is actually one lower than $startrank
 744                 $variables->{'minlen'} = $nrdg->rank - ( $startrank - 1 );
 745         }
 746         my $varopts = _dot_attr_string( $variables );
 747         $dot .= "\t\"__SUBSTART__\" -> \"$node\" $varopts;\n";
 748         }
 749     foreach my $node ( keys %subend ) {
 750         my $witstr = $self->_path_display_label ( $self->path_witnesses( $node, $subend{$node} ) );
 751         my $variables = { %edge_attrs, 'label' => $witstr };
 752         my $varopts = _dot_attr_string( $variables );
 753         $dot .= "\t\"$node\" -> \"__SUBEND__\" $varopts;\n";
 754         }
 755         # HACK part 2
 756         if( $STRAIGHTENHACK ) {
 757                 my $endlabel = $endrank ? '__SUBEND__' : '__END__';
 758                 $dot .= "\t\"$endlabel\" -> \"#SILENT#\" [ color=white,penwidth=0 ];\n";
 759         }
 760
 761     $dot .= "}\n";
 762     return $dot;
 763 }
 764
 765 sub _dot_attr_string {
 766         my( $hash ) = @_;
 767         my @attrs;
 768         foreach my $k ( sort keys %$hash ) {
 769                 my $v = $hash->{$k};
 770                 push( @attrs, $k.'="'.$v.'"' );
 771         }
 772         return( '[ ' . join( ', ', @attrs ) . ' ]' );
 773 }
 774
 775 sub _add_edge_weights {
 776         my $self = shift;
 777         # Walk the graph from START to END, choosing the successor node with
 778         # the largest number of witness paths each time.
 779         my $weighted = {};
 780         my $curr = $self->start->id;
 781         my $ranked = $self->end->has_rank;
 782         while( $curr ne $self->end->id ) {
 783                 my $rank = $ranked ? $self->reading( $curr )->rank : 0;
 784                 my @succ = sort { $self->path_witnesses( $curr, $a )
 785                                                         <=> $self->path_witnesses( $curr, $b ) }
 786                         $self->sequence->successors( $curr );
 787                 my $next = pop @succ;
 788                 my $nextrank = $ranked ? $self->reading( $next )->rank : 0;
 789                 # Try to avoid lacunae in the weighted path.
 790                 while( @succ &&
 791                            ( $self->reading( $next )->is_lacuna ||
 792                                  $nextrank - $rank > 1 ) ){
 793                         $next = pop @succ;
 794                 }
 795                 $weighted->{$curr} = $next;
 796                 $curr = $next;
 797         }
 798         return $weighted;
 799 }
 800
 801 =head2 path_witnesses( $edge )
 802
 803 Returns the list of sigils whose witnesses are associated with the given edge.
 804 The edge can be passed as either an array or an arrayref of ( $source, $target ).
 805
 806 =cut
 807
 808 sub path_witnesses {
 809         my( $self, @edge ) = @_;
 810         # If edge is an arrayref, cope.
 811         if( @edge == 1 && ref( $edge[0] ) eq 'ARRAY' ) {
 812                 my $e = shift @edge;
 813                 @edge = @$e;
 814         }
 815         my @wits = keys %{$self->sequence->get_edge_attributes( @edge )};
 816         return @wits;
 817 }
 818
 819 # Helper function. Make a display label for the given witnesses, showing a.c.
 820 # witnesses only where the main witness is not also in the list.
 821 sub _path_display_label {
 822         my $self = shift;
 823         my %wits;
 824         map { $wits{$_} = 1 } @_;
 825
 826         # If an a.c. wit is listed, remove it if the main wit is also listed.
 827         # Otherwise keep it for explicit listing.
 828         my $aclabel = $self->ac_label;
 829         my @disp_ac;
 830         foreach my $w ( sort keys %wits ) {
 831                 if( $w =~ /^(.*)\Q$aclabel\E$/ ) {
 832                         if( exists $wits{$1} ) {
 833                                 delete $wits{$w};
 834                         } else {
 835                                 push( @disp_ac, $w );
 836                         }
 837                 }
 838         }
 839
 840         # See if we are in a majority situation.
 841         my $maj = scalar( $self->tradition->witnesses ) * 0.6;
 842         $maj = $maj > 5 ? $maj : 5;
 843         if( scalar keys %wits > $maj ) {
 844                 unshift( @disp_ac, 'majority' );
 845                 return join( ', ', @disp_ac );
 846         } else {
 847                 return join( ', ', sort keys %wits );
 848         }
 849 }
 850
 851 =head2 readings_at_rank( $rank )
 852
 853 Returns a list of readings at a given rank, taken from the alignment table.
 854
 855 =cut
 856
 857 sub readings_at_rank {
 858         my( $self, $rank ) = @_;
 859         my $table = $self->alignment_table;
 860         # Table rank is real rank - 1.
 861         my @elements = map { $_->{'tokens'}->[$rank-1] } @{$table->{'alignment'}};
 862         my %readings;
 863         foreach my $e ( @elements ) {
 864                 next unless ref( $e ) eq 'HASH';
 865                 next unless exists $e->{'t'};
 866                 $readings{$e->{'t'}->id} = $e->{'t'};
 867         }
 868         return values %readings;
 869 }
 870
 871 =head2 as_graphml
 872
 873 Returns a GraphML representation of the collation.  The GraphML will contain
 874 two graphs. The first expresses the attributes of the readings and the witness
 875 paths that link them; the second expresses the relationships that link the
 876 readings.  This is the native transfer format for a tradition.
 877
 878 =begin testing
 879
 880 use Text::Tradition;
 881
 882 my $READINGS = 311;
 883 my $PATHS = 361;
 884
 885 my $datafile = 't/data/florilegium_tei_ps.xml';
 886 my $tradition = Text::Tradition->new( 'input' => 'TEI',
 887                                       'name' => 'test0',
 888                                       'file' => $datafile,
 889                                       'linear' => 1 );
 890
 891 ok( $tradition, "Got a tradition object" );
 892 is( scalar $tradition->witnesses, 13, "Found all witnesses" );
 893 ok( $tradition->collation, "Tradition has a collation" );
 894
 895 my $c = $tradition->collation;
 896 is( scalar $c->readings, $READINGS, "Collation has all readings" );
 897 is( scalar $c->paths, $PATHS, "Collation has all paths" );
 898 is( scalar $c->relationships, 0, "Collation has all relationships" );
 899
 900 # Add a few relationships
 901 $c->add_relationship( 'w123', 'w125', { 'type' => 'collated' } );
 902 $c->add_relationship( 'w193', 'w196', { 'type' => 'collated' } );
 903 $c->add_relationship( 'w257', 'w262', { 'type' => 'transposition' } );
 904
 905 # Now write it to GraphML and parse it again.
 906
 907 my $graphml = $c->as_graphml;
 908 my $st = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml );
 909 is( scalar $st->collation->readings, $READINGS, "Reparsed collation has all readings" );
 910 is( scalar $st->collation->paths, $PATHS, "Reparsed collation has all paths" );
 911 is( scalar $st->collation->relationships, 3, "Reparsed collation has new relationships" );
 912
 913 # Now add a stemma, write to GraphML, and parse again.
 914 my $stemma = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
 915 is( ref( $stemma ), 'Text::Tradition::Stemma', "Parsed dotfile into stemma" );
 916 is( $tradition->stemmata, 1, "Tradition now has the stemma" );
 917 $graphml = $c->as_graphml;
 918 like( $graphml, qr/digraph/, "Digraph declaration exists in GraphML" );
 919
 920 =end testing
 921
 922 =cut
 923
 924 sub as_graphml {
 925     my( $self, $options ) = @_;
 926         $self->calculate_ranks unless $self->_graphcalc_done;
 927
 928         my $start = $options->{'from'}
 929                 ? $self->reading( $options->{'from'} ) : $self->start;
 930         my $end = $options->{'to'}
 931                 ? $self->reading( $options->{'to'} ) : $self->end;
 932         if( $start->has_rank && $end->has_rank && $end->rank < $start->rank ) {
 933                 throw( 'Start node must be before end node' );
 934         }
 935         # The readings need to be ranked for this to work.
 936         $start = $self->start unless $start->has_rank;
 937         $end = $self->end unless $end->has_rank;
 938         my $rankoffset = 0;
 939         unless( $start eq $self->start ) {
 940                 $rankoffset = $start->rank - 1;
 941         }
 942         my %use_readings;
 943
 944     # Some namespaces
 945     my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns';
 946     my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance';
 947     my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' .
 948         'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
 949
 950     # Create the document and root node
 951     my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
 952     my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
 953     $graphml->setDocumentElement( $root );
 954     $root->setNamespace( $xsi_ns, 'xsi', 0 );
 955     $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
 956
 957     # List of attribute types to save on our objects and their corresponding
 958     # GraphML types
 959     my %save_types = (
 960         'Str' => 'string',
 961         'Int' => 'int',
 962         'Bool' => 'boolean',
 963         'ReadingID' => 'string',
 964         'RelationshipType' => 'string',
 965         'RelationshipScope' => 'string',
 966     );
 967
 968     # List of attribute names *not* to save on our objects.
 969     # We will also not save any attribute beginning with _.
 970     my %skipsave;
 971     map { $skipsave{$_} = 1 } qw/ cached_svg /;
 972
 973     # Add the data keys for the graph. Include an extra key 'version' for the
 974     # GraphML output version.
 975     my %graph_data_keys;
 976     my $gdi = 0;
 977     my %graph_attributes = ( 'version' => 'string' );
 978         # Graph attributes include those of Tradition and those of Collation.
 979         my %gattr_from;
 980         my $tmeta = $self->tradition->meta;
 981         my $cmeta = $self->meta;
 982         map { $gattr_from{$_->name} = 'Tradition' } $tmeta->get_all_attributes;
 983         map { $gattr_from{$_->name} = 'Collation' } $cmeta->get_all_attributes;
 984         foreach my $attr ( ( $tmeta->get_all_attributes, $cmeta->get_all_attributes ) ) {
 985                 next if $attr->name =~ /^_/;
 986                 next if $skipsave{$attr->name};
 987                 next unless $save_types{$attr->type_constraint->name};
 988                 $graph_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
 989         }
 990     # Extra custom key for the tradition stemma(ta)
 991     $graph_attributes{'stemmata'} = 'string';
 992
 993     foreach my $datum ( sort keys %graph_attributes ) {
 994         $graph_data_keys{$datum} = 'dg'.$gdi++;
 995         my $key = $root->addNewChild( $graphml_ns, 'key' );
 996         $key->setAttribute( 'attr.name', $datum );
 997         $key->setAttribute( 'attr.type', $graph_attributes{$datum} );
 998         $key->setAttribute( 'for', 'graph' );
 999         $key->setAttribute( 'id', $graph_data_keys{$datum} );
1000     }
1001
1002     # Add the data keys for reading nodes
1003     my %reading_attributes;
1004     my $rmeta = Text::Tradition::Collation::Reading->meta;
1005     foreach my $attr( $rmeta->get_all_attributes ) {
1006                 next if $attr->name =~ /^_/;
1007                 next if $skipsave{$attr->name};
1008                 next unless $save_types{$attr->type_constraint->name};
1009                 $reading_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1010         }
1011         # Extra custom key for the reading morphology
1012         $reading_attributes{'lexemes'} = 'string';
1013
1014     my %node_data_keys;
1015     my $ndi = 0;
1016     foreach my $datum ( sort keys %reading_attributes ) {
1017         $node_data_keys{$datum} = 'dn'.$ndi++;
1018         my $key = $root->addNewChild( $graphml_ns, 'key' );
1019         $key->setAttribute( 'attr.name', $datum );
1020         $key->setAttribute( 'attr.type', $reading_attributes{$datum} );
1021         $key->setAttribute( 'for', 'node' );
1022         $key->setAttribute( 'id', $node_data_keys{$datum} );
1023     }
1024
1025     # Add the data keys for edges, that is, paths and relationships. Path
1026     # data does not come from a Moose class so is here manually.
1027     my $edi = 0;
1028     my %edge_data_keys;
1029     my %edge_attributes = (
1030         witness => 'string',                    # ID/label for a path
1031         extra => 'boolean',                             # Path key
1032         );
1033     my @path_attributes = keys %edge_attributes; # track our manual additions
1034     my $pmeta = Text::Tradition::Collation::Relationship->meta;
1035     foreach my $attr( $pmeta->get_all_attributes ) {
1036                 next if $attr->name =~ /^_/;
1037                 next if $skipsave{$attr->name};
1038                 next unless $save_types{$attr->type_constraint->name};
1039                 $edge_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1040         }
1041     foreach my $datum ( sort keys %edge_attributes ) {
1042         $edge_data_keys{$datum} = 'de'.$edi++;
1043         my $key = $root->addNewChild( $graphml_ns, 'key' );
1044         $key->setAttribute( 'attr.name', $datum );
1045         $key->setAttribute( 'attr.type', $edge_attributes{$datum} );
1046         $key->setAttribute( 'for', 'edge' );
1047         $key->setAttribute( 'id', $edge_data_keys{$datum} );
1048     }
1049
1050     # Add the collation graph itself. First, sanitize the name to a valid XML ID.
1051     my $xmlidname = $self->tradition->name;
1052     $xmlidname =~ s/(?!$xml10_namechar_rx)./_/g;
1053     if( $xmlidname !~ /^$xml10_namestartchar_rx/ ) {
1054         $xmlidname = '_'.$xmlidname;
1055     }
1056     my $sgraph = $root->addNewChild( $graphml_ns, 'graph' );
1057     $sgraph->setAttribute( 'edgedefault', 'directed' );
1058     $sgraph->setAttribute( 'id', $xmlidname );
1059     $sgraph->setAttribute( 'parse.edgeids', 'canonical' );
1060     $sgraph->setAttribute( 'parse.edges', 0 ); # fill in later
1061     $sgraph->setAttribute( 'parse.nodeids', 'canonical' );
1062     $sgraph->setAttribute( 'parse.nodes', 0 ); # fill in later
1063     $sgraph->setAttribute( 'parse.order', 'nodesfirst' );
1064
1065     # Tradition/collation attribute data
1066     foreach my $datum ( keys %graph_attributes ) {
1067         my $value;
1068         if( $datum eq 'version' ) {
1069                 $value = '3.2';
1070         } elsif( $datum eq 'stemmata' ) {
1071                 my @stemstrs;
1072                 map { push( @stemstrs, $_->editable( {linesep => ''} ) ) }
1073                         $self->tradition->stemmata;
1074                 $value = join( "\n", @stemstrs );
1075         } elsif( $gattr_from{$datum} eq 'Tradition' ) {
1076                 $value = $self->tradition->$datum;
1077         } else {
1078                 $value = $self->$datum;
1079         }
1080                 _add_graphml_data( $sgraph, $graph_data_keys{$datum}, $value );
1081         }
1082
1083     my $node_ctr = 0;
1084     my %node_hash;
1085     # Add our readings to the graph
1086     foreach my $n ( sort { $a->id cmp $b->id } $self->readings ) {
1087         next if $n->has_rank && $n ne $self->start && $n ne $self->end &&
1088                 ( $n->rank < $start->rank || $n->rank > $end->rank );
1089         $use_readings{$n->id} = 1;
1090         # Add to the main graph
1091         my $node_el = $sgraph->addNewChild( $graphml_ns, 'node' );
1092         my $node_xmlid = 'n' . $node_ctr++;
1093         $node_hash{ $n->id } = $node_xmlid;
1094         $node_el->setAttribute( 'id', $node_xmlid );
1095         foreach my $d ( keys %reading_attributes ) {
1096                 my $nval = $n->$d;
1097                 # Custom serialization
1098                 if( $d eq 'lexemes' ) {
1099                                 # If nval is a true value, we have lexemes so we need to
1100                                 # serialize them. Otherwise set nval to undef so that the
1101                                 # key is excluded from this reading.
1102                         $nval = $nval ? $n->_serialize_lexemes : undef;
1103                 } elsif( $d eq 'normal_form' && $n->normal_form eq $n->text ) {
1104                         $nval = undef;
1105                 }
1106                 if( $rankoffset && $d eq 'rank' && $n ne $self->start ) {
1107                         # Adjust the ranks within the subgraph.
1108                         $nval = $n eq $self->end ? $end->rank - $rankoffset + 1
1109                                 : $nval - $rankoffset;
1110                 }
1111                 _add_graphml_data( $node_el, $node_data_keys{$d}, $nval )
1112                         if defined $nval;
1113         }
1114     }
1115
1116     # Add the path edges to the sequence graph
1117     my $edge_ctr = 0;
1118     foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->sequence->edges() ) {
1119         # We add an edge in the graphml for every witness in $e.
1120         next unless( $use_readings{$e->[0]} || $use_readings{$e->[1]} );
1121         my @edge_wits = sort $self->path_witnesses( $e );
1122         $e->[0] = $self->start->id unless $use_readings{$e->[0]};
1123         $e->[1] = $self->end->id unless $use_readings{$e->[1]};
1124         # Skip any path from start to end; that witness is not in the subgraph.
1125         next if ( $e->[0] eq $self->start->id && $e->[1] eq $self->end->id );
1126         foreach my $wit ( @edge_wits ) {
1127                         my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
1128                                                                                 $node_hash{ $e->[0] },
1129                                                                                 $node_hash{ $e->[1] } );
1130                         my $edge_el = $sgraph->addNewChild( $graphml_ns, 'edge' );
1131                         $edge_el->setAttribute( 'source', $from );
1132                         $edge_el->setAttribute( 'target', $to );
1133                         $edge_el->setAttribute( 'id', $id );
1134
1135                         # It's a witness path, so add the witness
1136                         my $base = $wit;
1137                         my $key = $edge_data_keys{'witness'};
1138                         # Is this an ante-corr witness?
1139                         my $aclabel = $self->ac_label;
1140                         if( $wit =~ /^(.*)\Q$aclabel\E$/ ) {
1141                                 # Keep the base witness
1142                                 $base = $1;
1143                                 # ...and record that this is an 'extra' reading path
1144                                 _add_graphml_data( $edge_el, $edge_data_keys{'extra'}, $aclabel );
1145                         }
1146                         _add_graphml_data( $edge_el, $edge_data_keys{'witness'}, $base );
1147                 }
1148         }
1149
1150         # Report the actual number of nodes and edges that went in
1151         $sgraph->setAttribute( 'parse.edges', $edge_ctr );
1152         $sgraph->setAttribute( 'parse.nodes', $node_ctr );
1153
1154         # Add the relationship graph to the XML
1155         map { delete $edge_data_keys{$_} } @path_attributes;
1156         $self->relations->_as_graphml( $graphml_ns, $root, \%node_hash,
1157                 $node_data_keys{'id'}, \%edge_data_keys );
1158
1159     # Save and return the thing
1160     my $result = decode_utf8( $graphml->toString(1) );
1161     return $result;
1162 }
1163
1164 sub _add_graphml_data {
1165     my( $el, $key, $value ) = @_;
1166     return unless defined $value;
1167     my $data_el = $el->addNewChild( $el->namespaceURI, 'data' );
1168     $data_el->setAttribute( 'key', $key );
1169     $data_el->appendText( $value );
1170 }
1171
1172 =head2 as_csv
1173
1174 Returns a CSV alignment table representation of the collation graph, one
1175 row per witness (or witness uncorrected.)
1176
1177 =cut
1178
1179 sub as_csv {
1180     my( $self ) = @_;
1181     my $table = $self->alignment_table;
1182     my $csv = Text::CSV->new( { binary => 1, quote_null => 0 } );
1183     my @result;
1184     # Make the header row
1185     $csv->combine( map { $_->{'witness'} } @{$table->{'alignment'}} );
1186         push( @result, decode_utf8( $csv->string ) );
1187     # Make the rest of the rows
1188     foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
1189         my @rowobjs = map { $_->{'tokens'}->[$idx] } @{$table->{'alignment'}};
1190         my @row = map { $_ ? $_->{'t'}->text : $_ } @rowobjs;
1191         $csv->combine( @row );
1192         push( @result, decode_utf8( $csv->string ) );
1193     }
1194     return join( "\n", @result );
1195 }
1196
1197 =head2 alignment_table( $use_refs, $include_witnesses )
1198
1199 Return a reference to an alignment table, in a slightly enhanced CollateX
1200 format which looks like this:
1201
1202  $table = { alignment => [ { witness => "SIGIL",
1203                              tokens => [ { t => "TEXT" }, ... ] },
1204                            { witness => "SIG2",
1205                              tokens => [ { t => "TEXT" }, ... ] },
1206                            ... ],
1207             length => TEXTLEN };
1208
1209 If $use_refs is set to 1, the reading object is returned in the table
1210 instead of READINGTEXT; if not, the text of the reading is returned.
1211
1212 If $include_witnesses is set to a hashref, only the witnesses whose sigil
1213 keys have a true hash value will be included.
1214
1215 =cut
1216
1217 sub alignment_table {
1218     my( $self ) = @_;
1219     $self->calculate_ranks() unless $self->_graphcalc_done;
1220     return $self->cached_table if $self->has_cached_table;
1221
1222     # Make sure we can do this
1223         throw( "Need a linear graph in order to make an alignment table" )
1224                 unless $self->linear;
1225         $self->calculate_ranks unless $self->end->has_rank;
1226
1227     my $table = { 'alignment' => [], 'length' => $self->end->rank - 1 };
1228     my @all_pos = ( 1 .. $self->end->rank - 1 );
1229     foreach my $wit ( sort { $a->sigil cmp $b->sigil } $self->tradition->witnesses ) {
1230         # print STDERR "Making witness row(s) for " . $wit->sigil . "\n";
1231         my @wit_path = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
1232         my @row = _make_witness_row( \@wit_path, \@all_pos );
1233         push( @{$table->{'alignment'}},
1234                 { 'witness' => $wit->sigil, 'tokens' => \@row } );
1235         if( $wit->is_layered ) {
1236                 my @wit_ac_path = $self->reading_sequence( $self->start, $self->end,
1237                         $wit->sigil.$self->ac_label );
1238             my @ac_row = _make_witness_row( \@wit_ac_path, \@all_pos );
1239                         push( @{$table->{'alignment'}},
1240                                 { 'witness' => $wit->sigil.$self->ac_label, 'tokens' => \@ac_row } );
1241         }
1242     }
1243     $self->cached_table( $table );
1244     return $table;
1245 }
1246
1247 sub _make_witness_row {
1248     my( $path, $positions ) = @_;
1249     my %char_hash;
1250     map { $char_hash{$_} = undef } @$positions;
1251     my $debug = 0;
1252     foreach my $rdg ( @$path ) {
1253         my $rtext = $rdg->text;
1254         $rtext = '#LACUNA#' if $rdg->is_lacuna;
1255         print STDERR "rank " . $rdg->rank . "\n" if $debug;
1256         # print STDERR "No rank for " . $rdg->id . "\n" unless defined $rdg->rank;
1257         $char_hash{$rdg->rank} = { 't' => $rdg };
1258     }
1259     my @row = map { $char_hash{$_} } @$positions;
1260     # Fill in lacuna markers for undef spots in the row
1261     my $last_el = shift @row;
1262     my @filled_row = ( $last_el );
1263     foreach my $el ( @row ) {
1264         # If we are using node reference, make the lacuna node appear many times
1265         # in the table.  If not, use the lacuna tag.
1266         if( $last_el && $last_el->{'t'}->is_lacuna && !defined $el ) {
1267             $el = $last_el;
1268         }
1269         push( @filled_row, $el );
1270         $last_el = $el;
1271     }
1272     return @filled_row;
1273 }
1274
1275 =head1 NAVIGATION METHODS
1276
1277 =head2 reading_sequence( $first, $last, $sigil, $backup )
1278
1279 Returns the ordered list of readings, starting with $first and ending
1280 with $last, for the witness given in $sigil. If a $backup sigil is
1281 specified (e.g. when walking a layered witness), it will be used wherever
1282 no $sigil path exists.  If there is a base text reading, that will be
1283 used wherever no path exists for $sigil or $backup.
1284
1285 =cut
1286
1287 # TODO Think about returning some lazy-eval iterator.
1288 # TODO Get rid of backup; we should know from what witness is whether we need it.
1289
1290 sub reading_sequence {
1291     my( $self, $start, $end, $witness ) = @_;
1292
1293     $witness = $self->baselabel unless $witness;
1294     my @readings = ( $start );
1295     my %seen;
1296     my $n = $start;
1297     while( $n && $n->id ne $end->id ) {
1298         if( exists( $seen{$n->id} ) ) {
1299             throw( "Detected loop for $witness at " . $n->id );
1300         }
1301         $seen{$n->id} = 1;
1302
1303         my $next = $self->next_reading( $n, $witness );
1304         unless( $next ) {
1305             throw( "Did not find any path for $witness from reading " . $n->id );
1306         }
1307         push( @readings, $next );
1308         $n = $next;
1309     }
1310     # Check that the last reading is our end reading.
1311     my $last = $readings[$#readings];
1312     throw( "Last reading found from " . $start->text .
1313         " for witness $witness is not the end!" ) # TODO do we get this far?
1314         unless $last->id eq $end->id;
1315
1316     return @readings;
1317 }
1318
1319 =head2 next_reading( $reading, $sigil );
1320
1321 Returns the reading that follows the given reading along the given witness
1322 path.
1323
1324 =cut
1325
1326 sub next_reading {
1327     # Return the successor via the corresponding path.
1328     my $self = shift;
1329     my $answer = $self->_find_linked_reading( 'next', @_ );
1330         return undef unless $answer;
1331     return $self->reading( $answer );
1332 }
1333
1334 =head2 prior_reading( $reading, $sigil )
1335
1336 Returns the reading that precedes the given reading along the given witness
1337 path.
1338
1339 =cut
1340
1341 sub prior_reading {
1342     # Return the predecessor via the corresponding path.
1343     my $self = shift;
1344     my $answer = $self->_find_linked_reading( 'prior', @_ );
1345     return $self->reading( $answer );
1346 }
1347
1348 sub _find_linked_reading {
1349     my( $self, $direction, $node, $path ) = @_;
1350
1351     # Get a backup if we are dealing with a layered witness
1352     my $alt_path;
1353     my $aclabel = $self->ac_label;
1354     if( $path && $path =~ /^(.*)\Q$aclabel\E$/ ) {
1355         $alt_path = $1;
1356     }
1357
1358     my @linked_paths = $direction eq 'next'
1359         ? $self->sequence->edges_from( $node )
1360         : $self->sequence->edges_to( $node );
1361     return undef unless scalar( @linked_paths );
1362
1363     # We have to find the linked path that contains all of the
1364     # witnesses supplied in $path.
1365     my( @path_wits, @alt_path_wits );
1366     @path_wits = sort( $self->_witnesses_of_label( $path ) ) if $path;
1367     @alt_path_wits = sort( $self->_witnesses_of_label( $alt_path ) ) if $alt_path;
1368     my $base_le;
1369     my $alt_le;
1370     foreach my $le ( @linked_paths ) {
1371         if( $self->sequence->has_edge_attribute( @$le, $self->baselabel ) ) {
1372             $base_le = $le;
1373         }
1374                 my @le_wits = sort $self->path_witnesses( $le );
1375                 if( _is_within( \@path_wits, \@le_wits ) ) {
1376                         # This is the right path.
1377                         return $direction eq 'next' ? $le->[1] : $le->[0];
1378                 } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) {
1379                         $alt_le = $le;
1380                 }
1381     }
1382     # Got this far? Return the alternate path if it exists.
1383     return $direction eq 'next' ? $alt_le->[1] : $alt_le->[0]
1384         if $alt_le;
1385
1386     # Got this far? Return the base path if it exists.
1387     return $direction eq 'next' ? $base_le->[1] : $base_le->[0]
1388         if $base_le;
1389
1390     # Got this far? We have no appropriate path.
1391     warn "Could not find $direction node from " . $node->id
1392         . " along path $path";
1393     return undef;
1394 }
1395
1396 # Some set logic.
1397 sub _is_within {
1398     my( $set1, $set2 ) = @_;
1399     my $ret = @$set1; # will be 0, i.e. false, if set1 is empty
1400     foreach my $el ( @$set1 ) {
1401         $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
1402     }
1403     return $ret;
1404 }
1405
1406 # Return the string that joins together a list of witnesses for
1407 # display on a single path.
1408 sub _witnesses_of_label {
1409     my( $self, $label ) = @_;
1410     my $regex = $self->wit_list_separator;
1411     my @answer = split( /\Q$regex\E/, $label );
1412     return @answer;
1413 }
1414
1415 =head2 common_readings
1416
1417 Returns the list of common readings in the graph (i.e. those readings that are
1418 shared by all non-lacunose witnesses.)
1419
1420 =cut
1421
1422 sub common_readings {
1423         my $self = shift;
1424         my @common = grep { $_->is_common } $self->readings;
1425         return @common;
1426 }
1427
1428 =head2 path_text( $sigil, [, $start, $end ] )
1429
1430 Returns the text of a witness (plus its backup, if we are using a layer)
1431 as stored in the collation.  The text is returned as a string, where the
1432 individual readings are joined with spaces and the meta-readings (e.g.
1433 lacunae) are omitted.  Optional specification of $start and $end allows
1434 the generation of a subset of the witness text.
1435
1436 =cut
1437
1438 sub path_text {
1439         my( $self, $wit, $start, $end ) = @_;
1440         $start = $self->start unless $start;
1441         $end = $self->end unless $end;
1442         my @path = grep { !$_->is_meta } $self->reading_sequence( $start, $end, $wit );
1443         my $pathtext = '';
1444         my $last;
1445         foreach my $r ( @path ) {
1446                 unless ( $r->join_prior || !$last || $last->join_next ) {
1447                         $pathtext .= ' ';
1448                 }
1449                 $pathtext .= $r->text;
1450                 $last = $r;
1451         }
1452         return $pathtext;
1453 }
1454
1455 =head1 INITIALIZATION METHODS
1456
1457 These are mostly for use by parsers.
1458
1459 =head2 make_witness_path( $witness )
1460
1461 Link the array of readings contained in $witness->path (and in
1462 $witness->uncorrected_path if it exists) into collation paths.
1463 Clear out the arrays when finished.
1464
1465 =head2 make_witness_paths
1466
1467 Call make_witness_path for all witnesses in the tradition.
1468
1469 =cut
1470
1471 # For use when a collation is constructed from a base text and an apparatus.
1472 # We have the sequences of readings and just need to add path edges.
1473 # When we are done, clear out the witness path attributes, as they are no
1474 # longer needed.
1475 # TODO Find a way to replace the witness path attributes with encapsulated functions?
1476
1477 sub make_witness_paths {
1478     my( $self ) = @_;
1479     foreach my $wit ( $self->tradition->witnesses ) {
1480         # print STDERR "Making path for " . $wit->sigil . "\n";
1481         $self->make_witness_path( $wit );
1482     }
1483 }
1484
1485 sub make_witness_path {
1486     my( $self, $wit ) = @_;
1487     my @chain = @{$wit->path};
1488     my $sig = $wit->sigil;
1489     # Add start and end if necessary
1490     unshift( @chain, $self->start ) unless $chain[0] eq $self->start;
1491     push( @chain, $self->end ) unless $chain[-1] eq $self->end;
1492     foreach my $idx ( 0 .. $#chain-1 ) {
1493         $self->add_path( $chain[$idx], $chain[$idx+1], $sig );
1494     }
1495     if( $wit->is_layered ) {
1496         @chain = @{$wit->uncorrected_path};
1497                 unshift( @chain, $self->start ) unless $chain[0] eq $self->start;
1498                 push( @chain, $self->end ) unless $chain[-1] eq $self->end;
1499         foreach my $idx( 0 .. $#chain-1 ) {
1500             my $source = $chain[$idx];
1501             my $target = $chain[$idx+1];
1502             $self->add_path( $source, $target, $sig.$self->ac_label )
1503                 unless $self->has_path( $source, $target, $sig );
1504         }
1505     }
1506     $wit->clear_path;
1507     $wit->clear_uncorrected_path;
1508 }
1509
1510 =head2 calculate_ranks
1511
1512 Calculate the reading ranks (that is, their aligned positions relative
1513 to each other) for the graph.  This can only be called on linear collations.
1514
1515 =begin testing
1516
1517 use Text::Tradition;
1518
1519 my $cxfile = 't/data/Collatex-16.xml';
1520 my $t = Text::Tradition->new(
1521     'name'  => 'inline',
1522     'input' => 'CollateX',
1523     'file'  => $cxfile,
1524     );
1525 my $c = $t->collation;
1526
1527 # Make an svg
1528 my $table = $c->alignment_table;
1529 ok( $c->has_cached_table, "Alignment table was cached" );
1530 is( $c->alignment_table, $table, "Cached table returned upon second call" );
1531 $c->calculate_ranks;
1532 is( $c->alignment_table, $table, "Cached table retained with no rank change" );
1533 $c->add_relationship( 'n24', 'n23', { 'type' => 'spelling' } );
1534 isnt( $c->alignment_table, $table, "Alignment table changed after relationship add" );
1535
1536 =end testing
1537
1538 =cut
1539
1540 sub calculate_ranks {
1541     my $self = shift;
1542     # Save the existing ranks, in case we need to invalidate the cached SVG.
1543     my %existing_ranks;
1544     map { $existing_ranks{$_} = $_->rank } $self->readings;
1545
1546     # Do the rankings based on the relationship equivalence graph, starting
1547     # with the start node.
1548     my ( $node_ranks, $rank_nodes ) = $self->relations->equivalence_ranks();
1549
1550     # Transfer our rankings from the topological graph to the real one.
1551     foreach my $r ( $self->readings ) {
1552         if( defined $node_ranks->{$self->equivalence( $r->id )} ) {
1553             $r->rank( $node_ranks->{$self->equivalence( $r->id )} );
1554         } else {
1555                 # Die. Find the last rank we calculated.
1556                 my @all_defined = sort { ( $node_ranks->{$self->equivalence( $a->id )}||-1 )
1557                                  <=> ( $node_ranks->{$self->equivalence( $b->id )}||-1 ) }
1558                         $self->readings;
1559                 my $last = pop @all_defined;
1560             throw( "Ranks not calculated after $last - do you have a cycle in the graph?" );
1561         }
1562     }
1563     # Do we need to invalidate the cached data?
1564     if( $self->has_cached_svg || $self->has_cached_table ) {
1565         foreach my $r ( $self->readings ) {
1566                 next if defined( $existing_ranks{$r} )
1567                         && $existing_ranks{$r} == $r->rank;
1568                 # Something has changed, so clear the cache
1569                 $self->_clear_cache;
1570                         # ...and recalculate the common readings.
1571                         $self->calculate_common_readings();
1572                 last;
1573         }
1574     }
1575         # The graph calculation information is now up to date.
1576         $self->_graphcalc_done(1);
1577 }
1578
1579 sub _clear_cache {
1580         my $self = shift;
1581         $self->wipe_svg if $self->has_cached_svg;
1582         $self->wipe_table if $self->has_cached_table;
1583 }
1584
1585
1586 =head2 flatten_ranks
1587
1588 A convenience method for parsing collation data.  Searches the graph for readings
1589 with the same text at the same rank, and merges any that are found.
1590
1591 =cut
1592
1593 sub flatten_ranks {
1594     my $self = shift;
1595     my %unique_rank_rdg;
1596     my $changed;
1597     foreach my $rdg ( $self->readings ) {
1598         next unless $rdg->has_rank;
1599         my $key = $rdg->rank . "||" . $rdg->text;
1600         if( exists $unique_rank_rdg{$key} ) {
1601                 # Make sure they don't have different grammatical forms
1602                         my $ur = $unique_rank_rdg{$key};
1603                         if( $rdg->disambiguated && $ur->disambiguated ) {
1604                                 my $rform = join( '//', map { $_->form->to_string } $rdg->lexemes );
1605                                 my $uform = join( '//', map { $_->form->to_string } $ur->lexemes );
1606                                 next unless $rform eq $uform;
1607                         } elsif( $rdg->disambiguated xor $ur->disambiguated ) {
1608                                 next;
1609                         }
1610             # Combine!
1611                 #print STDERR "Combining readings at same rank: $key\n";
1612                 $changed = 1;
1613             $self->merge_readings( $unique_rank_rdg{$key}, $rdg );
1614             # TODO see if this now makes a common point.
1615         } else {
1616             $unique_rank_rdg{$key} = $rdg;
1617         }
1618     }
1619     # If we merged readings, the ranks are still fine but the alignment
1620     # table is wrong. Wipe it.
1621     $self->wipe_table() if $changed;
1622 }
1623
1624
1625 =head2 calculate_common_readings
1626
1627 Goes through the graph identifying the readings that appear in every witness
1628 (apart from those with lacunae at that spot.) Marks them as common and returns
1629 the list.
1630
1631 =begin testing
1632
1633 use Text::Tradition;
1634
1635 my $cxfile = 't/data/Collatex-16.xml';
1636 my $t = Text::Tradition->new(
1637     'name'  => 'inline',
1638     'input' => 'CollateX',
1639     'file'  => $cxfile,
1640     );
1641 my $c = $t->collation;
1642
1643 my @common = $c->calculate_common_readings();
1644 is( scalar @common, 8, "Found correct number of common readings" );
1645 my @marked = sort $c->common_readings();
1646 is( scalar @common, 8, "All common readings got marked as such" );
1647 my @expected = qw/ n1 n11 n16 n19 n20 n5 n6 n7 /;
1648 is_deeply( \@marked, \@expected, "Found correct list of common readings" );
1649
1650 =end testing
1651
1652 =cut
1653
1654 sub calculate_common_readings {
1655         my $self = shift;
1656         my @common;
1657         map { $_->is_common( 0 ) } $self->readings;
1658         # Implicitly calls calculate_ranks
1659         my $table = $self->alignment_table;
1660         foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
1661                 my @row = map { $_->{'tokens'}->[$idx]
1662                                                         ? $_->{'tokens'}->[$idx]->{'t'} : '' }
1663                                         @{$table->{'alignment'}};
1664                 my %hash;
1665                 foreach my $r ( @row ) {
1666                         if( $r ) {
1667                                 $hash{$r->id} = $r unless $r->is_meta;
1668                         } else {
1669                                 $hash{'UNDEF'} = $r;
1670                         }
1671                 }
1672                 if( keys %hash == 1 && !exists $hash{'UNDEF'} ) {
1673                         my( $r ) = values %hash;
1674                         $r->is_common( 1 );
1675                         push( @common, $r );
1676                 }
1677         }
1678         return @common;
1679 }
1680
1681 =head2 text_from_paths
1682
1683 Calculate the text array for all witnesses from the path, for later consistency
1684 checking.  Only to be used if there is no non-graph-based way to know the
1685 original texts.
1686
1687 =cut
1688
1689 sub text_from_paths {
1690         my $self = shift;
1691     foreach my $wit ( $self->tradition->witnesses ) {
1692         my @readings = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
1693         my @text;
1694         foreach my $r ( @readings ) {
1695                 next if $r->is_meta;
1696                 push( @text, $r->text );
1697         }
1698         $wit->text( \@text );
1699         if( $wit->is_layered ) {
1700                         my @ucrdgs = $self->reading_sequence( $self->start, $self->end,
1701                                                                                                   $wit->sigil.$self->ac_label );
1702                         my @uctext;
1703                         foreach my $r ( @ucrdgs ) {
1704                                 next if $r->is_meta;
1705                                 push( @uctext, $r->text );
1706                         }
1707                         $wit->layertext( \@uctext );
1708         }
1709     }
1710 }
1711
1712 =head1 UTILITY FUNCTIONS
1713
1714 =head2 common_predecessor( $reading_a, $reading_b )
1715
1716 Find the last reading that occurs in sequence before both the given readings.
1717 At the very least this should be $self->start.
1718
1719 =head2 common_successor( $reading_a, $reading_b )
1720
1721 Find the first reading that occurs in sequence after both the given readings.
1722 At the very least this should be $self->end.
1723
1724 =begin testing
1725
1726 use Text::Tradition;
1727
1728 my $cxfile = 't/data/Collatex-16.xml';
1729 my $t = Text::Tradition->new(
1730     'name'  => 'inline',
1731     'input' => 'CollateX',
1732     'file'  => $cxfile,
1733     );
1734 my $c = $t->collation;
1735
1736 is( $c->common_predecessor( 'n24', 'n23' )->id,
1737     'n20', "Found correct common predecessor" );
1738 is( $c->common_successor( 'n24', 'n23' )->id,
1739     '__END__', "Found correct common successor" );
1740
1741 is( $c->common_predecessor( 'n19', 'n17' )->id,
1742     'n16', "Found correct common predecessor for readings on same path" );
1743 is( $c->common_successor( 'n21', 'n10' )->id,
1744     '__END__', "Found correct common successor for readings on same path" );
1745
1746 =end testing
1747
1748 =cut
1749
1750 ## Return the closest reading that is a predecessor of both the given readings.
1751 sub common_predecessor {
1752         my $self = shift;
1753         my( $r1, $r2 ) = $self->_objectify_args( @_ );
1754         return $self->_common_in_path( $r1, $r2, 'predecessors' );
1755 }
1756
1757 sub common_successor {
1758         my $self = shift;
1759         my( $r1, $r2 ) = $self->_objectify_args( @_ );
1760         return $self->_common_in_path( $r1, $r2, 'successors' );
1761 }
1762
1763
1764 # TODO think about how to do this without ranks...
1765 sub _common_in_path {
1766         my( $self, $r1, $r2, $dir ) = @_;
1767         my $iter = $self->end->rank;
1768         my @candidates;
1769         my @last_r1 = ( $r1 );
1770         my @last_r2 = ( $r2 );
1771         # my %all_seen = ( $r1 => 'r1', $r2 => 'r2' );
1772         my %all_seen;
1773         # print STDERR "Finding common $dir for $r1, $r2\n";
1774         while( !@candidates ) {
1775                 last unless $iter--;  # Avoid looping infinitely
1776                 # Iterate separately down the graph from r1 and r2
1777                 my( @new_lc1, @new_lc2 );
1778                 foreach my $lc ( @last_r1 ) {
1779                         foreach my $p ( $lc->$dir ) {
1780                                 if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r1' ) {
1781                                         # print STDERR "Path candidate $p from $lc\n";
1782                                         push( @candidates, $p );
1783                                 } elsif( !$all_seen{$p->id} ) {
1784                                         $all_seen{$p->id} = 'r1';
1785                                         push( @new_lc1, $p );
1786                                 }
1787                         }
1788                 }
1789                 foreach my $lc ( @last_r2 ) {
1790                         foreach my $p ( $lc->$dir ) {
1791                                 if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r2' ) {
1792                                         # print STDERR "Path candidate $p from $lc\n";
1793                                         push( @candidates, $p );
1794                                 } elsif( !$all_seen{$p->id} ) {
1795                                         $all_seen{$p->id} = 'r2';
1796                                         push( @new_lc2, $p );
1797                                 }
1798                         }
1799                 }
1800                 @last_r1 = @new_lc1;
1801                 @last_r2 = @new_lc2;
1802         }
1803         my @answer = sort { $a->rank <=> $b->rank } @candidates;
1804         return $dir eq 'predecessors' ? pop( @answer ) : shift ( @answer );
1805 }
1806
1807 sub throw {
1808         Text::Tradition::Error->throw(
1809                 'ident' => 'Collation error',
1810                 'message' => $_[0],
1811                 );
1812 }
1813
1814 no Moose;
1815 __PACKAGE__->meta->make_immutable;
1816
1817 =head1 LICENSE
1818
1819 This package is free software and is provided "as is" without express
1820 or implied warranty.  You can redistribute it and/or modify it under
1821 the same terms as Perl itself.
1822
1823 =head1 AUTHOR
1824
1825 Tara L Andrews E<lt>aurum@cpan.orgE<gt>