lib/Text/Tradition/Collation.pm

   1 package Text::Tradition::Collation;
   2
   3 use Encode qw( decode_utf8 );
   4 use File::Temp;
   5 use File::Which;
   6 use Graph;
   7 use IPC::Run qw( run binary );
   8 use Text::CSV;
   9 use Text::Tradition::Collation::Reading;
  10 use Text::Tradition::Collation::RelationshipStore;
  11 use Text::Tradition::Error;
  12 use XML::LibXML;
  13 use XML::LibXML::XPathContext;
  14 use Moose;
  15
  16 has 'sequence' => (
  17     is => 'ro',
  18     isa => 'Graph',
  19     default => sub { Graph->new() },
  20     handles => {
  21         paths => 'edges',
  22     },
  23     );
  24
  25 has 'relations' => (
  26         is => 'ro',
  27         isa => 'Text::Tradition::Collation::RelationshipStore',
  28         handles => {
  29                 relationships => 'relationships',
  30                 related_readings => 'related_readings',
  31                 get_relationship => 'get_relationship',
  32                 del_relationship => 'del_relationship',
  33         },
  34         writer => '_set_relations',
  35         );
  36
  37 has 'tradition' => (
  38     is => 'ro',
  39     isa => 'Text::Tradition',
  40     writer => '_set_tradition',
  41     weak_ref => 1,
  42     );
  43
  44 has 'readings' => (
  45         isa => 'HashRef[Text::Tradition::Collation::Reading]',
  46         traits => ['Hash'],
  47     handles => {
  48         reading     => 'get',
  49         _add_reading => 'set',
  50         del_reading => 'delete',
  51         has_reading => 'exists',
  52         readings   => 'values',
  53     },
  54     default => sub { {} },
  55         );
  56
  57 has 'wit_list_separator' => (
  58     is => 'rw',
  59     isa => 'Str',
  60     default => ', ',
  61     );
  62
  63 has 'baselabel' => (
  64     is => 'rw',
  65     isa => 'Str',
  66     default => 'base text',
  67     );
  68
  69 has 'linear' => (
  70     is => 'rw',
  71     isa => 'Bool',
  72     default => 1,
  73     );
  74
  75 has 'ac_label' => (
  76     is => 'rw',
  77     isa => 'Str',
  78     default => ' (a.c.)',
  79     );
  80
  81 has 'wordsep' => (
  82         is => 'rw',
  83         isa => 'Str',
  84         default => ' ',
  85         );
  86
  87 has 'start' => (
  88         is => 'ro',
  89         isa => 'Text::Tradition::Collation::Reading',
  90         writer => '_set_start',
  91         weak_ref => 1,
  92         );
  93
  94 has 'end' => (
  95         is => 'ro',
  96         isa => 'Text::Tradition::Collation::Reading',
  97         writer => '_set_end',
  98         weak_ref => 1,
  99         );
 100
 101 has 'cached_svg' => (
 102         is => 'rw',
 103         isa => 'Str',
 104         predicate => 'has_cached_svg',
 105         clearer => 'wipe_svg',
 106         );
 107
 108 has 'cached_table' => (
 109         is => 'rw',
 110         isa => 'HashRef',
 111         predicate => 'has_cached_table',
 112         clearer => 'wipe_table',
 113         );
 114
 115 has '_graphcalc_done' => (
 116         is => 'rw',
 117         isa => 'Bool',
 118         default => undef,
 119         );
 120
 121 =head1 NAME
 122
 123 Text::Tradition::Collation - a software model for a text collation
 124
 125 =head1 SYNOPSIS
 126
 127   use Text::Tradition;
 128   my $t = Text::Tradition->new(
 129     'name' => 'this is a text',
 130     'input' => 'TEI',
 131     'file' => '/path/to/tei_parallel_seg_file.xml' );
 132
 133   my $c = $t->collation;
 134   my @readings = $c->readings;
 135   my @paths = $c->paths;
 136   my @relationships = $c->relationships;
 137
 138   my $svg_variant_graph = $t->collation->as_svg();
 139
 140 =head1 DESCRIPTION
 141
 142 Text::Tradition is a library for representation and analysis of collated
 143 texts, particularly medieval ones.  The Collation is the central feature of
 144 a Tradition, where the text, its sequence of readings, and its relationships
 145 between readings are actually kept.
 146
 147 =head1 CONSTRUCTOR
 148
 149 =head2 new
 150
 151 The constructor.  Takes a hash or hashref of the following arguments:
 152
 153 =over
 154
 155 =item * tradition - The Text::Tradition object to which the collation
 156 belongs. Required.
 157
 158 =item * linear - Whether the collation should be linear; that is, whether
 159 transposed readings should be treated as two linked readings rather than one,
 160 and therefore whether the collation graph is acyclic.  Defaults to true.
 161
 162 =item * baselabel - The default label for the path taken by a base text
 163 (if any). Defaults to 'base text'.
 164
 165 =item * wit_list_separator - The string to join a list of witnesses for
 166 purposes of making labels in display graphs.  Defaults to ', '.
 167
 168 =item * ac_label - The extra label to tack onto a witness sigil when
 169 representing another layer of path for the given witness - that is, when
 170 a text has more than one possible reading due to scribal corrections or
 171 the like.  Defaults to ' (a.c.)'.
 172
 173 =item * wordsep - The string used to separate words in the original text.
 174 Defaults to ' '.
 175
 176 =back
 177
 178 =head1 ACCESSORS
 179
 180 =head2 tradition
 181
 182 =head2 linear
 183
 184 =head2 wit_list_separator
 185
 186 =head2 baselabel
 187
 188 =head2 ac_label
 189
 190 =head2 wordsep
 191
 192 Simple accessors for collation attributes.
 193
 194 =head2 start
 195
 196 The meta-reading at the start of every witness path.
 197
 198 =head2 end
 199
 200 The meta-reading at the end of every witness path.
 201
 202 =head2 readings
 203
 204 Returns all Reading objects in the graph.
 205
 206 =head2 reading( $id )
 207
 208 Returns the Reading object corresponding to the given ID.
 209
 210 =head2 add_reading( $reading_args )
 211
 212 Adds a new reading object to the collation.
 213 See L<Text::Tradition::Collation::Reading> for the available arguments.
 214
 215 =head2 del_reading( $object_or_id )
 216
 217 Removes the given reading from the collation, implicitly removing its
 218 paths and relationships.
 219
 220 =head2 merge_readings( $main, $second, $concatenate, $with_str )
 221
 222 Merges the $second reading into the $main one. If $concatenate is true, then
 223 the merged node will carry the text of both readings, concatenated with either
 224 $with_str (if specified) or a sensible default (the empty string if the
 225 appropriate 'join_*' flag is set on either reading, or else $self->wordsep.)
 226
 227 The first two arguments may be either readings or reading IDs.
 228
 229 =head2 has_reading( $id )
 230
 231 Predicate to see whether a given reading ID is in the graph.
 232
 233 =head2 reading_witnesses( $object_or_id )
 234
 235 Returns a list of sigils whose witnesses contain the reading.
 236
 237 =head2 paths
 238
 239 Returns all reading paths within the document - that is, all edges in the
 240 collation graph.  Each path is an arrayref of [ $source, $target ] reading IDs.
 241
 242 =head2 add_path( $source, $target, $sigil )
 243
 244 Links the given readings in the collation in sequence, under the given witness
 245 sigil.  The readings may be specified by object or ID.
 246
 247 =head2 del_path( $source, $target, $sigil )
 248
 249 Links the given readings in the collation in sequence, under the given witness
 250 sigil.  The readings may be specified by object or ID.
 251
 252 =head2 has_path( $source, $target );
 253
 254 Returns true if the two readings are linked in sequence in any witness.
 255 The readings may be specified by object or ID.
 256
 257 =head2 relationships
 258
 259 Returns all Relationship objects in the collation.
 260
 261 =head2 add_relationship( $reading, $other_reading, $options )
 262
 263 Adds a new relationship of the type given in $options between the two readings,
 264 which may be specified by object or ID.  Returns a value of ( $status, @vectors)
 265 where $status is true on success, and @vectors is a list of relationship edges
 266 that were ultimately added.
 267 See L<Text::Tradition::Collation::Relationship> for the available options.
 268
 269 =cut
 270
 271 sub BUILD {
 272     my $self = shift;
 273     $self->_set_relations( Text::Tradition::Collation::RelationshipStore->new( 'collation' => $self ) );
 274     $self->_set_start( $self->add_reading( { 'collation' => $self, 'is_start' => 1 } ) );
 275     $self->_set_end( $self->add_reading( { 'collation' => $self, 'is_end' => 1 } ) );
 276 }
 277
 278 ### Reading construct/destruct functions
 279
 280 sub add_reading {
 281         my( $self, $reading ) = @_;
 282         unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
 283                 my %args = %$reading;
 284                 $reading = Text::Tradition::Collation::Reading->new(
 285                         'collation' => $self,
 286                         %args );
 287         }
 288         # First check to see if a reading with this ID exists.
 289         if( $self->reading( $reading->id ) ) {
 290                 throw( "Collation already has a reading with id " . $reading->id );
 291         }
 292         $self->_graphcalc_done(0);
 293         $self->_add_reading( $reading->id => $reading );
 294         # Once the reading has been added, put it in both graphs.
 295         $self->sequence->add_vertex( $reading->id );
 296         $self->relations->add_reading( $reading->id );
 297         return $reading;
 298 };
 299
 300 around del_reading => sub {
 301         my $orig = shift;
 302         my $self = shift;
 303         my $arg = shift;
 304
 305         if( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
 306                 $arg = $arg->id;
 307         }
 308         # Remove the reading from the graphs.
 309         $self->_graphcalc_done(0);
 310         $self->_clear_cache; # Explicitly clear caches to GC the reading
 311         $self->sequence->delete_vertex( $arg );
 312         $self->relations->delete_reading( $arg );
 313
 314         # Carry on.
 315         $self->$orig( $arg );
 316 };
 317
 318 =begin testing
 319
 320 use Text::Tradition;
 321
 322 my $cxfile = 't/data/Collatex-16.xml';
 323 my $t = Text::Tradition->new(
 324     'name'  => 'inline',
 325     'input' => 'CollateX',
 326     'file'  => $cxfile,
 327     );
 328 my $c = $t->collation;
 329
 330 my $rno = scalar $c->readings;
 331 # Split n21 for testing purposes
 332 my $new_r = $c->add_reading( { 'id' => 'n21p0', 'text' => 'un', 'join_next' => 1 } );
 333 my $old_r = $c->reading( 'n21' );
 334 $old_r->alter_text( 'to' );
 335 $c->del_path( 'n20', 'n21', 'A' );
 336 $c->add_path( 'n20', 'n21p0', 'A' );
 337 $c->add_path( 'n21p0', 'n21', 'A' );
 338 $c->flatten_ranks();
 339 ok( $c->reading( 'n21p0' ), "New reading exists" );
 340 is( scalar $c->readings, $rno, "Reading add offset by flatten_ranks" );
 341
 342 # Combine n3 and n4 ( with his )
 343 $c->merge_readings( 'n3', 'n4', 1 );
 344 ok( !$c->reading('n4'), "Reading n4 is gone" );
 345 is( $c->reading('n3')->text, 'with his', "Reading n3 has both words" );
 346
 347 # Collapse n9 and n10 ( rood / root )
 348 $c->merge_readings( 'n9', 'n10' );
 349 ok( !$c->reading('n10'), "Reading n10 is gone" );
 350 is( $c->reading('n9')->text, 'rood', "Reading n9 has an unchanged word" );
 351
 352 # Combine n21 and n21p0
 353 my $remaining = $c->reading('n21');
 354 $remaining ||= $c->reading('n22');  # one of these should still exist
 355 $c->merge_readings( 'n21p0', $remaining, 1 );
 356 ok( !$c->reading('n21'), "Reading $remaining is gone" );
 357 is( $c->reading('n21p0')->text, 'unto', "Reading n21p0 merged correctly" );
 358
 359 =end testing
 360
 361 =cut
 362
 363 sub merge_readings {
 364         my $self = shift;
 365
 366         # We only need the IDs for adding paths to the graph, not the reading
 367         # objects themselves.
 368     my( $kept, $deleted, $combine, $combine_char ) = $self->_stringify_args( @_ );
 369         $self->_graphcalc_done(0);
 370
 371     # The kept reading should inherit the paths and the relationships
 372     # of the deleted reading.
 373         foreach my $path ( $self->sequence->edges_at( $deleted ) ) {
 374                 my @vector = ( $kept );
 375                 push( @vector, $path->[1] ) if $path->[0] eq $deleted;
 376                 unshift( @vector, $path->[0] ) if $path->[1] eq $deleted;
 377                 next if $vector[0] eq $vector[1]; # Don't add a self loop
 378                 my %wits = %{$self->sequence->get_edge_attributes( @$path )};
 379                 $self->sequence->add_edge( @vector );
 380                 my $fwits = $self->sequence->get_edge_attributes( @vector );
 381                 @wits{keys %$fwits} = values %$fwits;
 382                 $self->sequence->set_edge_attributes( @vector, \%wits );
 383         }
 384         $self->relations->merge_readings( $kept, $deleted, $combine_char );
 385
 386         # Do the deletion deed.
 387         if( $combine ) {
 388                 my $kept_obj = $self->reading( $kept );
 389                 my $del_obj = $self->reading( $deleted );
 390                 my $joinstr = $combine_char;
 391                 unless( defined $joinstr ) {
 392                         $joinstr = '' if $kept_obj->join_next || $del_obj->join_prior;
 393                         $joinstr = $self->wordsep unless defined $joinstr;
 394                 }
 395                 $kept_obj->alter_text( join( $joinstr, $kept_obj->text, $del_obj->text ) );
 396         }
 397         $self->del_reading( $deleted );
 398 }
 399
 400
 401 # Helper function for manipulating the graph.
 402 sub _stringify_args {
 403         my( $self, $first, $second, @args ) = @_;
 404     $first = $first->id
 405         if ref( $first ) eq 'Text::Tradition::Collation::Reading';
 406     $second = $second->id
 407         if ref( $second ) eq 'Text::Tradition::Collation::Reading';
 408     return( $first, $second, @args );
 409 }
 410
 411 # Helper function for manipulating the graph.
 412 sub _objectify_args {
 413         my( $self, $first, $second, $arg ) = @_;
 414     $first = $self->reading( $first )
 415         unless ref( $first ) eq 'Text::Tradition::Collation::Reading';
 416     $second = $self->reading( $second )
 417         unless ref( $second ) eq 'Text::Tradition::Collation::Reading';
 418     return( $first, $second, $arg );
 419 }
 420 ### Path logic
 421
 422 sub add_path {
 423         my $self = shift;
 424
 425         # We only need the IDs for adding paths to the graph, not the reading
 426         # objects themselves.
 427     my( $source, $target, $wit ) = $self->_stringify_args( @_ );
 428
 429         $self->_graphcalc_done(0);
 430         # Connect the readings
 431     $self->sequence->add_edge( $source, $target );
 432     # Note the witness in question
 433     $self->sequence->set_edge_attribute( $source, $target, $wit, 1 );
 434 };
 435
 436 sub del_path {
 437         my $self = shift;
 438         my @args;
 439         if( ref( $_[0] ) eq 'ARRAY' ) {
 440                 my $e = shift @_;
 441                 @args = ( @$e, @_ );
 442         } else {
 443                 @args = @_;
 444         }
 445
 446         # We only need the IDs for adding paths to the graph, not the reading
 447         # objects themselves.
 448     my( $source, $target, $wit ) = $self->_stringify_args( @args );
 449
 450         $self->_graphcalc_done(0);
 451         if( $self->sequence->has_edge_attribute( $source, $target, $wit ) ) {
 452                 $self->sequence->delete_edge_attribute( $source, $target, $wit );
 453         }
 454         unless( keys %{$self->sequence->get_edge_attributes( $source, $target )} ) {
 455                 $self->sequence->delete_edge( $source, $target );
 456         }
 457 }
 458
 459
 460 # Extra graph-alike utility
 461 sub has_path {
 462         my $self = shift;
 463     my( $source, $target, $wit ) = $self->_stringify_args( @_ );
 464         return undef unless $self->sequence->has_edge( $source, $target );
 465         return $self->sequence->has_edge_attribute( $source, $target, $wit );
 466 }
 467
 468 =head2 clear_witness( @sigil_list )
 469
 470 Clear the given witnesses out of the collation entirely, removing references
 471 to them in paths, and removing readings that belong only to them.  Should only
 472 be called via $tradition->del_witness.
 473
 474 =cut
 475
 476 sub clear_witness {
 477         my( $self, @sigils ) = @_;
 478
 479         $self->_graphcalc_done(0);
 480         # Clear the witness(es) out of the paths
 481         foreach my $e ( $self->paths ) {
 482                 foreach my $sig ( @sigils ) {
 483                         $self->del_path( $e, $sig );
 484                 }
 485         }
 486
 487         # Clear out the newly unused readings
 488         foreach my $r ( $self->readings ) {
 489                 unless( $self->reading_witnesses( $r ) ) {
 490                         $self->del_reading( $r );
 491                 }
 492         }
 493 }
 494
 495 sub add_relationship {
 496         my $self = shift;
 497     my( $source, $target, $opts ) = $self->_stringify_args( @_ );
 498     my( @vectors ) = $self->relations->add_relationship( $source, $target, $opts );
 499         $self->_graphcalc_done(0);
 500     return @vectors;
 501 }
 502
 503 around qw/ get_relationship del_relationship / => sub {
 504         my $orig = shift;
 505         my $self = shift;
 506         my @args = @_;
 507         if( @args == 1 && ref( $args[0] ) eq 'ARRAY' ) {
 508                 @args = @{$_[0]};
 509         }
 510         my( $source, $target ) = $self->_stringify_args( @args );
 511         $self->$orig( $source, $target );
 512 };
 513
 514 =head2 reading_witnesses( $reading )
 515
 516 Return a list of sigils corresponding to the witnesses in which the reading appears.
 517
 518 =cut
 519
 520 sub reading_witnesses {
 521         my( $self, $reading ) = @_;
 522         # We need only check either the incoming or the outgoing edges; I have
 523         # arbitrarily chosen "incoming".  Thus, special-case the start node.
 524         if( $reading eq $self->start ) {
 525                 return map { $_->sigil } $self->tradition->witnesses;
 526         }
 527         my %all_witnesses;
 528         foreach my $e ( $self->sequence->edges_to( $reading ) ) {
 529                 my $wits = $self->sequence->get_edge_attributes( @$e );
 530                 @all_witnesses{ keys %$wits } = 1;
 531         }
 532         my $acstr = $self->ac_label;
 533         foreach my $acwit ( grep { $_ =~ s/^(.*)\Q$acstr\E$/$1/ } keys %all_witnesses ) {
 534                 delete $all_witnesses{$acwit.$acstr} if exists $all_witnesses{$acwit};
 535         }
 536         return keys %all_witnesses;
 537 }
 538
 539 =head1 OUTPUT METHODS
 540
 541 =head2 as_svg( \%options )
 542
 543 Returns an SVG string that represents the graph, via as_dot and graphviz.
 544 See as_dot for a list of options.  Must have GraphViz (dot) installed to run.
 545
 546 =cut
 547
 548 sub as_svg {
 549     my( $self, $opts ) = @_;
 550     throw( "Need GraphViz installed to output SVG" )
 551         unless File::Which::which( 'dot' );
 552     my $want_subgraph = exists $opts->{'from'} || exists $opts->{'to'};
 553     $self->calculate_ranks() unless( $self->_graphcalc_done || $opts->{'nocalc'} );
 554     if( !$self->has_cached_svg || $opts->{'recalc'}     || $want_subgraph ) {
 555                 my @cmd = qw/dot -Tsvg/;
 556                 my( $svg, $err );
 557                 my $dotfile = File::Temp->new();
 558                 ## USE FOR DEBUGGING
 559                 # $dotfile->unlink_on_destroy(0);
 560                 binmode $dotfile, ':utf8';
 561                 print $dotfile $self->as_dot( $opts );
 562                 push( @cmd, $dotfile->filename );
 563                 run( \@cmd, ">", binary(), \$svg );
 564                 $svg = decode_utf8( $svg );
 565                 $self->cached_svg( $svg ) unless $want_subgraph;
 566                 return $svg;
 567         } else {
 568                 return $self->cached_svg;
 569         }
 570 }
 571
 572
 573 =head2 as_dot( \%options )
 574
 575 Returns a string that is the collation graph expressed in dot
 576 (i.e. GraphViz) format.  Options include:
 577
 578 =over 4
 579
 580 =item * from
 581
 582 =item * to
 583
 584 =item * color_common
 585
 586 =back
 587
 588 =cut
 589
 590 sub as_dot {
 591     my( $self, $opts ) = @_;
 592     my $startrank = $opts->{'from'} if $opts;
 593     my $endrank = $opts->{'to'} if $opts;
 594     my $color_common = $opts->{'color_common'} if $opts;
 595     my $STRAIGHTENHACK = !$startrank && !$endrank && $self->end->rank
 596        && $self->end->rank > 100;
 597     $STRAIGHTENHACK = 1 if $opts->{'straight'}; # even for subgraphs or small graphs
 598
 599     # Check the arguments
 600     if( $startrank ) {
 601         return if $endrank && $startrank > $endrank;
 602         return if $startrank > $self->end->rank;
 603         }
 604         if( defined $endrank ) {
 605                 return if $endrank < 0;
 606                 $endrank = undef if $endrank == $self->end->rank;
 607         }
 608
 609     my $graph_name = $self->tradition->name;
 610     $graph_name =~ s/[^\w\s]//g;
 611     $graph_name = join( '_', split( /\s+/, $graph_name ) );
 612
 613     my %graph_attrs = (
 614         'rankdir' => 'LR',
 615         'bgcolor' => 'none',
 616         );
 617     my %node_attrs = (
 618         'fontsize' => 14,
 619         'fillcolor' => 'white',
 620         'style' => 'filled',
 621         'shape' => 'ellipse'
 622         );
 623     my %edge_attrs = (
 624         'arrowhead' => 'open',
 625         'color' => '#000000',
 626         'fontcolor' => '#000000',
 627         );
 628
 629     my $dot = sprintf( "digraph %s {\n", $graph_name );
 630     $dot .= "\tgraph " . _dot_attr_string( \%graph_attrs ) . ";\n";
 631     $dot .= "\tnode " . _dot_attr_string( \%node_attrs ) . ";\n";
 632
 633         # Output substitute start/end readings if necessary
 634         if( $startrank ) {
 635                 $dot .= "\t\"#SUBSTART#\" [ label=\"...\" ];\n";
 636         }
 637         if( $endrank ) {
 638                 $dot .= "\t\"#SUBEND#\" [ label=\"...\" ];\n";
 639         }
 640         if( $STRAIGHTENHACK ) {
 641                 ## HACK part 1
 642                 my $startlabel = $startrank ? 'SUBSTART' : 'START';
 643                 $dot .= "\tsubgraph { rank=same \"#$startlabel#\" \"#SILENT#\" }\n";
 644                 $dot .= "\t\"#SILENT#\" [ shape=diamond,color=white,penwidth=0,label=\"\" ];"
 645         }
 646         my %used;  # Keep track of the readings that actually appear in the graph
 647         # Sort the readings by rank if we have ranks; this speeds layout.
 648         my @all_readings = $self->end->has_rank
 649                 ? sort { $a->rank <=> $b->rank } $self->readings
 650                 : $self->readings;
 651         # TODO Refrain from outputting lacuna nodes - just grey out the edges.
 652     foreach my $reading ( @all_readings ) {
 653         # Only output readings within our rank range.
 654         next if $startrank && $reading->rank < $startrank;
 655         next if $endrank && $reading->rank > $endrank;
 656         $used{$reading->id} = 1;
 657         # Need not output nodes without separate labels
 658         next if $reading->id eq $reading->text;
 659         my $rattrs;
 660         my $label = $reading->text;
 661         $label .= '-' if $reading->join_next;
 662         $label = "-$label" if $reading->join_prior;
 663         $label =~ s/\"/\\\"/g;
 664                 $rattrs->{'label'} = $label;
 665                 $rattrs->{'fillcolor'} = '#b3f36d' if $reading->is_common && $color_common;
 666         $dot .= sprintf( "\t\"%s\" %s;\n", $reading->id, _dot_attr_string( $rattrs ) );
 667     }
 668
 669         # Add the real edges. Need to weight one edge per rank jump, in a
 670         # continuous line.
 671         # my $weighted = $self->_add_edge_weights;
 672     my @edges = $self->paths;
 673         my( %substart, %subend );
 674     foreach my $edge ( @edges ) {
 675         # Do we need to output this edge?
 676         if( $used{$edge->[0]} && $used{$edge->[1]} ) {
 677                 my $label = $self->_path_display_label( $self->path_witnesses( $edge ) );
 678                         my $variables = { %edge_attrs, 'label' => $label };
 679
 680                         # Account for the rank gap if necessary
 681                         my $rank0 = $self->reading( $edge->[0] )->rank
 682                                 if $self->reading( $edge->[0] )->has_rank;
 683                         my $rank1 = $self->reading( $edge->[1] )->rank
 684                                 if $self->reading( $edge->[1] )->has_rank;
 685                         if( defined $rank0 && defined $rank1 && $rank1 - $rank0 > 1 ) {
 686                                 $variables->{'minlen'} = $rank1 - $rank0;
 687                         }
 688
 689                         # Add the calculated edge weights
 690                         # if( exists $weighted->{$edge->[0]}
 691                         #       && $weighted->{$edge->[0]} eq $edge->[1] ) {
 692                         #       # $variables->{'color'} = 'red';
 693                         #       $variables->{'weight'} = 3.0;
 694                         # }
 695
 696                         # EXPERIMENTAL: make edge width reflect no. of witnesses
 697                         my $extrawidth = scalar( $self->path_witnesses( $edge ) ) * 0.2;
 698                         $variables->{'penwidth'} = $extrawidth + 0.8; # gives 1 for a single wit
 699
 700                         my $varopts = _dot_attr_string( $variables );
 701                         $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
 702                                 $edge->[0], $edge->[1], $varopts );
 703         } elsif( $used{$edge->[0]} ) {
 704                 $subend{$edge->[0]} = 1;
 705         } elsif( $used{$edge->[1]} ) {
 706                 $substart{$edge->[1]} = 1;
 707         }
 708     }
 709     # Add substitute start and end edges if necessary
 710     foreach my $node ( keys %substart ) {
 711         my $witstr = $self->_path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) );
 712         my $variables = { %edge_attrs, 'label' => $witstr };
 713         my $varopts = _dot_attr_string( $variables );
 714         $dot .= "\t\"#SUBSTART#\" -> \"$node\" $varopts;";
 715         }
 716     foreach my $node ( keys %subend ) {
 717         my $witstr = $self->_path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) );
 718         my $variables = { %edge_attrs, 'label' => $witstr };
 719         my $varopts = _dot_attr_string( $variables );
 720         $dot .= "\t\"$node\" -> \"#SUBEND#\" $varopts;";
 721         }
 722         # HACK part 2
 723         if( $STRAIGHTENHACK ) {
 724                 my $endlabel = $endrank ? 'SUBEND' : 'END';
 725                 $dot .= "\t\"#$endlabel#\" -> \"#SILENT#\" [ color=white,penwidth=0 ];\n";
 726         }
 727
 728     $dot .= "}\n";
 729     return $dot;
 730 }
 731
 732 sub _dot_attr_string {
 733         my( $hash ) = @_;
 734         my @attrs;
 735         foreach my $k ( sort keys %$hash ) {
 736                 my $v = $hash->{$k};
 737                 push( @attrs, $k.'="'.$v.'"' );
 738         }
 739         return( '[ ' . join( ', ', @attrs ) . ' ]' );
 740 }
 741
 742 sub _add_edge_weights {
 743         my $self = shift;
 744         # Walk the graph from START to END, choosing the successor node with
 745         # the largest number of witness paths each time.
 746         my $weighted = {};
 747         my $curr = $self->start->id;
 748         my $ranked = $self->end->has_rank;
 749         while( $curr ne $self->end->id ) {
 750                 my $rank = $ranked ? $self->reading( $curr )->rank : 0;
 751                 my @succ = sort { $self->path_witnesses( $curr, $a )
 752                                                         <=> $self->path_witnesses( $curr, $b ) }
 753                         $self->sequence->successors( $curr );
 754                 my $next = pop @succ;
 755                 my $nextrank = $ranked ? $self->reading( $next )->rank : 0;
 756                 # Try to avoid lacunae in the weighted path.
 757                 while( @succ &&
 758                            ( $self->reading( $next )->is_lacuna ||
 759                                  $nextrank - $rank > 1 ) ){
 760                         $next = pop @succ;
 761                 }
 762                 $weighted->{$curr} = $next;
 763                 $curr = $next;
 764         }
 765         return $weighted;
 766 }
 767
 768 =head2 path_witnesses( $edge )
 769
 770 Returns the list of sigils whose witnesses are associated with the given edge.
 771 The edge can be passed as either an array or an arrayref of ( $source, $target ).
 772
 773 =cut
 774
 775 sub path_witnesses {
 776         my( $self, @edge ) = @_;
 777         # If edge is an arrayref, cope.
 778         if( @edge == 1 && ref( $edge[0] ) eq 'ARRAY' ) {
 779                 my $e = shift @edge;
 780                 @edge = @$e;
 781         }
 782         my @wits = keys %{$self->sequence->get_edge_attributes( @edge )};
 783         return @wits;
 784 }
 785
 786 # Helper function. Make a display label for the given witnesses, showing a.c.
 787 # witnesses only where the main witness is not also in the list.
 788 sub _path_display_label {
 789         my $self = shift;
 790         my %wits;
 791         map { $wits{$_} = 1 } @_;
 792
 793         # If an a.c. wit is listed, remove it if the main wit is also listed.
 794         # Otherwise keep it for explicit listing.
 795         my $aclabel = $self->ac_label;
 796         my @disp_ac;
 797         foreach my $w ( sort keys %wits ) {
 798                 if( $w =~ /^(.*)\Q$aclabel\E$/ ) {
 799                         if( exists $wits{$1} ) {
 800                                 delete $wits{$w};
 801                         } else {
 802                                 push( @disp_ac, $w );
 803                         }
 804                 }
 805         }
 806
 807         # See if we are in a majority situation.
 808         my $maj = scalar( $self->tradition->witnesses ) * 0.6;
 809         if( scalar keys %wits > $maj ) {
 810                 unshift( @disp_ac, 'majority' );
 811                 return join( ', ', @disp_ac );
 812         } else {
 813                 return join( ', ', sort keys %wits );
 814         }
 815 }
 816
 817 =head2 readings_at_rank( $rank )
 818
 819 Returns a list of readings at a given rank, taken from the alignment table.
 820
 821 =cut
 822
 823 sub readings_at_rank {
 824         my( $self, $rank ) = @_;
 825         my $table = $self->alignment_table;
 826         # Table rank is real rank - 1.
 827         my @elements = map { $_->{'tokens'}->[$rank-1] } @{$table->{'alignment'}};
 828         my %readings;
 829         foreach my $e ( @elements ) {
 830                 next unless ref( $e ) eq 'HASH';
 831                 next unless exists $e->{'t'};
 832                 $readings{$e->{'t'}->id} = $e->{'t'};
 833         }
 834         return values %readings;
 835 }
 836
 837 =head2 as_graphml
 838
 839 Returns a GraphML representation of the collation.  The GraphML will contain
 840 two graphs. The first expresses the attributes of the readings and the witness
 841 paths that link them; the second expresses the relationships that link the
 842 readings.  This is the native transfer format for a tradition.
 843
 844 =begin testing
 845
 846 use Text::Tradition;
 847
 848 my $READINGS = 311;
 849 my $PATHS = 361;
 850
 851 my $datafile = 't/data/florilegium_tei_ps.xml';
 852 my $tradition = Text::Tradition->new( 'input' => 'TEI',
 853                                       'name' => 'test0',
 854                                       'file' => $datafile,
 855                                       'linear' => 1 );
 856
 857 ok( $tradition, "Got a tradition object" );
 858 is( scalar $tradition->witnesses, 13, "Found all witnesses" );
 859 ok( $tradition->collation, "Tradition has a collation" );
 860
 861 my $c = $tradition->collation;
 862 is( scalar $c->readings, $READINGS, "Collation has all readings" );
 863 is( scalar $c->paths, $PATHS, "Collation has all paths" );
 864 is( scalar $c->relationships, 0, "Collation has all relationships" );
 865
 866 # Add a few relationships
 867 $c->add_relationship( 'w123', 'w125', { 'type' => 'collated' } );
 868 $c->add_relationship( 'w193', 'w196', { 'type' => 'collated' } );
 869 $c->add_relationship( 'w257', 'w262', { 'type' => 'transposition' } );
 870
 871 # Now write it to GraphML and parse it again.
 872
 873 my $graphml = $c->as_graphml;
 874 my $st = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml );
 875 is( scalar $st->collation->readings, $READINGS, "Reparsed collation has all readings" );
 876 is( scalar $st->collation->paths, $PATHS, "Reparsed collation has all paths" );
 877 is( scalar $st->collation->relationships, 3, "Reparsed collation has new relationships" );
 878
 879 =end testing
 880
 881 =cut
 882
 883 sub as_graphml {
 884     my( $self, $options ) = @_;
 885         $self->calculate_ranks unless $self->_graphcalc_done;
 886
 887         my $start = $options->{'from'}
 888                 ? $self->reading( $options->{'from'} ) : $self->start;
 889         my $end = $options->{'to'}
 890                 ? $self->reading( $options->{'to'} ) : $self->end;
 891         if( $start->has_rank && $end->has_rank && $end->rank < $start->rank ) {
 892                 throw( 'Start node must be before end node' );
 893         }
 894         # The readings need to be ranked for this to work.
 895         $start = $self->start unless $start->has_rank;
 896         $end = $self->end unless $end->has_rank;
 897         my $rankoffset = 0;
 898         unless( $start eq $self->start ) {
 899                 $rankoffset = $start->rank - 1;
 900         }
 901         my %use_readings;
 902
 903     # Some namespaces
 904     my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns';
 905     my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance';
 906     my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' .
 907         'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
 908
 909     # Create the document and root node
 910     my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
 911     my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
 912     $graphml->setDocumentElement( $root );
 913     $root->setNamespace( $xsi_ns, 'xsi', 0 );
 914     $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
 915
 916     # List of attribute types to save on our objects and their corresponding
 917     # GraphML types
 918     my %save_types = (
 919         'Str' => 'string',
 920         'Int' => 'int',
 921         'Bool' => 'boolean',
 922         'RelationshipType' => 'string',
 923         'RelationshipScope' => 'string',
 924     );
 925
 926     # List of attribute names *not* to save on our objects.
 927     # We will also not save any attribute beginning with _.
 928     my %skipsave;
 929     map { $skipsave{$_} = 1 } qw/ cached_svg /;
 930
 931     # Add the data keys for the graph. Include an extra key 'version' for the
 932     # GraphML output version.
 933     my %graph_data_keys;
 934     my $gdi = 0;
 935     my %graph_attributes = ( 'version' => 'string' );
 936         # Graph attributes include those of Tradition and those of Collation.
 937         my %gattr_from;
 938         my $tmeta = $self->tradition->meta;
 939         my $cmeta = $self->meta;
 940         map { $gattr_from{$_->name} = 'Tradition' } $tmeta->get_all_attributes;
 941         map { $gattr_from{$_->name} = 'Collation' } $cmeta->get_all_attributes;
 942         foreach my $attr ( ( $tmeta->get_all_attributes, $cmeta->get_all_attributes ) ) {
 943                 next if $attr->name =~ /^_/;
 944                 next if $skipsave{$attr->name};
 945                 next unless $save_types{$attr->type_constraint->name};
 946                 $graph_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
 947         }
 948
 949     foreach my $datum ( sort keys %graph_attributes ) {
 950         $graph_data_keys{$datum} = 'dg'.$gdi++;
 951         my $key = $root->addNewChild( $graphml_ns, 'key' );
 952         $key->setAttribute( 'attr.name', $datum );
 953         $key->setAttribute( 'attr.type', $graph_attributes{$datum} );
 954         $key->setAttribute( 'for', 'graph' );
 955         $key->setAttribute( 'id', $graph_data_keys{$datum} );
 956     }
 957
 958     # Add the data keys for reading nodes
 959     my %reading_attributes;
 960     my $rmeta = Text::Tradition::Collation::Reading->meta;
 961     foreach my $attr( $rmeta->get_all_attributes ) {
 962                 next if $attr->name =~ /^_/;
 963                 next if $skipsave{$attr->name};
 964                 next unless $save_types{$attr->type_constraint->name};
 965                 $reading_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
 966         }
 967     my %node_data_keys;
 968     my $ndi = 0;
 969     foreach my $datum ( sort keys %reading_attributes ) {
 970         $node_data_keys{$datum} = 'dn'.$ndi++;
 971         my $key = $root->addNewChild( $graphml_ns, 'key' );
 972         $key->setAttribute( 'attr.name', $datum );
 973         $key->setAttribute( 'attr.type', $reading_attributes{$datum} );
 974         $key->setAttribute( 'for', 'node' );
 975         $key->setAttribute( 'id', $node_data_keys{$datum} );
 976     }
 977
 978     # Add the data keys for edges, that is, paths and relationships. Path
 979     # data does not come from a Moose class so is here manually.
 980     my $edi = 0;
 981     my %edge_data_keys;
 982     my %edge_attributes = (
 983         witness => 'string',                    # ID/label for a path
 984         extra => 'boolean',                             # Path key
 985         );
 986     my @path_attributes = keys %edge_attributes; # track our manual additions
 987     my $pmeta = Text::Tradition::Collation::Relationship->meta;
 988     foreach my $attr( $pmeta->get_all_attributes ) {
 989                 next if $attr->name =~ /^_/;
 990                 next if $skipsave{$attr->name};
 991                 next unless $save_types{$attr->type_constraint->name};
 992                 $edge_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
 993         }
 994     foreach my $datum ( sort keys %edge_attributes ) {
 995         $edge_data_keys{$datum} = 'de'.$edi++;
 996         my $key = $root->addNewChild( $graphml_ns, 'key' );
 997         $key->setAttribute( 'attr.name', $datum );
 998         $key->setAttribute( 'attr.type', $edge_attributes{$datum} );
 999         $key->setAttribute( 'for', 'edge' );
1000         $key->setAttribute( 'id', $edge_data_keys{$datum} );
1001     }
1002
1003     # Add the collation graph itself
1004     my $sgraph = $root->addNewChild( $graphml_ns, 'graph' );
1005     $sgraph->setAttribute( 'edgedefault', 'directed' );
1006     $sgraph->setAttribute( 'id', $self->tradition->name );
1007     $sgraph->setAttribute( 'parse.edgeids', 'canonical' );
1008     $sgraph->setAttribute( 'parse.edges', scalar($self->paths) );
1009     $sgraph->setAttribute( 'parse.nodeids', 'canonical' );
1010     $sgraph->setAttribute( 'parse.nodes', scalar($self->readings) );
1011     $sgraph->setAttribute( 'parse.order', 'nodesfirst' );
1012
1013     # Collation attribute data
1014     foreach my $datum ( keys %graph_attributes ) {
1015         my $value;
1016         if( $datum eq 'version' ) {
1017                 $value = '3.1';
1018         } elsif( $gattr_from{$datum} eq 'Tradition' ) {
1019                 $value = $self->tradition->$datum;
1020         } else {
1021                 $value = $self->$datum;
1022         }
1023                 _add_graphml_data( $sgraph, $graph_data_keys{$datum}, $value );
1024         }
1025
1026     my $node_ctr = 0;
1027     my %node_hash;
1028     # Add our readings to the graph
1029     foreach my $n ( sort { $a->id cmp $b->id } $self->readings ) {
1030         next if $n->has_rank && $n ne $self->start && $n ne $self->end &&
1031                 ( $n->rank < $start->rank || $n->rank > $end->rank );
1032         $use_readings{$n->id} = 1;
1033         # Add to the main graph
1034         my $node_el = $sgraph->addNewChild( $graphml_ns, 'node' );
1035         my $node_xmlid = 'n' . $node_ctr++;
1036         $node_hash{ $n->id } = $node_xmlid;
1037         $node_el->setAttribute( 'id', $node_xmlid );
1038         foreach my $d ( keys %reading_attributes ) {
1039                 my $nval = $n->$d;
1040                 if( $rankoffset && $d eq 'rank' ) {
1041                         # Adjust the ranks within the subgraph.
1042                         $nval = $n eq $self->end ? $end->rank + 1 : $nval - $rankoffset;
1043                 }
1044                 _add_graphml_data( $node_el, $node_data_keys{$d}, $nval )
1045                         if defined $nval;
1046         }
1047     }
1048
1049     # Add the path edges to the sequence graph
1050     my $edge_ctr = 0;
1051     foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->sequence->edges() ) {
1052         # We add an edge in the graphml for every witness in $e.
1053         next unless( $use_readings{$e->[0]} || $use_readings{$e->[1]} );
1054         my @edge_wits = sort $self->path_witnesses( $e );
1055         $e->[0] = $self->start unless $use_readings{$e->[0]};
1056         $e->[1] = $self->end unless $use_readings{$e->[1]};
1057         foreach my $wit ( @edge_wits ) {
1058                         my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
1059                                                                                 $node_hash{ $e->[0] },
1060                                                                                 $node_hash{ $e->[1] } );
1061                         my $edge_el = $sgraph->addNewChild( $graphml_ns, 'edge' );
1062                         $edge_el->setAttribute( 'source', $from );
1063                         $edge_el->setAttribute( 'target', $to );
1064                         $edge_el->setAttribute( 'id', $id );
1065
1066                         # It's a witness path, so add the witness
1067                         my $base = $wit;
1068                         my $key = $edge_data_keys{'witness'};
1069                         # Is this an ante-corr witness?
1070                         my $aclabel = $self->ac_label;
1071                         if( $wit =~ /^(.*)\Q$aclabel\E$/ ) {
1072                                 # Keep the base witness
1073                                 $base = $1;
1074                                 # ...and record that this is an 'extra' reading path
1075                                 _add_graphml_data( $edge_el, $edge_data_keys{'extra'}, $aclabel );
1076                         }
1077                         _add_graphml_data( $edge_el, $edge_data_keys{'witness'}, $base );
1078                 }
1079         }
1080
1081         # Add the relationship graph to the XML
1082         map { delete $edge_data_keys{$_} } @path_attributes;
1083         $self->relations->_as_graphml( $graphml_ns, $root, \%node_hash,
1084                 $node_data_keys{'id'}, \%edge_data_keys );
1085
1086     # Save and return the thing
1087     my $result = decode_utf8( $graphml->toString(1) );
1088     return $result;
1089 }
1090
1091 sub _add_graphml_data {
1092     my( $el, $key, $value ) = @_;
1093     return unless defined $value;
1094     my $data_el = $el->addNewChild( $el->namespaceURI, 'data' );
1095     $data_el->setAttribute( 'key', $key );
1096     $data_el->appendText( $value );
1097 }
1098
1099 =head2 as_csv
1100
1101 Returns a CSV alignment table representation of the collation graph, one
1102 row per witness (or witness uncorrected.)
1103
1104 =cut
1105
1106 sub as_csv {
1107     my( $self ) = @_;
1108     my $table = $self->alignment_table;
1109     my $csv = Text::CSV->new( { binary => 1, quote_null => 0 } );
1110     my @result;
1111     # Make the header row
1112     $csv->combine( map { $_->{'witness'} } @{$table->{'alignment'}} );
1113         push( @result, decode_utf8( $csv->string ) );
1114     # Make the rest of the rows
1115     foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
1116         my @rowobjs = map { $_->{'tokens'}->[$idx] } @{$table->{'alignment'}};
1117         my @row = map { $_ ? $_->{'t'}->text : $_ } @rowobjs;
1118         $csv->combine( @row );
1119         push( @result, decode_utf8( $csv->string ) );
1120     }
1121     return join( "\n", @result );
1122 }
1123
1124 =head2 alignment_table( $use_refs, $include_witnesses )
1125
1126 Return a reference to an alignment table, in a slightly enhanced CollateX
1127 format which looks like this:
1128
1129  $table = { alignment => [ { witness => "SIGIL",
1130                              tokens => [ { t => "TEXT" }, ... ] },
1131                            { witness => "SIG2",
1132                              tokens => [ { t => "TEXT" }, ... ] },
1133                            ... ],
1134             length => TEXTLEN };
1135
1136 If $use_refs is set to 1, the reading object is returned in the table
1137 instead of READINGTEXT; if not, the text of the reading is returned.
1138
1139 If $include_witnesses is set to a hashref, only the witnesses whose sigil
1140 keys have a true hash value will be included.
1141
1142 =cut
1143
1144 sub alignment_table {
1145     my( $self ) = @_;
1146     $self->calculate_ranks() unless $self->_graphcalc_done;
1147     return $self->cached_table if $self->has_cached_table;
1148
1149     # Make sure we can do this
1150         throw( "Need a linear graph in order to make an alignment table" )
1151                 unless $self->linear;
1152         $self->calculate_ranks unless $self->end->has_rank;
1153
1154     my $table = { 'alignment' => [], 'length' => $self->end->rank - 1 };
1155     my @all_pos = ( 1 .. $self->end->rank - 1 );
1156     foreach my $wit ( sort { $a->sigil cmp $b->sigil } $self->tradition->witnesses ) {
1157         # print STDERR "Making witness row(s) for " . $wit->sigil . "\n";
1158         my @wit_path = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
1159         my @row = _make_witness_row( \@wit_path, \@all_pos );
1160         push( @{$table->{'alignment'}},
1161                 { 'witness' => $wit->sigil, 'tokens' => \@row } );
1162         if( $wit->is_layered ) {
1163                 my @wit_ac_path = $self->reading_sequence( $self->start, $self->end,
1164                         $wit->sigil.$self->ac_label );
1165             my @ac_row = _make_witness_row( \@wit_ac_path, \@all_pos );
1166                         push( @{$table->{'alignment'}},
1167                                 { 'witness' => $wit->sigil.$self->ac_label, 'tokens' => \@ac_row } );
1168         }
1169     }
1170     $self->cached_table( $table );
1171     return $table;
1172 }
1173
1174 sub _make_witness_row {
1175     my( $path, $positions ) = @_;
1176     my %char_hash;
1177     map { $char_hash{$_} = undef } @$positions;
1178     my $debug = 0;
1179     foreach my $rdg ( @$path ) {
1180         my $rtext = $rdg->text;
1181         $rtext = '#LACUNA#' if $rdg->is_lacuna;
1182         print STDERR "rank " . $rdg->rank . "\n" if $debug;
1183         # print STDERR "No rank for " . $rdg->id . "\n" unless defined $rdg->rank;
1184         $char_hash{$rdg->rank} = { 't' => $rdg };
1185     }
1186     my @row = map { $char_hash{$_} } @$positions;
1187     # Fill in lacuna markers for undef spots in the row
1188     my $last_el = shift @row;
1189     my @filled_row = ( $last_el );
1190     foreach my $el ( @row ) {
1191         # If we are using node reference, make the lacuna node appear many times
1192         # in the table.  If not, use the lacuna tag.
1193         if( $last_el && $last_el->{'t'}->is_lacuna && !defined $el ) {
1194             $el = $last_el;
1195         }
1196         push( @filled_row, $el );
1197         $last_el = $el;
1198     }
1199     return @filled_row;
1200 }
1201
1202 =head1 NAVIGATION METHODS
1203
1204 =head2 reading_sequence( $first, $last, $sigil, $backup )
1205
1206 Returns the ordered list of readings, starting with $first and ending
1207 with $last, for the witness given in $sigil. If a $backup sigil is
1208 specified (e.g. when walking a layered witness), it will be used wherever
1209 no $sigil path exists.  If there is a base text reading, that will be
1210 used wherever no path exists for $sigil or $backup.
1211
1212 =cut
1213
1214 # TODO Think about returning some lazy-eval iterator.
1215 # TODO Get rid of backup; we should know from what witness is whether we need it.
1216
1217 sub reading_sequence {
1218     my( $self, $start, $end, $witness ) = @_;
1219
1220     $witness = $self->baselabel unless $witness;
1221     my @readings = ( $start );
1222     my %seen;
1223     my $n = $start;
1224     while( $n && $n->id ne $end->id ) {
1225         if( exists( $seen{$n->id} ) ) {
1226             throw( "Detected loop for $witness at " . $n->id );
1227         }
1228         $seen{$n->id} = 1;
1229
1230         my $next = $self->next_reading( $n, $witness );
1231         unless( $next ) {
1232             throw( "Did not find any path for $witness from reading " . $n->id );
1233         }
1234         push( @readings, $next );
1235         $n = $next;
1236     }
1237     # Check that the last reading is our end reading.
1238     my $last = $readings[$#readings];
1239     throw( "Last reading found from " . $start->text .
1240         " for witness $witness is not the end!" ) # TODO do we get this far?
1241         unless $last->id eq $end->id;
1242
1243     return @readings;
1244 }
1245
1246 =head2 next_reading( $reading, $sigil );
1247
1248 Returns the reading that follows the given reading along the given witness
1249 path.
1250
1251 =cut
1252
1253 sub next_reading {
1254     # Return the successor via the corresponding path.
1255     my $self = shift;
1256     my $answer = $self->_find_linked_reading( 'next', @_ );
1257         return undef unless $answer;
1258     return $self->reading( $answer );
1259 }
1260
1261 =head2 prior_reading( $reading, $sigil )
1262
1263 Returns the reading that precedes the given reading along the given witness
1264 path.
1265
1266 =cut
1267
1268 sub prior_reading {
1269     # Return the predecessor via the corresponding path.
1270     my $self = shift;
1271     my $answer = $self->_find_linked_reading( 'prior', @_ );
1272     return $self->reading( $answer );
1273 }
1274
1275 sub _find_linked_reading {
1276     my( $self, $direction, $node, $path ) = @_;
1277
1278     # Get a backup if we are dealing with a layered witness
1279     my $alt_path;
1280     my $aclabel = $self->ac_label;
1281     if( $path && $path =~ /^(.*)\Q$aclabel\E$/ ) {
1282         $alt_path = $1;
1283     }
1284
1285     my @linked_paths = $direction eq 'next'
1286         ? $self->sequence->edges_from( $node )
1287         : $self->sequence->edges_to( $node );
1288     return undef unless scalar( @linked_paths );
1289
1290     # We have to find the linked path that contains all of the
1291     # witnesses supplied in $path.
1292     my( @path_wits, @alt_path_wits );
1293     @path_wits = sort( $self->_witnesses_of_label( $path ) ) if $path;
1294     @alt_path_wits = sort( $self->_witnesses_of_label( $alt_path ) ) if $alt_path;
1295     my $base_le;
1296     my $alt_le;
1297     foreach my $le ( @linked_paths ) {
1298         if( $self->sequence->has_edge_attribute( @$le, $self->baselabel ) ) {
1299             $base_le = $le;
1300         }
1301                 my @le_wits = sort $self->path_witnesses( $le );
1302                 if( _is_within( \@path_wits, \@le_wits ) ) {
1303                         # This is the right path.
1304                         return $direction eq 'next' ? $le->[1] : $le->[0];
1305                 } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) {
1306                         $alt_le = $le;
1307                 }
1308     }
1309     # Got this far? Return the alternate path if it exists.
1310     return $direction eq 'next' ? $alt_le->[1] : $alt_le->[0]
1311         if $alt_le;
1312
1313     # Got this far? Return the base path if it exists.
1314     return $direction eq 'next' ? $base_le->[1] : $base_le->[0]
1315         if $base_le;
1316
1317     # Got this far? We have no appropriate path.
1318     warn "Could not find $direction node from " . $node->id
1319         . " along path $path";
1320     return undef;
1321 }
1322
1323 # Some set logic.
1324 sub _is_within {
1325     my( $set1, $set2 ) = @_;
1326     my $ret = @$set1; # will be 0, i.e. false, if set1 is empty
1327     foreach my $el ( @$set1 ) {
1328         $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
1329     }
1330     return $ret;
1331 }
1332
1333 # Return the string that joins together a list of witnesses for
1334 # display on a single path.
1335 sub _witnesses_of_label {
1336     my( $self, $label ) = @_;
1337     my $regex = $self->wit_list_separator;
1338     my @answer = split( /\Q$regex\E/, $label );
1339     return @answer;
1340 }
1341
1342 =head2 common_readings
1343
1344 Returns the list of common readings in the graph (i.e. those readings that are
1345 shared by all non-lacunose witnesses.)
1346
1347 =cut
1348
1349 sub common_readings {
1350         my $self = shift;
1351         my @common = grep { $_->is_common } $self->readings;
1352         return @common;
1353 }
1354
1355 =head2 path_text( $sigil, [, $start, $end ] )
1356
1357 Returns the text of a witness (plus its backup, if we are using a layer)
1358 as stored in the collation.  The text is returned as a string, where the
1359 individual readings are joined with spaces and the meta-readings (e.g.
1360 lacunae) are omitted.  Optional specification of $start and $end allows
1361 the generation of a subset of the witness text.
1362
1363 =cut
1364
1365 sub path_text {
1366         my( $self, $wit, $start, $end ) = @_;
1367         $start = $self->start unless $start;
1368         $end = $self->end unless $end;
1369         my @path = grep { !$_->is_meta } $self->reading_sequence( $start, $end, $wit );
1370         my $pathtext = '';
1371         my $last;
1372         foreach my $r ( @path ) {
1373                 if( $r->join_prior || !$last || $last->join_next ) {
1374                         $pathtext .= $r->text;
1375                 } else {
1376                         $pathtext .= ' ' . $r->text;
1377                 }
1378                 $last = $r;
1379         }
1380         return $pathtext;
1381 }
1382
1383 =head1 INITIALIZATION METHODS
1384
1385 These are mostly for use by parsers.
1386
1387 =head2 make_witness_path( $witness )
1388
1389 Link the array of readings contained in $witness->path (and in
1390 $witness->uncorrected_path if it exists) into collation paths.
1391 Clear out the arrays when finished.
1392
1393 =head2 make_witness_paths
1394
1395 Call make_witness_path for all witnesses in the tradition.
1396
1397 =cut
1398
1399 # For use when a collation is constructed from a base text and an apparatus.
1400 # We have the sequences of readings and just need to add path edges.
1401 # When we are done, clear out the witness path attributes, as they are no
1402 # longer needed.
1403 # TODO Find a way to replace the witness path attributes with encapsulated functions?
1404
1405 sub make_witness_paths {
1406     my( $self ) = @_;
1407     foreach my $wit ( $self->tradition->witnesses ) {
1408         # print STDERR "Making path for " . $wit->sigil . "\n";
1409         $self->make_witness_path( $wit );
1410     }
1411 }
1412
1413 sub make_witness_path {
1414     my( $self, $wit ) = @_;
1415     my @chain = @{$wit->path};
1416     my $sig = $wit->sigil;
1417     # Add start and end if necessary
1418     unshift( @chain, $self->start ) unless $chain[0] eq $self->start;
1419     push( @chain, $self->end ) unless $chain[-1] eq $self->end;
1420     foreach my $idx ( 0 .. $#chain-1 ) {
1421         $self->add_path( $chain[$idx], $chain[$idx+1], $sig );
1422     }
1423     if( $wit->is_layered ) {
1424         @chain = @{$wit->uncorrected_path};
1425                 unshift( @chain, $self->start ) unless $chain[0] eq $self->start;
1426                 push( @chain, $self->end ) unless $chain[-1] eq $self->end;
1427         foreach my $idx( 0 .. $#chain-1 ) {
1428             my $source = $chain[$idx];
1429             my $target = $chain[$idx+1];
1430             $self->add_path( $source, $target, $sig.$self->ac_label )
1431                 unless $self->has_path( $source, $target, $sig );
1432         }
1433     }
1434     $wit->clear_path;
1435     $wit->clear_uncorrected_path;
1436 }
1437
1438 =head2 equivalence_graph( \%readingmap, $startrank, $endrank, @testvector )
1439
1440 Returns an equivalence graph of the collation, in which all readings
1441 related via a 'colocated' relationship are transformed into a single
1442 vertex. Can be used to determine the validity of a new relationship.  The
1443 mapping between equivalence vertices and reading IDs will be stored in the
1444 hash whose reference is passed as readingmap. For a subset of the graph,
1445 pass in a start and/or an ending rank (this only works if L<calculate_ranks>
1446 has been called at least once.)
1447
1448 It is also possible to pass in a test relationship in @testvector, and get
1449 the resulting equivalence graph before the relationship has been made.
1450
1451 =cut
1452
1453 sub equivalence_graph {
1454         my( $self, $map, $start, $end, @newvector ) = @_;
1455         $start = undef unless $self->end->has_rank;
1456         $end = undef unless $self->end->has_rank;
1457         my $eqgraph = Graph->new();
1458     my $rel_ctr = 0;
1459     # Add the nodes
1460     foreach my $r ( $self->readings ) {
1461         unless( $r eq $self->start || $r eq $self->end ) {
1462                         next if $start && $r->rank < $start;
1463                         next if $end && $r->rank > $end;
1464         }
1465                 next if exists $map->{$r->id};
1466         my @rels = $self->related_readings( $r->id, 'colocated' );
1467         push( @rels, $r->id );
1468                 # Make an equivalence vertex
1469                 my $rn = 'equivalence_' . $rel_ctr++;
1470                 $eqgraph->add_vertex( $rn );
1471                 # Note which readings belong to this vertex.
1472                 push( @rels, $r->id );
1473                 foreach( @rels ) {
1474                         $map->{$_} = $rn;
1475                 }
1476     }
1477
1478     # Add the edges.
1479     foreach my $p ( $self->paths ) {
1480         my $efrom = exists $map->{$p->[0]} ? $map->{$p->[0]}
1481                 : $map->{$self->start->id};
1482         my $eto = exists $map->{$p->[1]} ? $map->{$p->[1]}
1483                 : $map->{$self->end->id};
1484         $eqgraph->add_edge( $efrom, $eto );
1485     }
1486
1487     # Collapse the vertices in @newvector if applicable.
1488     if( @newvector ) {
1489                 my( $eqs, $eqt ) = map { $map->{$_} } @newvector;
1490                 $DB::single = 1 unless $eqs && $eqt;
1491                 unless( $eqs eq $eqt ) {
1492                         # Combine the vertices.
1493                         map { $eqgraph->add_edge( $eqs, $_ ) } $eqgraph->successors( $eqt );
1494                         map { $eqgraph->add_edge( $_, $eqs ) } $eqgraph->predecessors( $eqt );
1495                         $eqgraph->delete_vertex( $eqt );
1496                 }
1497         }
1498     return $eqgraph;
1499 }
1500
1501 =head2 calculate_ranks
1502
1503 Calculate the reading ranks (that is, their aligned positions relative
1504 to each other) for the graph.  This can only be called on linear collations.
1505
1506 =begin testing
1507
1508 use Text::Tradition;
1509
1510 my $cxfile = 't/data/Collatex-16.xml';
1511 my $t = Text::Tradition->new(
1512     'name'  => 'inline',
1513     'input' => 'CollateX',
1514     'file'  => $cxfile,
1515     );
1516 my $c = $t->collation;
1517
1518 # Make an svg
1519 my $table = $c->alignment_table;
1520 ok( $c->has_cached_table, "Alignment table was cached" );
1521 is( $c->alignment_table, $table, "Cached table returned upon second call" );
1522 $c->calculate_ranks;
1523 is( $c->alignment_table, $table, "Cached table retained with no rank change" );
1524 $c->add_relationship( 'n24', 'n23', { 'type' => 'spelling' } );
1525 isnt( $c->alignment_table, $table, "Alignment table changed after relationship add" );
1526
1527 =end testing
1528
1529 =cut
1530
1531 sub calculate_ranks {
1532     my $self = shift;
1533     # Save the existing ranks, in case we need to invalidate the cached SVG.
1534     my %existing_ranks;
1535     map { $existing_ranks{$_} = $_->rank } $self->readings;
1536     # Walk a version of the graph where every node linked by a relationship
1537     # edge is fundamentally the same node, and do a topological ranking on
1538     # the nodes in this graph.
1539     my %rel_containers;
1540     my $topo_graph = $self->equivalence_graph( \%rel_containers );
1541
1542     # Now do the rankings, starting with the start node.
1543     my $topo_start = $rel_containers{$self->start->id};
1544     my $node_ranks = { $topo_start => 0 };
1545     my @curr_origin = ( $topo_start );
1546     # A little iterative function.
1547     while( @curr_origin ) {
1548         @curr_origin = _assign_rank( $topo_graph, $node_ranks, @curr_origin );
1549     }
1550     # Transfer our rankings from the topological graph to the real one.
1551     foreach my $r ( $self->readings ) {
1552         if( defined $node_ranks->{$rel_containers{$r->id}} ) {
1553             $r->rank( $node_ranks->{$rel_containers{$r->id}} );
1554         } else {
1555                 # Die. Find the last rank we calculated.
1556                 my @all_defined = sort { ( $node_ranks->{$rel_containers{$a->id}}||-1 )
1557                                  <=> ( $node_ranks->{$rel_containers{$b->id}}||-1 ) }
1558                         $self->readings;
1559                 my $last = pop @all_defined;
1560             throw( "Ranks not calculated after $last - do you have a cycle in the graph?" );
1561         }
1562     }
1563     # Do we need to invalidate the cached data?
1564     if( $self->has_cached_svg || $self->has_cached_table ) {
1565         foreach my $r ( $self->readings ) {
1566                 next if defined( $existing_ranks{$r} )
1567                         && $existing_ranks{$r} == $r->rank;
1568                 # Something has changed, so clear the cache
1569                 $self->_clear_cache;
1570                         # ...and recalculate the common readings.
1571                         $self->calculate_common_readings();
1572                 last;
1573         }
1574     }
1575         # The graph calculation information is now up to date.
1576         $self->_graphcalc_done(1);
1577 }
1578
1579 sub _assign_rank {
1580     my( $graph, $node_ranks, @current_nodes ) = @_;
1581     # Look at each of the children of @current_nodes.  If all the child's
1582     # parents have a rank, assign it the highest rank + 1 and add it to
1583     # @next_nodes.  Otherwise skip it; we will return when the highest-ranked
1584     # parent gets a rank.
1585     my @next_nodes;
1586     foreach my $c ( @current_nodes ) {
1587         warn "Current reading $c has no rank!"
1588             unless exists $node_ranks->{$c};
1589         # print STDERR "Looking at child of node $c, rank "
1590         #     . $node_ranks->{$c} . "\n";
1591         foreach my $child ( $graph->successors( $c ) ) {
1592             next if exists $node_ranks->{$child};
1593             my $highest_rank = -1;
1594             my $skip = 0;
1595             foreach my $parent ( $graph->predecessors( $child ) ) {
1596                 if( exists $node_ranks->{$parent} ) {
1597                     $highest_rank = $node_ranks->{$parent}
1598                         if $highest_rank <= $node_ranks->{$parent};
1599                 } else {
1600                     $skip = 1;
1601                     last;
1602                 }
1603             }
1604             next if $skip;
1605             my $c_rank = $highest_rank + 1;
1606             # print STDERR "Assigning rank $c_rank to node $child \n";
1607             $node_ranks->{$child} = $c_rank;
1608             push( @next_nodes, $child );
1609         }
1610     }
1611     return @next_nodes;
1612 }
1613
1614 sub _clear_cache {
1615         my $self = shift;
1616         $self->wipe_svg if $self->has_cached_svg;
1617         $self->wipe_table if $self->has_cached_table;
1618 }
1619
1620
1621 =head2 flatten_ranks
1622
1623 A convenience method for parsing collation data.  Searches the graph for readings
1624 with the same text at the same rank, and merges any that are found.
1625
1626 =cut
1627
1628 sub flatten_ranks {
1629     my $self = shift;
1630     my %unique_rank_rdg;
1631     my $changed;
1632     foreach my $rdg ( $self->readings ) {
1633         next unless $rdg->has_rank;
1634         my $key = $rdg->rank . "||" . $rdg->text;
1635         if( exists $unique_rank_rdg{$key} ) {
1636             # Combine!
1637                 # print STDERR "Combining readings at same rank: $key\n";
1638                 $changed = 1;
1639             $self->merge_readings( $unique_rank_rdg{$key}, $rdg );
1640             # TODO see if this now makes a common point.
1641         } else {
1642             $unique_rank_rdg{$key} = $rdg;
1643         }
1644     }
1645     # If we merged readings, the ranks are still fine but the alignment
1646     # table is wrong. Wipe it.
1647     $self->wipe_table() if $changed;
1648 }
1649
1650
1651 =head2 calculate_common_readings
1652
1653 Goes through the graph identifying the readings that appear in every witness
1654 (apart from those with lacunae at that spot.) Marks them as common and returns
1655 the list.
1656
1657 =begin testing
1658
1659 use Text::Tradition;
1660
1661 my $cxfile = 't/data/Collatex-16.xml';
1662 my $t = Text::Tradition->new(
1663     'name'  => 'inline',
1664     'input' => 'CollateX',
1665     'file'  => $cxfile,
1666     );
1667 my $c = $t->collation;
1668
1669 my @common = $c->calculate_common_readings();
1670 is( scalar @common, 8, "Found correct number of common readings" );
1671 my @marked = sort $c->common_readings();
1672 is( scalar @common, 8, "All common readings got marked as such" );
1673 my @expected = qw/ n1 n11 n16 n19 n20 n5 n6 n7 /;
1674 is_deeply( \@marked, \@expected, "Found correct list of common readings" );
1675
1676 =end testing
1677
1678 =cut
1679
1680 sub calculate_common_readings {
1681         my $self = shift;
1682         my @common;
1683         map { $_->is_common( 0 ) } $self->readings;
1684         # Implicitly calls calculate_ranks
1685         my $table = $self->alignment_table;
1686         foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
1687                 my @row = map { $_->{'tokens'}->[$idx]
1688                                                         ? $_->{'tokens'}->[$idx]->{'t'} : '' }
1689                                         @{$table->{'alignment'}};
1690                 my %hash;
1691                 foreach my $r ( @row ) {
1692                         if( $r ) {
1693                                 $hash{$r->id} = $r unless $r->is_meta;
1694                         } else {
1695                                 $hash{'UNDEF'} = $r;
1696                         }
1697                 }
1698                 if( keys %hash == 1 && !exists $hash{'UNDEF'} ) {
1699                         my( $r ) = values %hash;
1700                         $r->is_common( 1 );
1701                         push( @common, $r );
1702                 }
1703         }
1704         return @common;
1705 }
1706
1707 =head2 text_from_paths
1708
1709 Calculate the text array for all witnesses from the path, for later consistency
1710 checking.  Only to be used if there is no non-graph-based way to know the
1711 original texts.
1712
1713 =cut
1714
1715 sub text_from_paths {
1716         my $self = shift;
1717     foreach my $wit ( $self->tradition->witnesses ) {
1718         my @readings = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
1719         my @text;
1720         foreach my $r ( @readings ) {
1721                 next if $r->is_meta;
1722                 push( @text, $r->text );
1723         }
1724         $wit->text( \@text );
1725         if( $wit->is_layered ) {
1726                         my @ucrdgs = $self->reading_sequence( $self->start, $self->end,
1727                                                                                                   $wit->sigil.$self->ac_label );
1728                         my @uctext;
1729                         foreach my $r ( @ucrdgs ) {
1730                                 next if $r->is_meta;
1731                                 push( @uctext, $r->text );
1732                         }
1733                         $wit->layertext( \@uctext );
1734         }
1735     }
1736 }
1737
1738 =head1 UTILITY FUNCTIONS
1739
1740 =head2 common_predecessor( $reading_a, $reading_b )
1741
1742 Find the last reading that occurs in sequence before both the given readings.
1743 At the very least this should be $self->start.
1744
1745 =head2 common_successor( $reading_a, $reading_b )
1746
1747 Find the first reading that occurs in sequence after both the given readings.
1748 At the very least this should be $self->end.
1749
1750 =begin testing
1751
1752 use Text::Tradition;
1753
1754 my $cxfile = 't/data/Collatex-16.xml';
1755 my $t = Text::Tradition->new(
1756     'name'  => 'inline',
1757     'input' => 'CollateX',
1758     'file'  => $cxfile,
1759     );
1760 my $c = $t->collation;
1761
1762 is( $c->common_predecessor( 'n24', 'n23' )->id,
1763     'n20', "Found correct common predecessor" );
1764 is( $c->common_successor( 'n24', 'n23' )->id,
1765     '#END#', "Found correct common successor" );
1766
1767 is( $c->common_predecessor( 'n19', 'n17' )->id,
1768     'n16', "Found correct common predecessor for readings on same path" );
1769 is( $c->common_successor( 'n21', 'n10' )->id,
1770     '#END#', "Found correct common successor for readings on same path" );
1771
1772 =end testing
1773
1774 =cut
1775
1776 ## Return the closest reading that is a predecessor of both the given readings.
1777 sub common_predecessor {
1778         my $self = shift;
1779         my( $r1, $r2 ) = $self->_objectify_args( @_ );
1780         return $self->_common_in_path( $r1, $r2, 'predecessors' );
1781 }
1782
1783 sub common_successor {
1784         my $self = shift;
1785         my( $r1, $r2 ) = $self->_objectify_args( @_ );
1786         return $self->_common_in_path( $r1, $r2, 'successors' );
1787 }
1788
1789
1790 # TODO think about how to do this without ranks...
1791 sub _common_in_path {
1792         my( $self, $r1, $r2, $dir ) = @_;
1793         my $iter = $self->end->rank;
1794         my @candidates;
1795         my @last_r1 = ( $r1 );
1796         my @last_r2 = ( $r2 );
1797         # my %all_seen = ( $r1 => 'r1', $r2 => 'r2' );
1798         my %all_seen;
1799         # print STDERR "Finding common $dir for $r1, $r2\n";
1800         while( !@candidates ) {
1801                 last unless $iter--;  # Avoid looping infinitely
1802                 # Iterate separately down the graph from r1 and r2
1803                 my( @new_lc1, @new_lc2 );
1804                 foreach my $lc ( @last_r1 ) {
1805                         foreach my $p ( $lc->$dir ) {
1806                                 if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r1' ) {
1807                                         # print STDERR "Path candidate $p from $lc\n";
1808                                         push( @candidates, $p );
1809                                 } elsif( !$all_seen{$p->id} ) {
1810                                         $all_seen{$p->id} = 'r1';
1811                                         push( @new_lc1, $p );
1812                                 }
1813                         }
1814                 }
1815                 foreach my $lc ( @last_r2 ) {
1816                         foreach my $p ( $lc->$dir ) {
1817                                 if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r2' ) {
1818                                         # print STDERR "Path candidate $p from $lc\n";
1819                                         push( @candidates, $p );
1820                                 } elsif( !$all_seen{$p->id} ) {
1821                                         $all_seen{$p->id} = 'r2';
1822                                         push( @new_lc2, $p );
1823                                 }
1824                         }
1825                 }
1826                 @last_r1 = @new_lc1;
1827                 @last_r2 = @new_lc2;
1828         }
1829         my @answer = sort { $a->rank <=> $b->rank } @candidates;
1830         return $dir eq 'predecessors' ? pop( @answer ) : shift ( @answer );
1831 }
1832
1833 sub throw {
1834         Text::Tradition::Error->throw(
1835                 'ident' => 'Collation error',
1836                 'message' => $_[0],
1837                 );
1838 }
1839
1840 no Moose;
1841 __PACKAGE__->meta->make_immutable;
1842
1843 =head1 LICENSE
1844
1845 This package is free software and is provided "as is" without express
1846 or implied warranty.  You can redistribute it and/or modify it under
1847 the same terms as Perl itself.
1848
1849 =head1 AUTHOR
1850
1851 Tara L Andrews E<lt>aurum@cpan.orgE<gt>