lib/Text/Tradition/Collation.pm

   1 package Text::Tradition::Collation;
   2
   3 use Encode qw( decode_utf8 );
   4 use File::Temp;
   5 use File::Which;
   6 use Graph;
   7 use IPC::Run qw( run binary );
   8 use Text::CSV;
   9 use Text::Tradition::Collation::Reading;
  10 use Text::Tradition::Collation::RelationshipStore;
  11 use Text::Tradition::Error;
  12 use XML::Easy::Syntax qw( $xml10_namestartchar_rx $xml10_namechar_rx );
  13 use XML::LibXML;
  14 use XML::LibXML::XPathContext;
  15 use Moose;
  16
  17 has 'sequence' => (
  18     is => 'ro',
  19     isa => 'Graph',
  20     default => sub { Graph->new() },
  21     handles => {
  22         paths => 'edges',
  23     },
  24     );
  25
  26 has 'relations' => (
  27         is => 'ro',
  28         isa => 'Text::Tradition::Collation::RelationshipStore',
  29         handles => {
  30                 relationships => 'relationships',
  31                 related_readings => 'related_readings',
  32                 get_relationship => 'get_relationship',
  33                 del_relationship => 'del_relationship',
  34                 equivalence => 'equivalence',
  35                 equivalence_graph => 'equivalence_graph',
  36         },
  37         writer => '_set_relations',
  38         );
  39
  40 has 'tradition' => (
  41     is => 'ro',
  42     isa => 'Text::Tradition',
  43     writer => '_set_tradition',
  44     weak_ref => 1,
  45     );
  46
  47 has 'readings' => (
  48         isa => 'HashRef[Text::Tradition::Collation::Reading]',
  49         traits => ['Hash'],
  50     handles => {
  51         reading     => 'get',
  52         _add_reading => 'set',
  53         del_reading => 'delete',
  54         has_reading => 'exists',
  55         readings   => 'values',
  56     },
  57     default => sub { {} },
  58         );
  59
  60 has 'wit_list_separator' => (
  61     is => 'rw',
  62     isa => 'Str',
  63     default => ', ',
  64     );
  65
  66 has 'baselabel' => (
  67     is => 'rw',
  68     isa => 'Str',
  69     default => 'base text',
  70     );
  71
  72 has 'linear' => (
  73     is => 'rw',
  74     isa => 'Bool',
  75     default => 1,
  76     );
  77
  78 has 'ac_label' => (
  79     is => 'rw',
  80     isa => 'Str',
  81     default => ' (a.c.)',
  82     );
  83
  84 has 'wordsep' => (
  85         is => 'rw',
  86         isa => 'Str',
  87         default => ' ',
  88         );
  89
  90 has 'start' => (
  91         is => 'ro',
  92         isa => 'Text::Tradition::Collation::Reading',
  93         writer => '_set_start',
  94         weak_ref => 1,
  95         );
  96
  97 has 'end' => (
  98         is => 'ro',
  99         isa => 'Text::Tradition::Collation::Reading',
 100         writer => '_set_end',
 101         weak_ref => 1,
 102         );
 103
 104 has 'cached_svg' => (
 105         is => 'rw',
 106         isa => 'Str',
 107         predicate => 'has_cached_svg',
 108         clearer => 'wipe_svg',
 109         );
 110
 111 has 'cached_table' => (
 112         is => 'rw',
 113         isa => 'HashRef',
 114         predicate => 'has_cached_table',
 115         clearer => 'wipe_table',
 116         );
 117
 118 has '_graphcalc_done' => (
 119         is => 'rw',
 120         isa => 'Bool',
 121         default => undef,
 122         );
 123
 124 =head1 NAME
 125
 126 Text::Tradition::Collation - a software model for a text collation
 127
 128 =head1 SYNOPSIS
 129
 130   use Text::Tradition;
 131   my $t = Text::Tradition->new(
 132     'name' => 'this is a text',
 133     'input' => 'TEI',
 134     'file' => '/path/to/tei_parallel_seg_file.xml' );
 135
 136   my $c = $t->collation;
 137   my @readings = $c->readings;
 138   my @paths = $c->paths;
 139   my @relationships = $c->relationships;
 140
 141   my $svg_variant_graph = $t->collation->as_svg();
 142
 143 =head1 DESCRIPTION
 144
 145 Text::Tradition is a library for representation and analysis of collated
 146 texts, particularly medieval ones.  The Collation is the central feature of
 147 a Tradition, where the text, its sequence of readings, and its relationships
 148 between readings are actually kept.
 149
 150 =head1 CONSTRUCTOR
 151
 152 =head2 new
 153
 154 The constructor.  Takes a hash or hashref of the following arguments:
 155
 156 =over
 157
 158 =item * tradition - The Text::Tradition object to which the collation
 159 belongs. Required.
 160
 161 =item * linear - Whether the collation should be linear; that is, whether
 162 transposed readings should be treated as two linked readings rather than one,
 163 and therefore whether the collation graph is acyclic.  Defaults to true.
 164
 165 =item * baselabel - The default label for the path taken by a base text
 166 (if any). Defaults to 'base text'.
 167
 168 =item * wit_list_separator - The string to join a list of witnesses for
 169 purposes of making labels in display graphs.  Defaults to ', '.
 170
 171 =item * ac_label - The extra label to tack onto a witness sigil when
 172 representing another layer of path for the given witness - that is, when
 173 a text has more than one possible reading due to scribal corrections or
 174 the like.  Defaults to ' (a.c.)'.
 175
 176 =item * wordsep - The string used to separate words in the original text.
 177 Defaults to ' '.
 178
 179 =back
 180
 181 =head1 ACCESSORS
 182
 183 =head2 tradition
 184
 185 =head2 linear
 186
 187 =head2 wit_list_separator
 188
 189 =head2 baselabel
 190
 191 =head2 ac_label
 192
 193 =head2 wordsep
 194
 195 Simple accessors for collation attributes.
 196
 197 =head2 start
 198
 199 The meta-reading at the start of every witness path.
 200
 201 =head2 end
 202
 203 The meta-reading at the end of every witness path.
 204
 205 =head2 readings
 206
 207 Returns all Reading objects in the graph.
 208
 209 =head2 reading( $id )
 210
 211 Returns the Reading object corresponding to the given ID.
 212
 213 =head2 add_reading( $reading_args )
 214
 215 Adds a new reading object to the collation.
 216 See L<Text::Tradition::Collation::Reading> for the available arguments.
 217
 218 =head2 del_reading( $object_or_id )
 219
 220 Removes the given reading from the collation, implicitly removing its
 221 paths and relationships.
 222
 223 =head2 merge_readings( $main, $second, $concatenate, $with_str )
 224
 225 Merges the $second reading into the $main one. If $concatenate is true, then
 226 the merged node will carry the text of both readings, concatenated with either
 227 $with_str (if specified) or a sensible default (the empty string if the
 228 appropriate 'join_*' flag is set on either reading, or else $self->wordsep.)
 229
 230 The first two arguments may be either readings or reading IDs.
 231
 232 =head2 has_reading( $id )
 233
 234 Predicate to see whether a given reading ID is in the graph.
 235
 236 =head2 reading_witnesses( $object_or_id )
 237
 238 Returns a list of sigils whose witnesses contain the reading.
 239
 240 =head2 paths
 241
 242 Returns all reading paths within the document - that is, all edges in the
 243 collation graph.  Each path is an arrayref of [ $source, $target ] reading IDs.
 244
 245 =head2 add_path( $source, $target, $sigil )
 246
 247 Links the given readings in the collation in sequence, under the given witness
 248 sigil.  The readings may be specified by object or ID.
 249
 250 =head2 del_path( $source, $target, $sigil )
 251
 252 Links the given readings in the collation in sequence, under the given witness
 253 sigil.  The readings may be specified by object or ID.
 254
 255 =head2 has_path( $source, $target );
 256
 257 Returns true if the two readings are linked in sequence in any witness.
 258 The readings may be specified by object or ID.
 259
 260 =head2 relationships
 261
 262 Returns all Relationship objects in the collation.
 263
 264 =head2 add_relationship( $reading, $other_reading, $options )
 265
 266 Adds a new relationship of the type given in $options between the two readings,
 267 which may be specified by object or ID.  Returns a value of ( $status, @vectors)
 268 where $status is true on success, and @vectors is a list of relationship edges
 269 that were ultimately added.
 270 See L<Text::Tradition::Collation::Relationship> for the available options.
 271
 272 =cut
 273
 274 sub BUILD {
 275     my $self = shift;
 276     $self->_set_relations( Text::Tradition::Collation::RelationshipStore->new( 'collation' => $self ) );
 277     $self->_set_start( $self->add_reading( { 'collation' => $self, 'is_start' => 1 } ) );
 278     $self->_set_end( $self->add_reading( { 'collation' => $self, 'is_end' => 1 } ) );
 279 }
 280
 281 ### Reading construct/destruct functions
 282
 283 sub add_reading {
 284         my( $self, $reading ) = @_;
 285         unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
 286                 my %args = %$reading;
 287                 if( $self->tradition->has_language && !exists $args{'language'} ) {
 288                         $args{'language'} = $self->tradition->language;
 289                 }
 290                 $reading = Text::Tradition::Collation::Reading->new(
 291                         'collation' => $self,
 292                         %args );
 293         }
 294         # First check to see if a reading with this ID exists.
 295         if( $self->reading( $reading->id ) ) {
 296                 throw( "Collation already has a reading with id " . $reading->id );
 297         }
 298         $self->_graphcalc_done(0);
 299         $self->_add_reading( $reading->id => $reading );
 300         # Once the reading has been added, put it in both graphs.
 301         $self->sequence->add_vertex( $reading->id );
 302         # All meta readings save 'start' and 'end' get disregarded for relationships.
 303         unless( $reading->is_nonrel ) {
 304                 $self->relations->add_reading( $reading->id );
 305         }
 306         return $reading;
 307 };
 308
 309 around del_reading => sub {
 310         my $orig = shift;
 311         my $self = shift;
 312         my $arg = shift;
 313
 314         unless( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
 315                 $arg = $self->reading( $arg )
 316         }
 317         my $argid = $arg->id;
 318         # Remove the reading from the graphs.
 319         $self->_graphcalc_done(0);
 320         $self->_clear_cache; # Explicitly clear caches to GC the reading
 321         $self->sequence->delete_vertex( $argid );
 322         $self->relations->delete_reading( $argid )
 323                 unless $arg->is_nonrel;
 324
 325         # Carry on.
 326         $self->$orig( $argid );
 327 };
 328
 329 =begin testing
 330
 331 use Text::Tradition;
 332
 333 my $cxfile = 't/data/Collatex-16.xml';
 334 my $t = Text::Tradition->new(
 335     'name'  => 'inline',
 336     'input' => 'CollateX',
 337     'file'  => $cxfile,
 338     );
 339 my $c = $t->collation;
 340
 341 my $rno = scalar $c->readings;
 342 # Split n21 for testing purposes
 343 my $new_r = $c->add_reading( { 'id' => 'n21p0', 'text' => 'un', 'join_next' => 1 } );
 344 my $old_r = $c->reading( 'n21' );
 345 $old_r->alter_text( 'to' );
 346 $c->del_path( 'n20', 'n21', 'A' );
 347 $c->add_path( 'n20', 'n21p0', 'A' );
 348 $c->add_path( 'n21p0', 'n21', 'A' );
 349 $c->flatten_ranks();
 350 ok( $c->reading( 'n21p0' ), "New reading exists" );
 351 is( scalar $c->readings, $rno, "Reading add offset by flatten_ranks" );
 352
 353 # Combine n3 and n4 ( with his )
 354 $c->merge_readings( 'n3', 'n4', 1 );
 355 ok( !$c->reading('n4'), "Reading n4 is gone" );
 356 is( $c->reading('n3')->text, 'with his', "Reading n3 has both words" );
 357
 358 # Collapse n9 and n10 ( rood / root )
 359 $c->merge_readings( 'n9', 'n10' );
 360 ok( !$c->reading('n10'), "Reading n10 is gone" );
 361 is( $c->reading('n9')->text, 'rood', "Reading n9 has an unchanged word" );
 362
 363 # Combine n21 and n21p0
 364 my $remaining = $c->reading('n21');
 365 $remaining ||= $c->reading('n22');  # one of these should still exist
 366 $c->merge_readings( 'n21p0', $remaining, 1 );
 367 ok( !$c->reading('n21'), "Reading $remaining is gone" );
 368 is( $c->reading('n21p0')->text, 'unto', "Reading n21p0 merged correctly" );
 369
 370 =end testing
 371
 372 =cut
 373
 374 sub merge_readings {
 375         my $self = shift;
 376
 377         # Sanity check
 378         my( $kept_obj, $del_obj, $combine, $combine_char ) = $self->_objectify_args( @_ );
 379         my $mergemeta = $kept_obj->is_meta;
 380         throw( "Cannot merge meta and non-meta reading" )
 381                 unless ( $mergemeta && $del_obj->is_meta )
 382                         || ( !$mergemeta && !$del_obj->is_meta );
 383         if( $mergemeta ) {
 384                 throw( "Cannot merge with start or end node" )
 385                         if( $kept_obj eq $self->start || $kept_obj eq $self->end
 386                                 || $del_obj eq $self->start || $del_obj eq $self->end );
 387         }
 388         # We only need the IDs for adding paths to the graph, not the reading
 389         # objects themselves.
 390         my $kept = $kept_obj->id;
 391         my $deleted = $del_obj->id;
 392         $self->_graphcalc_done(0);
 393
 394     # The kept reading should inherit the paths and the relationships
 395     # of the deleted reading.
 396         foreach my $path ( $self->sequence->edges_at( $deleted ) ) {
 397                 my @vector = ( $kept );
 398                 push( @vector, $path->[1] ) if $path->[0] eq $deleted;
 399                 unshift( @vector, $path->[0] ) if $path->[1] eq $deleted;
 400                 next if $vector[0] eq $vector[1]; # Don't add a self loop
 401                 my %wits = %{$self->sequence->get_edge_attributes( @$path )};
 402                 $self->sequence->add_edge( @vector );
 403                 my $fwits = $self->sequence->get_edge_attributes( @vector );
 404                 @wits{keys %$fwits} = values %$fwits;
 405                 $self->sequence->set_edge_attributes( @vector, \%wits );
 406         }
 407         $self->relations->merge_readings( $kept, $deleted, $combine )
 408                 unless $mergemeta;
 409
 410         # Do the deletion deed.
 411         if( $combine ) {
 412                 my $joinstr = $combine_char;
 413                 unless( defined $joinstr ) {
 414                         $joinstr = '' if $kept_obj->join_next || $del_obj->join_prior;
 415                         $joinstr = $self->wordsep unless defined $joinstr;
 416                 }
 417                 $kept_obj->alter_text( join( $joinstr, $kept_obj->text, $del_obj->text ) );
 418         }
 419         $self->del_reading( $deleted );
 420 }
 421
 422
 423 # Helper function for manipulating the graph.
 424 sub _stringify_args {
 425         my( $self, $first, $second, @args ) = @_;
 426     $first = $first->id
 427         if ref( $first ) eq 'Text::Tradition::Collation::Reading';
 428     $second = $second->id
 429         if ref( $second ) eq 'Text::Tradition::Collation::Reading';
 430     return( $first, $second, @args );
 431 }
 432
 433 # Helper function for manipulating the graph.
 434 sub _objectify_args {
 435         my( $self, $first, $second, $arg ) = @_;
 436     $first = $self->reading( $first )
 437         unless ref( $first ) eq 'Text::Tradition::Collation::Reading';
 438     $second = $self->reading( $second )
 439         unless ref( $second ) eq 'Text::Tradition::Collation::Reading';
 440     return( $first, $second, $arg );
 441 }
 442 ### Path logic
 443
 444 sub add_path {
 445         my $self = shift;
 446
 447         # We only need the IDs for adding paths to the graph, not the reading
 448         # objects themselves.
 449     my( $source, $target, $wit ) = $self->_objectify_args( @_ );
 450
 451         $self->_graphcalc_done(0);
 452         # Connect the readings
 453         unless( $self->sequence->has_edge( $source, $target ) ) {
 454             $self->sequence->add_edge( $source, $target );
 455             $self->relations->add_equivalence_edge( $source, $target );
 456         }
 457     # Note the witness in question
 458     $self->sequence->set_edge_attribute( $source, $target, $wit, 1 );
 459 }
 460
 461 sub del_path {
 462         my $self = shift;
 463         my @args;
 464         if( ref( $_[0] ) eq 'ARRAY' ) {
 465                 my $e = shift @_;
 466                 @args = ( @$e, @_ );
 467         } else {
 468                 @args = @_;
 469         }
 470
 471         # We only need the IDs for adding paths to the graph, not the reading
 472         # objects themselves.
 473     my( $source, $target, $wit ) = $self->_stringify_args( @args );
 474
 475         $self->_graphcalc_done(0);
 476         if( $self->sequence->has_edge_attribute( $source, $target, $wit ) ) {
 477                 $self->sequence->delete_edge_attribute( $source, $target, $wit );
 478         }
 479         unless( keys %{$self->sequence->get_edge_attributes( $source, $target )} ) {
 480                 $self->sequence->delete_edge( $source, $target );
 481                 $self->relations->delete_equivalence_edge( $source, $target );
 482         }
 483 }
 484
 485
 486 # Extra graph-alike utility
 487 sub has_path {
 488         my $self = shift;
 489     my( $source, $target, $wit ) = $self->_stringify_args( @_ );
 490         return undef unless $self->sequence->has_edge( $source, $target );
 491         return $self->sequence->has_edge_attribute( $source, $target, $wit );
 492 }
 493
 494 =head2 clear_witness( @sigil_list )
 495
 496 Clear the given witnesses out of the collation entirely, removing references
 497 to them in paths, and removing readings that belong only to them.  Should only
 498 be called via $tradition->del_witness.
 499
 500 =cut
 501
 502 sub clear_witness {
 503         my( $self, @sigils ) = @_;
 504
 505         $self->_graphcalc_done(0);
 506         # Clear the witness(es) out of the paths
 507         foreach my $e ( $self->paths ) {
 508                 foreach my $sig ( @sigils ) {
 509                         $self->del_path( $e, $sig );
 510                 }
 511         }
 512
 513         # Clear out the newly unused readings
 514         foreach my $r ( $self->readings ) {
 515                 unless( $self->reading_witnesses( $r ) ) {
 516                         $self->del_reading( $r );
 517                 }
 518         }
 519 }
 520
 521 sub add_relationship {
 522         my $self = shift;
 523     my( $source, $target, $opts ) = $self->_stringify_args( @_ );
 524     my( @vectors ) = $self->relations->add_relationship( $source, $target, $opts );
 525         $self->_graphcalc_done(0);
 526     return @vectors;
 527 }
 528
 529 around qw/ get_relationship del_relationship / => sub {
 530         my $orig = shift;
 531         my $self = shift;
 532         my @args = @_;
 533         if( @args == 1 && ref( $args[0] ) eq 'ARRAY' ) {
 534                 @args = @{$_[0]};
 535         }
 536         my( $source, $target ) = $self->_stringify_args( @args );
 537         $self->$orig( $source, $target );
 538 };
 539
 540 =head2 reading_witnesses( $reading )
 541
 542 Return a list of sigils corresponding to the witnesses in which the reading appears.
 543
 544 =cut
 545
 546 sub reading_witnesses {
 547         my( $self, $reading ) = @_;
 548         # We need only check either the incoming or the outgoing edges; I have
 549         # arbitrarily chosen "incoming".  Thus, special-case the start node.
 550         if( $reading eq $self->start ) {
 551                 return map { $_->sigil } $self->tradition->witnesses;
 552         }
 553         my %all_witnesses;
 554         foreach my $e ( $self->sequence->edges_to( $reading ) ) {
 555                 my $wits = $self->sequence->get_edge_attributes( @$e );
 556                 @all_witnesses{ keys %$wits } = 1;
 557         }
 558         my $acstr = $self->ac_label;
 559         foreach my $acwit ( grep { $_ =~ s/^(.*)\Q$acstr\E$/$1/ } keys %all_witnesses ) {
 560                 delete $all_witnesses{$acwit.$acstr} if exists $all_witnesses{$acwit};
 561         }
 562         return keys %all_witnesses;
 563 }
 564
 565 =head1 OUTPUT METHODS
 566
 567 =head2 as_svg( \%options )
 568
 569 Returns an SVG string that represents the graph, via as_dot and graphviz.
 570 See as_dot for a list of options.  Must have GraphViz (dot) installed to run.
 571
 572 =cut
 573
 574 sub as_svg {
 575     my( $self, $opts ) = @_;
 576     throw( "Need GraphViz installed to output SVG" )
 577         unless File::Which::which( 'dot' );
 578     my $want_subgraph = exists $opts->{'from'} || exists $opts->{'to'};
 579     $self->calculate_ranks()
 580         unless( $self->_graphcalc_done || $opts->{'nocalc'} || !$self->linear );
 581     if( !$self->has_cached_svg || $opts->{'recalc'}     || $want_subgraph ) {
 582                 my @cmd = qw/dot -Tsvg/;
 583                 my( $svg, $err );
 584                 my $dotfile = File::Temp->new();
 585                 ## USE FOR DEBUGGING
 586                 # $dotfile->unlink_on_destroy(0);
 587                 binmode $dotfile, ':utf8';
 588                 print $dotfile $self->as_dot( $opts );
 589                 push( @cmd, $dotfile->filename );
 590                 run( \@cmd, ">", binary(), \$svg );
 591                 $svg = decode_utf8( $svg );
 592                 $self->cached_svg( $svg ) unless $want_subgraph;
 593                 return $svg;
 594         } else {
 595                 return $self->cached_svg;
 596         }
 597 }
 598
 599
 600 =head2 as_dot( \%options )
 601
 602 Returns a string that is the collation graph expressed in dot
 603 (i.e. GraphViz) format.  Options include:
 604
 605 =over 4
 606
 607 =item * from
 608
 609 =item * to
 610
 611 =item * color_common
 612
 613 =back
 614
 615 =cut
 616
 617 sub as_dot {
 618     my( $self, $opts ) = @_;
 619     my $startrank = $opts->{'from'} if $opts;
 620     my $endrank = $opts->{'to'} if $opts;
 621     my $color_common = $opts->{'color_common'} if $opts;
 622     my $STRAIGHTENHACK = !$startrank && !$endrank && $self->end->rank
 623        && $self->end->rank > 100;
 624     $STRAIGHTENHACK = 1 if $opts->{'straight'}; # even for subgraphs or small graphs
 625
 626     # Check the arguments
 627     if( $startrank ) {
 628         return if $endrank && $startrank > $endrank;
 629         return if $startrank > $self->end->rank;
 630         }
 631         if( defined $endrank ) {
 632                 return if $endrank < 0;
 633                 $endrank = undef if $endrank == $self->end->rank;
 634         }
 635
 636     my $graph_name = $self->tradition->name;
 637     $graph_name =~ s/[^\w\s]//g;
 638     $graph_name = join( '_', split( /\s+/, $graph_name ) );
 639
 640     my %graph_attrs = (
 641         'rankdir' => 'LR',
 642         'bgcolor' => 'none',
 643         );
 644     my %node_attrs = (
 645         'fontsize' => 14,
 646         'fillcolor' => 'white',
 647         'style' => 'filled',
 648         'shape' => 'ellipse'
 649         );
 650     my %edge_attrs = (
 651         'arrowhead' => 'open',
 652         'color' => '#000000',
 653         'fontcolor' => '#000000',
 654         );
 655
 656     my $dot = sprintf( "digraph %s {\n", $graph_name );
 657     $dot .= "\tgraph " . _dot_attr_string( \%graph_attrs ) . ";\n";
 658     $dot .= "\tnode " . _dot_attr_string( \%node_attrs ) . ";\n";
 659
 660         # Output substitute start/end readings if necessary
 661         if( $startrank ) {
 662                 $dot .= "\t\"__SUBSTART__\" [ label=\"...\",id=\"__START__\" ];\n";
 663         }
 664         if( $endrank ) {
 665                 $dot .= "\t\"__SUBEND__\" [ label=\"...\",id=\"__END__\" ];\n";
 666         }
 667         if( $STRAIGHTENHACK ) {
 668                 ## HACK part 1
 669                 my $startlabel = $startrank ? '__SUBSTART__' : '__START__';
 670                 $dot .= "\tsubgraph { rank=same \"$startlabel\" \"#SILENT#\" }\n";
 671                 $dot .= "\t\"#SILENT#\" [ shape=diamond,color=white,penwidth=0,label=\"\" ];"
 672         }
 673         my %used;  # Keep track of the readings that actually appear in the graph
 674         # Sort the readings by rank if we have ranks; this speeds layout.
 675         my @all_readings = $self->end->has_rank
 676                 ? sort { $a->rank <=> $b->rank } $self->readings
 677                 : $self->readings;
 678         # TODO Refrain from outputting lacuna nodes - just grey out the edges.
 679     foreach my $reading ( @all_readings ) {
 680         # Only output readings within our rank range.
 681         next if $startrank && $reading->rank < $startrank;
 682         next if $endrank && $reading->rank > $endrank;
 683         $used{$reading->id} = 1;
 684         # Need not output nodes without separate labels
 685         next if $reading->id eq $reading->text;
 686         my $rattrs;
 687         my $label = $reading->text;
 688         $label .= '-' if $reading->join_next;
 689         $label = "-$label" if $reading->join_prior;
 690         $label =~ s/\"/\\\"/g;
 691                 $rattrs->{'label'} = $label;
 692                 $rattrs->{'id'} = $reading->id;
 693                 $rattrs->{'fillcolor'} = '#b3f36d' if $reading->is_common && $color_common;
 694         $dot .= sprintf( "\t\"%s\" %s;\n", $reading->id, _dot_attr_string( $rattrs ) );
 695     }
 696
 697         # Add the real edges. Need to weight one edge per rank jump, in a
 698         # continuous line.
 699         # my $weighted = $self->_add_edge_weights;
 700     my @edges = $self->paths;
 701         my( %substart, %subend );
 702     foreach my $edge ( @edges ) {
 703         # Do we need to output this edge?
 704         if( $used{$edge->[0]} && $used{$edge->[1]} ) {
 705                 my $label = $self->_path_display_label( $self->path_witnesses( $edge ) );
 706                         my $variables = { %edge_attrs, 'label' => $label };
 707
 708                         # Account for the rank gap if necessary
 709                         my $rank0 = $self->reading( $edge->[0] )->rank
 710                                 if $self->reading( $edge->[0] )->has_rank;
 711                         my $rank1 = $self->reading( $edge->[1] )->rank
 712                                 if $self->reading( $edge->[1] )->has_rank;
 713                         if( defined $rank0 && defined $rank1 && $rank1 - $rank0 > 1 ) {
 714                                 $variables->{'minlen'} = $rank1 - $rank0;
 715                         }
 716
 717                         # Add the calculated edge weights
 718                         # if( exists $weighted->{$edge->[0]}
 719                         #       && $weighted->{$edge->[0]} eq $edge->[1] ) {
 720                         #       # $variables->{'color'} = 'red';
 721                         #       $variables->{'weight'} = 3.0;
 722                         # }
 723
 724                         # EXPERIMENTAL: make edge width reflect no. of witnesses
 725                         my $extrawidth = scalar( $self->path_witnesses( $edge ) ) * 0.2;
 726                         $variables->{'penwidth'} = $extrawidth + 0.8; # gives 1 for a single wit
 727
 728                         my $varopts = _dot_attr_string( $variables );
 729                         $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
 730                                 $edge->[0], $edge->[1], $varopts );
 731         } elsif( $used{$edge->[0]} ) {
 732                 $subend{$edge->[0]} = 1;
 733         } elsif( $used{$edge->[1]} ) {
 734                 $substart{$edge->[1]} = 1;
 735         }
 736     }
 737     # Add substitute start and end edges if necessary
 738     foreach my $node ( keys %substart ) {
 739         my $witstr = $self->_path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) );
 740         my $variables = { %edge_attrs, 'label' => $witstr };
 741         my $varopts = _dot_attr_string( $variables );
 742         $dot .= "\t\"__SUBSTART__\" -> \"$node\" $varopts;";
 743         }
 744     foreach my $node ( keys %subend ) {
 745         my $witstr = $self->_path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) );
 746         my $variables = { %edge_attrs, 'label' => $witstr };
 747         my $varopts = _dot_attr_string( $variables );
 748         $dot .= "\t\"$node\" -> \"__SUBEND__\" $varopts;";
 749         }
 750         # HACK part 2
 751         if( $STRAIGHTENHACK ) {
 752                 my $endlabel = $endrank ? '__SUBEND__' : '__END__';
 753                 $dot .= "\t\"$endlabel\" -> \"#SILENT#\" [ color=white,penwidth=0 ];\n";
 754         }
 755
 756     $dot .= "}\n";
 757     return $dot;
 758 }
 759
 760 sub _dot_attr_string {
 761         my( $hash ) = @_;
 762         my @attrs;
 763         foreach my $k ( sort keys %$hash ) {
 764                 my $v = $hash->{$k};
 765                 push( @attrs, $k.'="'.$v.'"' );
 766         }
 767         return( '[ ' . join( ', ', @attrs ) . ' ]' );
 768 }
 769
 770 sub _add_edge_weights {
 771         my $self = shift;
 772         # Walk the graph from START to END, choosing the successor node with
 773         # the largest number of witness paths each time.
 774         my $weighted = {};
 775         my $curr = $self->start->id;
 776         my $ranked = $self->end->has_rank;
 777         while( $curr ne $self->end->id ) {
 778                 my $rank = $ranked ? $self->reading( $curr )->rank : 0;
 779                 my @succ = sort { $self->path_witnesses( $curr, $a )
 780                                                         <=> $self->path_witnesses( $curr, $b ) }
 781                         $self->sequence->successors( $curr );
 782                 my $next = pop @succ;
 783                 my $nextrank = $ranked ? $self->reading( $next )->rank : 0;
 784                 # Try to avoid lacunae in the weighted path.
 785                 while( @succ &&
 786                            ( $self->reading( $next )->is_lacuna ||
 787                                  $nextrank - $rank > 1 ) ){
 788                         $next = pop @succ;
 789                 }
 790                 $weighted->{$curr} = $next;
 791                 $curr = $next;
 792         }
 793         return $weighted;
 794 }
 795
 796 =head2 path_witnesses( $edge )
 797
 798 Returns the list of sigils whose witnesses are associated with the given edge.
 799 The edge can be passed as either an array or an arrayref of ( $source, $target ).
 800
 801 =cut
 802
 803 sub path_witnesses {
 804         my( $self, @edge ) = @_;
 805         # If edge is an arrayref, cope.
 806         if( @edge == 1 && ref( $edge[0] ) eq 'ARRAY' ) {
 807                 my $e = shift @edge;
 808                 @edge = @$e;
 809         }
 810         my @wits = keys %{$self->sequence->get_edge_attributes( @edge )};
 811         return @wits;
 812 }
 813
 814 # Helper function. Make a display label for the given witnesses, showing a.c.
 815 # witnesses only where the main witness is not also in the list.
 816 sub _path_display_label {
 817         my $self = shift;
 818         my %wits;
 819         map { $wits{$_} = 1 } @_;
 820
 821         # If an a.c. wit is listed, remove it if the main wit is also listed.
 822         # Otherwise keep it for explicit listing.
 823         my $aclabel = $self->ac_label;
 824         my @disp_ac;
 825         foreach my $w ( sort keys %wits ) {
 826                 if( $w =~ /^(.*)\Q$aclabel\E$/ ) {
 827                         if( exists $wits{$1} ) {
 828                                 delete $wits{$w};
 829                         } else {
 830                                 push( @disp_ac, $w );
 831                         }
 832                 }
 833         }
 834
 835         # See if we are in a majority situation.
 836         my $maj = scalar( $self->tradition->witnesses ) * 0.6;
 837         $maj = $maj > 5 ? $maj : 5;
 838         if( scalar keys %wits > $maj ) {
 839                 unshift( @disp_ac, 'majority' );
 840                 return join( ', ', @disp_ac );
 841         } else {
 842                 return join( ', ', sort keys %wits );
 843         }
 844 }
 845
 846 =head2 readings_at_rank( $rank )
 847
 848 Returns a list of readings at a given rank, taken from the alignment table.
 849
 850 =cut
 851
 852 sub readings_at_rank {
 853         my( $self, $rank ) = @_;
 854         my $table = $self->alignment_table;
 855         # Table rank is real rank - 1.
 856         my @elements = map { $_->{'tokens'}->[$rank-1] } @{$table->{'alignment'}};
 857         my %readings;
 858         foreach my $e ( @elements ) {
 859                 next unless ref( $e ) eq 'HASH';
 860                 next unless exists $e->{'t'};
 861                 $readings{$e->{'t'}->id} = $e->{'t'};
 862         }
 863         return values %readings;
 864 }
 865
 866 =head2 as_graphml
 867
 868 Returns a GraphML representation of the collation.  The GraphML will contain
 869 two graphs. The first expresses the attributes of the readings and the witness
 870 paths that link them; the second expresses the relationships that link the
 871 readings.  This is the native transfer format for a tradition.
 872
 873 =begin testing
 874
 875 use Text::Tradition;
 876
 877 my $READINGS = 311;
 878 my $PATHS = 361;
 879
 880 my $datafile = 't/data/florilegium_tei_ps.xml';
 881 my $tradition = Text::Tradition->new( 'input' => 'TEI',
 882                                       'name' => 'test0',
 883                                       'file' => $datafile,
 884                                       'linear' => 1 );
 885
 886 ok( $tradition, "Got a tradition object" );
 887 is( scalar $tradition->witnesses, 13, "Found all witnesses" );
 888 ok( $tradition->collation, "Tradition has a collation" );
 889
 890 my $c = $tradition->collation;
 891 is( scalar $c->readings, $READINGS, "Collation has all readings" );
 892 is( scalar $c->paths, $PATHS, "Collation has all paths" );
 893 is( scalar $c->relationships, 0, "Collation has all relationships" );
 894
 895 # Add a few relationships
 896 $c->add_relationship( 'w123', 'w125', { 'type' => 'collated' } );
 897 $c->add_relationship( 'w193', 'w196', { 'type' => 'collated' } );
 898 $c->add_relationship( 'w257', 'w262', { 'type' => 'transposition' } );
 899
 900 # Now write it to GraphML and parse it again.
 901
 902 my $graphml = $c->as_graphml;
 903 my $st = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml );
 904 is( scalar $st->collation->readings, $READINGS, "Reparsed collation has all readings" );
 905 is( scalar $st->collation->paths, $PATHS, "Reparsed collation has all paths" );
 906 is( scalar $st->collation->relationships, 3, "Reparsed collation has new relationships" );
 907
 908 # Now add a stemma, write to GraphML, and parse again.
 909 my $stemma = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
 910 is( ref( $stemma ), 'Text::Tradition::Stemma', "Parsed dotfile into stemma" );
 911 is( $tradition->stemmata, 1, "Tradition now has the stemma" );
 912 $graphml = $c->as_graphml;
 913 like( $graphml, qr/digraph/, "Digraph declaration exists in GraphML" );
 914
 915 =end testing
 916
 917 =cut
 918
 919 sub as_graphml {
 920     my( $self, $options ) = @_;
 921         $self->calculate_ranks unless $self->_graphcalc_done;
 922
 923         my $start = $options->{'from'}
 924                 ? $self->reading( $options->{'from'} ) : $self->start;
 925         my $end = $options->{'to'}
 926                 ? $self->reading( $options->{'to'} ) : $self->end;
 927         if( $start->has_rank && $end->has_rank && $end->rank < $start->rank ) {
 928                 throw( 'Start node must be before end node' );
 929         }
 930         # The readings need to be ranked for this to work.
 931         $start = $self->start unless $start->has_rank;
 932         $end = $self->end unless $end->has_rank;
 933         my $rankoffset = 0;
 934         unless( $start eq $self->start ) {
 935                 $rankoffset = $start->rank - 1;
 936         }
 937         my %use_readings;
 938
 939     # Some namespaces
 940     my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns';
 941     my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance';
 942     my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' .
 943         'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
 944
 945     # Create the document and root node
 946     my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
 947     my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
 948     $graphml->setDocumentElement( $root );
 949     $root->setNamespace( $xsi_ns, 'xsi', 0 );
 950     $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
 951
 952     # List of attribute types to save on our objects and their corresponding
 953     # GraphML types
 954     my %save_types = (
 955         'Str' => 'string',
 956         'Int' => 'int',
 957         'Bool' => 'boolean',
 958         'ReadingID' => 'string',
 959         'RelationshipType' => 'string',
 960         'RelationshipScope' => 'string',
 961     );
 962
 963     # List of attribute names *not* to save on our objects.
 964     # We will also not save any attribute beginning with _.
 965     my %skipsave;
 966     map { $skipsave{$_} = 1 } qw/ cached_svg /;
 967
 968     # Add the data keys for the graph. Include an extra key 'version' for the
 969     # GraphML output version.
 970     my %graph_data_keys;
 971     my $gdi = 0;
 972     my %graph_attributes = ( 'version' => 'string' );
 973         # Graph attributes include those of Tradition and those of Collation.
 974         my %gattr_from;
 975         my $tmeta = $self->tradition->meta;
 976         my $cmeta = $self->meta;
 977         map { $gattr_from{$_->name} = 'Tradition' } $tmeta->get_all_attributes;
 978         map { $gattr_from{$_->name} = 'Collation' } $cmeta->get_all_attributes;
 979         foreach my $attr ( ( $tmeta->get_all_attributes, $cmeta->get_all_attributes ) ) {
 980                 next if $attr->name =~ /^_/;
 981                 next if $skipsave{$attr->name};
 982                 next unless $save_types{$attr->type_constraint->name};
 983                 $graph_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
 984         }
 985     # Extra custom key for the tradition stemma(ta)
 986     $graph_attributes{'stemmata'} = 'string';
 987
 988     foreach my $datum ( sort keys %graph_attributes ) {
 989         $graph_data_keys{$datum} = 'dg'.$gdi++;
 990         my $key = $root->addNewChild( $graphml_ns, 'key' );
 991         $key->setAttribute( 'attr.name', $datum );
 992         $key->setAttribute( 'attr.type', $graph_attributes{$datum} );
 993         $key->setAttribute( 'for', 'graph' );
 994         $key->setAttribute( 'id', $graph_data_keys{$datum} );
 995     }
 996
 997     # Add the data keys for reading nodes
 998     my %reading_attributes;
 999     my $rmeta = Text::Tradition::Collation::Reading->meta;
1000     foreach my $attr( $rmeta->get_all_attributes ) {
1001                 next if $attr->name =~ /^_/;
1002                 next if $skipsave{$attr->name};
1003                 next unless $save_types{$attr->type_constraint->name};
1004                 $reading_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1005         }
1006         # Extra custom key for the reading morphology
1007         $reading_attributes{'lexemes'} = 'string';
1008
1009     my %node_data_keys;
1010     my $ndi = 0;
1011     foreach my $datum ( sort keys %reading_attributes ) {
1012         $node_data_keys{$datum} = 'dn'.$ndi++;
1013         my $key = $root->addNewChild( $graphml_ns, 'key' );
1014         $key->setAttribute( 'attr.name', $datum );
1015         $key->setAttribute( 'attr.type', $reading_attributes{$datum} );
1016         $key->setAttribute( 'for', 'node' );
1017         $key->setAttribute( 'id', $node_data_keys{$datum} );
1018     }
1019
1020     # Add the data keys for edges, that is, paths and relationships. Path
1021     # data does not come from a Moose class so is here manually.
1022     my $edi = 0;
1023     my %edge_data_keys;
1024     my %edge_attributes = (
1025         witness => 'string',                    # ID/label for a path
1026         extra => 'boolean',                             # Path key
1027         );
1028     my @path_attributes = keys %edge_attributes; # track our manual additions
1029     my $pmeta = Text::Tradition::Collation::Relationship->meta;
1030     foreach my $attr( $pmeta->get_all_attributes ) {
1031                 next if $attr->name =~ /^_/;
1032                 next if $skipsave{$attr->name};
1033                 next unless $save_types{$attr->type_constraint->name};
1034                 $edge_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1035         }
1036     foreach my $datum ( sort keys %edge_attributes ) {
1037         $edge_data_keys{$datum} = 'de'.$edi++;
1038         my $key = $root->addNewChild( $graphml_ns, 'key' );
1039         $key->setAttribute( 'attr.name', $datum );
1040         $key->setAttribute( 'attr.type', $edge_attributes{$datum} );
1041         $key->setAttribute( 'for', 'edge' );
1042         $key->setAttribute( 'id', $edge_data_keys{$datum} );
1043     }
1044
1045     # Add the collation graph itself. First, sanitize the name to a valid XML ID.
1046     my $xmlidname = $self->tradition->name;
1047     $xmlidname =~ s/(?!$xml10_namechar_rx)./_/g;
1048     if( $xmlidname !~ /^$xml10_namestartchar_rx/ ) {
1049         $xmlidname = '_'.$xmlidname;
1050     }
1051     my $sgraph = $root->addNewChild( $graphml_ns, 'graph' );
1052     $sgraph->setAttribute( 'edgedefault', 'directed' );
1053     $sgraph->setAttribute( 'id', $xmlidname );
1054     $sgraph->setAttribute( 'parse.edgeids', 'canonical' );
1055     $sgraph->setAttribute( 'parse.edges', 0 ); # fill in later
1056     $sgraph->setAttribute( 'parse.nodeids', 'canonical' );
1057     $sgraph->setAttribute( 'parse.nodes', 0 ); # fill in later
1058     $sgraph->setAttribute( 'parse.order', 'nodesfirst' );
1059
1060     # Tradition/collation attribute data
1061     foreach my $datum ( keys %graph_attributes ) {
1062         my $value;
1063         if( $datum eq 'version' ) {
1064                 $value = '3.2';
1065         } elsif( $datum eq 'stemmata' ) {
1066                 my @stemstrs;
1067                 map { push( @stemstrs, $_->editable( {linesep => ''} ) ) }
1068                         $self->tradition->stemmata;
1069                 $value = join( "\n", @stemstrs );
1070         } elsif( $gattr_from{$datum} eq 'Tradition' ) {
1071                 $value = $self->tradition->$datum;
1072         } else {
1073                 $value = $self->$datum;
1074         }
1075                 _add_graphml_data( $sgraph, $graph_data_keys{$datum}, $value );
1076         }
1077
1078     my $node_ctr = 0;
1079     my %node_hash;
1080     # Add our readings to the graph
1081     foreach my $n ( sort { $a->id cmp $b->id } $self->readings ) {
1082         next if $n->has_rank && $n ne $self->start && $n ne $self->end &&
1083                 ( $n->rank < $start->rank || $n->rank > $end->rank );
1084         $use_readings{$n->id} = 1;
1085         # Add to the main graph
1086         my $node_el = $sgraph->addNewChild( $graphml_ns, 'node' );
1087         my $node_xmlid = 'n' . $node_ctr++;
1088         $node_hash{ $n->id } = $node_xmlid;
1089         $node_el->setAttribute( 'id', $node_xmlid );
1090         foreach my $d ( keys %reading_attributes ) {
1091                 my $nval = $n->$d;
1092                 # Custom serialization
1093                 if( $d eq 'lexemes' ) {
1094                                 # If nval is a true value, we have lexemes so we need to
1095                                 # serialize them. Otherwise set nval to undef so that the
1096                                 # key is excluded from this reading.
1097                         $nval = $nval ? $n->_serialize_lexemes : undef;
1098                 }
1099                 if( $rankoffset && $d eq 'rank' && $n ne $self->start ) {
1100                         # Adjust the ranks within the subgraph.
1101                         $nval = $n eq $self->end ? $end->rank - $rankoffset + 1
1102                                 : $nval - $rankoffset;
1103                 }
1104                 _add_graphml_data( $node_el, $node_data_keys{$d}, $nval )
1105                         if defined $nval;
1106         }
1107     }
1108
1109     # Add the path edges to the sequence graph
1110     my $edge_ctr = 0;
1111     foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->sequence->edges() ) {
1112         # We add an edge in the graphml for every witness in $e.
1113         next unless( $use_readings{$e->[0]} || $use_readings{$e->[1]} );
1114         my @edge_wits = sort $self->path_witnesses( $e );
1115         $e->[0] = $self->start->id unless $use_readings{$e->[0]};
1116         $e->[1] = $self->end->id unless $use_readings{$e->[1]};
1117         # Skip any path from start to end; that witness is not in the subgraph.
1118         next if ( $e->[0] eq $self->start->id && $e->[1] eq $self->end->id );
1119         foreach my $wit ( @edge_wits ) {
1120                         my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
1121                                                                                 $node_hash{ $e->[0] },
1122                                                                                 $node_hash{ $e->[1] } );
1123                         my $edge_el = $sgraph->addNewChild( $graphml_ns, 'edge' );
1124                         $edge_el->setAttribute( 'source', $from );
1125                         $edge_el->setAttribute( 'target', $to );
1126                         $edge_el->setAttribute( 'id', $id );
1127
1128                         # It's a witness path, so add the witness
1129                         my $base = $wit;
1130                         my $key = $edge_data_keys{'witness'};
1131                         # Is this an ante-corr witness?
1132                         my $aclabel = $self->ac_label;
1133                         if( $wit =~ /^(.*)\Q$aclabel\E$/ ) {
1134                                 # Keep the base witness
1135                                 $base = $1;
1136                                 # ...and record that this is an 'extra' reading path
1137                                 _add_graphml_data( $edge_el, $edge_data_keys{'extra'}, $aclabel );
1138                         }
1139                         _add_graphml_data( $edge_el, $edge_data_keys{'witness'}, $base );
1140                 }
1141         }
1142
1143         # Report the actual number of nodes and edges that went in
1144         $sgraph->setAttribute( 'parse.edges', $edge_ctr );
1145         $sgraph->setAttribute( 'parse.nodes', $node_ctr );
1146
1147         # Add the relationship graph to the XML
1148         map { delete $edge_data_keys{$_} } @path_attributes;
1149         $self->relations->_as_graphml( $graphml_ns, $root, \%node_hash,
1150                 $node_data_keys{'id'}, \%edge_data_keys );
1151
1152     # Save and return the thing
1153     my $result = decode_utf8( $graphml->toString(1) );
1154     return $result;
1155 }
1156
1157 sub _add_graphml_data {
1158     my( $el, $key, $value ) = @_;
1159     return unless defined $value;
1160     my $data_el = $el->addNewChild( $el->namespaceURI, 'data' );
1161     $data_el->setAttribute( 'key', $key );
1162     $data_el->appendText( $value );
1163 }
1164
1165 =head2 as_csv
1166
1167 Returns a CSV alignment table representation of the collation graph, one
1168 row per witness (or witness uncorrected.)
1169
1170 =cut
1171
1172 sub as_csv {
1173     my( $self ) = @_;
1174     my $table = $self->alignment_table;
1175     my $csv = Text::CSV->new( { binary => 1, quote_null => 0 } );
1176     my @result;
1177     # Make the header row
1178     $csv->combine( map { $_->{'witness'} } @{$table->{'alignment'}} );
1179         push( @result, decode_utf8( $csv->string ) );
1180     # Make the rest of the rows
1181     foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
1182         my @rowobjs = map { $_->{'tokens'}->[$idx] } @{$table->{'alignment'}};
1183         my @row = map { $_ ? $_->{'t'}->text : $_ } @rowobjs;
1184         $csv->combine( @row );
1185         push( @result, decode_utf8( $csv->string ) );
1186     }
1187     return join( "\n", @result );
1188 }
1189
1190 =head2 alignment_table( $use_refs, $include_witnesses )
1191
1192 Return a reference to an alignment table, in a slightly enhanced CollateX
1193 format which looks like this:
1194
1195  $table = { alignment => [ { witness => "SIGIL",
1196                              tokens => [ { t => "TEXT" }, ... ] },
1197                            { witness => "SIG2",
1198                              tokens => [ { t => "TEXT" }, ... ] },
1199                            ... ],
1200             length => TEXTLEN };
1201
1202 If $use_refs is set to 1, the reading object is returned in the table
1203 instead of READINGTEXT; if not, the text of the reading is returned.
1204
1205 If $include_witnesses is set to a hashref, only the witnesses whose sigil
1206 keys have a true hash value will be included.
1207
1208 =cut
1209
1210 sub alignment_table {
1211     my( $self ) = @_;
1212     $self->calculate_ranks() unless $self->_graphcalc_done;
1213     return $self->cached_table if $self->has_cached_table;
1214
1215     # Make sure we can do this
1216         throw( "Need a linear graph in order to make an alignment table" )
1217                 unless $self->linear;
1218         $self->calculate_ranks unless $self->end->has_rank;
1219
1220     my $table = { 'alignment' => [], 'length' => $self->end->rank - 1 };
1221     my @all_pos = ( 1 .. $self->end->rank - 1 );
1222     foreach my $wit ( sort { $a->sigil cmp $b->sigil } $self->tradition->witnesses ) {
1223         # print STDERR "Making witness row(s) for " . $wit->sigil . "\n";
1224         my @wit_path = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
1225         my @row = _make_witness_row( \@wit_path, \@all_pos );
1226         push( @{$table->{'alignment'}},
1227                 { 'witness' => $wit->sigil, 'tokens' => \@row } );
1228         if( $wit->is_layered ) {
1229                 my @wit_ac_path = $self->reading_sequence( $self->start, $self->end,
1230                         $wit->sigil.$self->ac_label );
1231             my @ac_row = _make_witness_row( \@wit_ac_path, \@all_pos );
1232                         push( @{$table->{'alignment'}},
1233                                 { 'witness' => $wit->sigil.$self->ac_label, 'tokens' => \@ac_row } );
1234         }
1235     }
1236     $self->cached_table( $table );
1237     return $table;
1238 }
1239
1240 sub _make_witness_row {
1241     my( $path, $positions ) = @_;
1242     my %char_hash;
1243     map { $char_hash{$_} = undef } @$positions;
1244     my $debug = 0;
1245     foreach my $rdg ( @$path ) {
1246         my $rtext = $rdg->text;
1247         $rtext = '#LACUNA#' if $rdg->is_lacuna;
1248         print STDERR "rank " . $rdg->rank . "\n" if $debug;
1249         # print STDERR "No rank for " . $rdg->id . "\n" unless defined $rdg->rank;
1250         $char_hash{$rdg->rank} = { 't' => $rdg };
1251     }
1252     my @row = map { $char_hash{$_} } @$positions;
1253     # Fill in lacuna markers for undef spots in the row
1254     my $last_el = shift @row;
1255     my @filled_row = ( $last_el );
1256     foreach my $el ( @row ) {
1257         # If we are using node reference, make the lacuna node appear many times
1258         # in the table.  If not, use the lacuna tag.
1259         if( $last_el && $last_el->{'t'}->is_lacuna && !defined $el ) {
1260             $el = $last_el;
1261         }
1262         push( @filled_row, $el );
1263         $last_el = $el;
1264     }
1265     return @filled_row;
1266 }
1267
1268 =head1 NAVIGATION METHODS
1269
1270 =head2 reading_sequence( $first, $last, $sigil, $backup )
1271
1272 Returns the ordered list of readings, starting with $first and ending
1273 with $last, for the witness given in $sigil. If a $backup sigil is
1274 specified (e.g. when walking a layered witness), it will be used wherever
1275 no $sigil path exists.  If there is a base text reading, that will be
1276 used wherever no path exists for $sigil or $backup.
1277
1278 =cut
1279
1280 # TODO Think about returning some lazy-eval iterator.
1281 # TODO Get rid of backup; we should know from what witness is whether we need it.
1282
1283 sub reading_sequence {
1284     my( $self, $start, $end, $witness ) = @_;
1285
1286     $witness = $self->baselabel unless $witness;
1287     my @readings = ( $start );
1288     my %seen;
1289     my $n = $start;
1290     while( $n && $n->id ne $end->id ) {
1291         if( exists( $seen{$n->id} ) ) {
1292             throw( "Detected loop for $witness at " . $n->id );
1293         }
1294         $seen{$n->id} = 1;
1295
1296         my $next = $self->next_reading( $n, $witness );
1297         unless( $next ) {
1298             throw( "Did not find any path for $witness from reading " . $n->id );
1299         }
1300         push( @readings, $next );
1301         $n = $next;
1302     }
1303     # Check that the last reading is our end reading.
1304     my $last = $readings[$#readings];
1305     throw( "Last reading found from " . $start->text .
1306         " for witness $witness is not the end!" ) # TODO do we get this far?
1307         unless $last->id eq $end->id;
1308
1309     return @readings;
1310 }
1311
1312 =head2 next_reading( $reading, $sigil );
1313
1314 Returns the reading that follows the given reading along the given witness
1315 path.
1316
1317 =cut
1318
1319 sub next_reading {
1320     # Return the successor via the corresponding path.
1321     my $self = shift;
1322     my $answer = $self->_find_linked_reading( 'next', @_ );
1323         return undef unless $answer;
1324     return $self->reading( $answer );
1325 }
1326
1327 =head2 prior_reading( $reading, $sigil )
1328
1329 Returns the reading that precedes the given reading along the given witness
1330 path.
1331
1332 =cut
1333
1334 sub prior_reading {
1335     # Return the predecessor via the corresponding path.
1336     my $self = shift;
1337     my $answer = $self->_find_linked_reading( 'prior', @_ );
1338     return $self->reading( $answer );
1339 }
1340
1341 sub _find_linked_reading {
1342     my( $self, $direction, $node, $path ) = @_;
1343
1344     # Get a backup if we are dealing with a layered witness
1345     my $alt_path;
1346     my $aclabel = $self->ac_label;
1347     if( $path && $path =~ /^(.*)\Q$aclabel\E$/ ) {
1348         $alt_path = $1;
1349     }
1350
1351     my @linked_paths = $direction eq 'next'
1352         ? $self->sequence->edges_from( $node )
1353         : $self->sequence->edges_to( $node );
1354     return undef unless scalar( @linked_paths );
1355
1356     # We have to find the linked path that contains all of the
1357     # witnesses supplied in $path.
1358     my( @path_wits, @alt_path_wits );
1359     @path_wits = sort( $self->_witnesses_of_label( $path ) ) if $path;
1360     @alt_path_wits = sort( $self->_witnesses_of_label( $alt_path ) ) if $alt_path;
1361     my $base_le;
1362     my $alt_le;
1363     foreach my $le ( @linked_paths ) {
1364         if( $self->sequence->has_edge_attribute( @$le, $self->baselabel ) ) {
1365             $base_le = $le;
1366         }
1367                 my @le_wits = sort $self->path_witnesses( $le );
1368                 if( _is_within( \@path_wits, \@le_wits ) ) {
1369                         # This is the right path.
1370                         return $direction eq 'next' ? $le->[1] : $le->[0];
1371                 } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) {
1372                         $alt_le = $le;
1373                 }
1374     }
1375     # Got this far? Return the alternate path if it exists.
1376     return $direction eq 'next' ? $alt_le->[1] : $alt_le->[0]
1377         if $alt_le;
1378
1379     # Got this far? Return the base path if it exists.
1380     return $direction eq 'next' ? $base_le->[1] : $base_le->[0]
1381         if $base_le;
1382
1383     # Got this far? We have no appropriate path.
1384     warn "Could not find $direction node from " . $node->id
1385         . " along path $path";
1386     return undef;
1387 }
1388
1389 # Some set logic.
1390 sub _is_within {
1391     my( $set1, $set2 ) = @_;
1392     my $ret = @$set1; # will be 0, i.e. false, if set1 is empty
1393     foreach my $el ( @$set1 ) {
1394         $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
1395     }
1396     return $ret;
1397 }
1398
1399 # Return the string that joins together a list of witnesses for
1400 # display on a single path.
1401 sub _witnesses_of_label {
1402     my( $self, $label ) = @_;
1403     my $regex = $self->wit_list_separator;
1404     my @answer = split( /\Q$regex\E/, $label );
1405     return @answer;
1406 }
1407
1408 =head2 common_readings
1409
1410 Returns the list of common readings in the graph (i.e. those readings that are
1411 shared by all non-lacunose witnesses.)
1412
1413 =cut
1414
1415 sub common_readings {
1416         my $self = shift;
1417         my @common = grep { $_->is_common } $self->readings;
1418         return @common;
1419 }
1420
1421 =head2 path_text( $sigil, [, $start, $end ] )
1422
1423 Returns the text of a witness (plus its backup, if we are using a layer)
1424 as stored in the collation.  The text is returned as a string, where the
1425 individual readings are joined with spaces and the meta-readings (e.g.
1426 lacunae) are omitted.  Optional specification of $start and $end allows
1427 the generation of a subset of the witness text.
1428
1429 =cut
1430
1431 sub path_text {
1432         my( $self, $wit, $start, $end ) = @_;
1433         $start = $self->start unless $start;
1434         $end = $self->end unless $end;
1435         my @path = grep { !$_->is_meta } $self->reading_sequence( $start, $end, $wit );
1436         my $pathtext = '';
1437         my $last;
1438         foreach my $r ( @path ) {
1439                 unless ( $r->join_prior || !$last || $last->join_next ) {
1440                         $pathtext .= ' ';
1441                 }
1442                 $pathtext .= $r->text;
1443                 $last = $r;
1444         }
1445         return $pathtext;
1446 }
1447
1448 =head1 INITIALIZATION METHODS
1449
1450 These are mostly for use by parsers.
1451
1452 =head2 make_witness_path( $witness )
1453
1454 Link the array of readings contained in $witness->path (and in
1455 $witness->uncorrected_path if it exists) into collation paths.
1456 Clear out the arrays when finished.
1457
1458 =head2 make_witness_paths
1459
1460 Call make_witness_path for all witnesses in the tradition.
1461
1462 =cut
1463
1464 # For use when a collation is constructed from a base text and an apparatus.
1465 # We have the sequences of readings and just need to add path edges.
1466 # When we are done, clear out the witness path attributes, as they are no
1467 # longer needed.
1468 # TODO Find a way to replace the witness path attributes with encapsulated functions?
1469
1470 sub make_witness_paths {
1471     my( $self ) = @_;
1472     foreach my $wit ( $self->tradition->witnesses ) {
1473         # print STDERR "Making path for " . $wit->sigil . "\n";
1474         $self->make_witness_path( $wit );
1475     }
1476 }
1477
1478 sub make_witness_path {
1479     my( $self, $wit ) = @_;
1480     my @chain = @{$wit->path};
1481     my $sig = $wit->sigil;
1482     # Add start and end if necessary
1483     unshift( @chain, $self->start ) unless $chain[0] eq $self->start;
1484     push( @chain, $self->end ) unless $chain[-1] eq $self->end;
1485     foreach my $idx ( 0 .. $#chain-1 ) {
1486         $self->add_path( $chain[$idx], $chain[$idx+1], $sig );
1487     }
1488     if( $wit->is_layered ) {
1489         @chain = @{$wit->uncorrected_path};
1490                 unshift( @chain, $self->start ) unless $chain[0] eq $self->start;
1491                 push( @chain, $self->end ) unless $chain[-1] eq $self->end;
1492         foreach my $idx( 0 .. $#chain-1 ) {
1493             my $source = $chain[$idx];
1494             my $target = $chain[$idx+1];
1495             $self->add_path( $source, $target, $sig.$self->ac_label )
1496                 unless $self->has_path( $source, $target, $sig );
1497         }
1498     }
1499     $wit->clear_path;
1500     $wit->clear_uncorrected_path;
1501 }
1502
1503 =head2 calculate_ranks
1504
1505 Calculate the reading ranks (that is, their aligned positions relative
1506 to each other) for the graph.  This can only be called on linear collations.
1507
1508 =begin testing
1509
1510 use Text::Tradition;
1511
1512 my $cxfile = 't/data/Collatex-16.xml';
1513 my $t = Text::Tradition->new(
1514     'name'  => 'inline',
1515     'input' => 'CollateX',
1516     'file'  => $cxfile,
1517     );
1518 my $c = $t->collation;
1519
1520 # Make an svg
1521 my $table = $c->alignment_table;
1522 ok( $c->has_cached_table, "Alignment table was cached" );
1523 is( $c->alignment_table, $table, "Cached table returned upon second call" );
1524 $c->calculate_ranks;
1525 is( $c->alignment_table, $table, "Cached table retained with no rank change" );
1526 $c->add_relationship( 'n24', 'n23', { 'type' => 'spelling' } );
1527 isnt( $c->alignment_table, $table, "Alignment table changed after relationship add" );
1528
1529 =end testing
1530
1531 =cut
1532
1533 sub calculate_ranks {
1534     my $self = shift;
1535     # Save the existing ranks, in case we need to invalidate the cached SVG.
1536     my %existing_ranks;
1537     map { $existing_ranks{$_} = $_->rank } $self->readings;
1538
1539     # Do the rankings based on the relationship equivalence graph, starting
1540     # with the start node.
1541     my ( $node_ranks, $rank_nodes ) = $self->relations->equivalence_ranks();
1542
1543     # Transfer our rankings from the topological graph to the real one.
1544     foreach my $r ( $self->readings ) {
1545         if( $r->is_nonrel ) {
1546                 # These are not in the equivalence graph.  Grab the rank of the highest
1547                 # predecessor + 1.
1548                 my @preds = $self->sequence->predecessors( $r );
1549                 my $mrank = 0;
1550                 map { my $rk = $node_ranks->{$self->equivalence( $_ )} + 1;
1551                         $mrank = $rk > $mrank ? $rk : $mrank; }
1552                         $self->sequence->predecessors( $r );
1553                 throw( "All predecessors of $r unranked!" ) unless $mrank;
1554                 $r->rank( $mrank );
1555         } elsif( defined $node_ranks->{$self->equivalence( $r->id )} ) {
1556             $r->rank( $node_ranks->{$self->equivalence( $r->id )} );
1557         } else {
1558                 # Die. Find the last rank we calculated.
1559                 my @all_defined = sort { ( $node_ranks->{$self->equivalence( $a->id )}||-1 )
1560                                  <=> ( $node_ranks->{$self->equivalence( $b->id )}||-1 ) }
1561                         $self->readings;
1562                 my $last = pop @all_defined;
1563             throw( "Ranks not calculated after $last - do you have a cycle in the graph?" );
1564         }
1565     }
1566     # Do we need to invalidate the cached data?
1567     if( $self->has_cached_svg || $self->has_cached_table ) {
1568         foreach my $r ( $self->readings ) {
1569                 next if defined( $existing_ranks{$r} )
1570                         && $existing_ranks{$r} == $r->rank;
1571                 # Something has changed, so clear the cache
1572                 $self->_clear_cache;
1573                         # ...and recalculate the common readings.
1574                         $self->calculate_common_readings();
1575                 last;
1576         }
1577     }
1578         # The graph calculation information is now up to date.
1579         $self->_graphcalc_done(1);
1580 }
1581
1582 sub _clear_cache {
1583         my $self = shift;
1584         $self->wipe_svg if $self->has_cached_svg;
1585         $self->wipe_table if $self->has_cached_table;
1586 }
1587
1588
1589 =head2 flatten_ranks
1590
1591 A convenience method for parsing collation data.  Searches the graph for readings
1592 with the same text at the same rank, and merges any that are found.
1593
1594 =cut
1595
1596 sub flatten_ranks {
1597     my $self = shift;
1598     my %unique_rank_rdg;
1599     my $changed;
1600     foreach my $rdg ( $self->readings ) {
1601         next unless $rdg->has_rank;
1602         my $key = $rdg->rank . "||" . $rdg->text;
1603         if( exists $unique_rank_rdg{$key} ) {
1604             # Combine!
1605                 # print STDERR "Combining readings at same rank: $key\n";
1606                 $changed = 1;
1607             $self->merge_readings( $unique_rank_rdg{$key}, $rdg );
1608             # TODO see if this now makes a common point.
1609         } else {
1610             $unique_rank_rdg{$key} = $rdg;
1611         }
1612     }
1613     # If we merged readings, the ranks are still fine but the alignment
1614     # table is wrong. Wipe it.
1615     $self->wipe_table() if $changed;
1616 }
1617
1618
1619 =head2 calculate_common_readings
1620
1621 Goes through the graph identifying the readings that appear in every witness
1622 (apart from those with lacunae at that spot.) Marks them as common and returns
1623 the list.
1624
1625 =begin testing
1626
1627 use Text::Tradition;
1628
1629 my $cxfile = 't/data/Collatex-16.xml';
1630 my $t = Text::Tradition->new(
1631     'name'  => 'inline',
1632     'input' => 'CollateX',
1633     'file'  => $cxfile,
1634     );
1635 my $c = $t->collation;
1636
1637 my @common = $c->calculate_common_readings();
1638 is( scalar @common, 8, "Found correct number of common readings" );
1639 my @marked = sort $c->common_readings();
1640 is( scalar @common, 8, "All common readings got marked as such" );
1641 my @expected = qw/ n1 n11 n16 n19 n20 n5 n6 n7 /;
1642 is_deeply( \@marked, \@expected, "Found correct list of common readings" );
1643
1644 =end testing
1645
1646 =cut
1647
1648 sub calculate_common_readings {
1649         my $self = shift;
1650         my @common;
1651         map { $_->is_common( 0 ) } $self->readings;
1652         # Implicitly calls calculate_ranks
1653         my $table = $self->alignment_table;
1654         foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
1655                 my @row = map { $_->{'tokens'}->[$idx]
1656                                                         ? $_->{'tokens'}->[$idx]->{'t'} : '' }
1657                                         @{$table->{'alignment'}};
1658                 my %hash;
1659                 foreach my $r ( @row ) {
1660                         if( $r ) {
1661                                 $hash{$r->id} = $r unless $r->is_meta;
1662                         } else {
1663                                 $hash{'UNDEF'} = $r;
1664                         }
1665                 }
1666                 if( keys %hash == 1 && !exists $hash{'UNDEF'} ) {
1667                         my( $r ) = values %hash;
1668                         $r->is_common( 1 );
1669                         push( @common, $r );
1670                 }
1671         }
1672         return @common;
1673 }
1674
1675 =head2 text_from_paths
1676
1677 Calculate the text array for all witnesses from the path, for later consistency
1678 checking.  Only to be used if there is no non-graph-based way to know the
1679 original texts.
1680
1681 =cut
1682
1683 sub text_from_paths {
1684         my $self = shift;
1685     foreach my $wit ( $self->tradition->witnesses ) {
1686         my @readings = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
1687         my @text;
1688         foreach my $r ( @readings ) {
1689                 next if $r->is_meta;
1690                 push( @text, $r->text );
1691         }
1692         $wit->text( \@text );
1693         if( $wit->is_layered ) {
1694                         my @ucrdgs = $self->reading_sequence( $self->start, $self->end,
1695                                                                                                   $wit->sigil.$self->ac_label );
1696                         my @uctext;
1697                         foreach my $r ( @ucrdgs ) {
1698                                 next if $r->is_meta;
1699                                 push( @uctext, $r->text );
1700                         }
1701                         $wit->layertext( \@uctext );
1702         }
1703     }
1704 }
1705
1706 =head1 UTILITY FUNCTIONS
1707
1708 =head2 common_predecessor( $reading_a, $reading_b )
1709
1710 Find the last reading that occurs in sequence before both the given readings.
1711 At the very least this should be $self->start.
1712
1713 =head2 common_successor( $reading_a, $reading_b )
1714
1715 Find the first reading that occurs in sequence after both the given readings.
1716 At the very least this should be $self->end.
1717
1718 =begin testing
1719
1720 use Text::Tradition;
1721
1722 my $cxfile = 't/data/Collatex-16.xml';
1723 my $t = Text::Tradition->new(
1724     'name'  => 'inline',
1725     'input' => 'CollateX',
1726     'file'  => $cxfile,
1727     );
1728 my $c = $t->collation;
1729
1730 is( $c->common_predecessor( 'n24', 'n23' )->id,
1731     'n20', "Found correct common predecessor" );
1732 is( $c->common_successor( 'n24', 'n23' )->id,
1733     '__END__', "Found correct common successor" );
1734
1735 is( $c->common_predecessor( 'n19', 'n17' )->id,
1736     'n16', "Found correct common predecessor for readings on same path" );
1737 is( $c->common_successor( 'n21', 'n10' )->id,
1738     '__END__', "Found correct common successor for readings on same path" );
1739
1740 =end testing
1741
1742 =cut
1743
1744 ## Return the closest reading that is a predecessor of both the given readings.
1745 sub common_predecessor {
1746         my $self = shift;
1747         my( $r1, $r2 ) = $self->_objectify_args( @_ );
1748         return $self->_common_in_path( $r1, $r2, 'predecessors' );
1749 }
1750
1751 sub common_successor {
1752         my $self = shift;
1753         my( $r1, $r2 ) = $self->_objectify_args( @_ );
1754         return $self->_common_in_path( $r1, $r2, 'successors' );
1755 }
1756
1757
1758 # TODO think about how to do this without ranks...
1759 sub _common_in_path {
1760         my( $self, $r1, $r2, $dir ) = @_;
1761         my $iter = $self->end->rank;
1762         my @candidates;
1763         my @last_r1 = ( $r1 );
1764         my @last_r2 = ( $r2 );
1765         # my %all_seen = ( $r1 => 'r1', $r2 => 'r2' );
1766         my %all_seen;
1767         # print STDERR "Finding common $dir for $r1, $r2\n";
1768         while( !@candidates ) {
1769                 last unless $iter--;  # Avoid looping infinitely
1770                 # Iterate separately down the graph from r1 and r2
1771                 my( @new_lc1, @new_lc2 );
1772                 foreach my $lc ( @last_r1 ) {
1773                         foreach my $p ( $lc->$dir ) {
1774                                 if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r1' ) {
1775                                         # print STDERR "Path candidate $p from $lc\n";
1776                                         push( @candidates, $p );
1777                                 } elsif( !$all_seen{$p->id} ) {
1778                                         $all_seen{$p->id} = 'r1';
1779                                         push( @new_lc1, $p );
1780                                 }
1781                         }
1782                 }
1783                 foreach my $lc ( @last_r2 ) {
1784                         foreach my $p ( $lc->$dir ) {
1785                                 if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r2' ) {
1786                                         # print STDERR "Path candidate $p from $lc\n";
1787                                         push( @candidates, $p );
1788                                 } elsif( !$all_seen{$p->id} ) {
1789                                         $all_seen{$p->id} = 'r2';
1790                                         push( @new_lc2, $p );
1791                                 }
1792                         }
1793                 }
1794                 @last_r1 = @new_lc1;
1795                 @last_r2 = @new_lc2;
1796         }
1797         my @answer = sort { $a->rank <=> $b->rank } @candidates;
1798         return $dir eq 'predecessors' ? pop( @answer ) : shift ( @answer );
1799 }
1800
1801 sub throw {
1802         Text::Tradition::Error->throw(
1803                 'ident' => 'Collation error',
1804                 'message' => $_[0],
1805                 );
1806 }
1807
1808 no Moose;
1809 __PACKAGE__->meta->make_immutable;
1810
1811 =head1 LICENSE
1812
1813 This package is free software and is provided "as is" without express
1814 or implied warranty.  You can redistribute it and/or modify it under
1815 the same terms as Perl itself.
1816
1817 =head1 AUTHOR
1818
1819 Tara L Andrews E<lt>aurum@cpan.orgE<gt>