lib/Text/Tradition/Collation.pm

   1 package Text::Tradition::Collation;
   2
   3 use Encode qw( decode_utf8 );
   4 use File::Temp;
   5 use File::Which;
   6 use Graph;
   7 use IPC::Run qw( run binary );
   8 use Text::CSV_XS;
   9 use Text::Tradition::Collation::Reading;
  10 use Text::Tradition::Collation::RelationshipStore;
  11 use Text::Tradition::Error;
  12 use XML::LibXML;
  13 use XML::LibXML::XPathContext;
  14 use Moose;
  15
  16 has 'sequence' => (
  17     is => 'ro',
  18     isa => 'Graph',
  19     default => sub { Graph->new() },
  20     handles => {
  21         paths => 'edges',
  22     },
  23     );
  24
  25 has 'relations' => (
  26         is => 'ro',
  27         isa => 'Text::Tradition::Collation::RelationshipStore',
  28         handles => {
  29                 relationships => 'relationships',
  30                 related_readings => 'related_readings',
  31                 del_relationship => 'del_relationship',
  32         },
  33         writer => '_set_relations',
  34         );
  35
  36 has 'tradition' => (
  37     is => 'ro',
  38     isa => 'Text::Tradition',
  39     weak_ref => 1,
  40     );
  41
  42 has 'readings' => (
  43         isa => 'HashRef[Text::Tradition::Collation::Reading]',
  44         traits => ['Hash'],
  45     handles => {
  46         reading     => 'get',
  47         _add_reading => 'set',
  48         del_reading => 'delete',
  49         has_reading => 'exists',
  50         readings   => 'values',
  51     },
  52     default => sub { {} },
  53         );
  54
  55 has 'wit_list_separator' => (
  56     is => 'rw',
  57     isa => 'Str',
  58     default => ', ',
  59     );
  60
  61 has 'baselabel' => (
  62     is => 'rw',
  63     isa => 'Str',
  64     default => 'base text',
  65     );
  66
  67 has 'linear' => (
  68     is => 'rw',
  69     isa => 'Bool',
  70     default => 1,
  71     );
  72
  73 has 'ac_label' => (
  74     is => 'rw',
  75     isa => 'Str',
  76     default => ' (a.c.)',
  77     );
  78
  79 has 'start' => (
  80         is => 'ro',
  81         isa => 'Text::Tradition::Collation::Reading',
  82         writer => '_set_start',
  83         weak_ref => 1,
  84         );
  85
  86 has 'end' => (
  87         is => 'ro',
  88         isa => 'Text::Tradition::Collation::Reading',
  89         writer => '_set_end',
  90         weak_ref => 1,
  91         );
  92
  93 has 'cached_svg' => (
  94         is => 'rw',
  95         isa => 'Str',
  96         predicate => 'has_cached_svg',
  97         clearer => 'wipe_svg',
  98         );
  99
 100 has 'cached_table' => (
 101         is => 'rw',
 102         isa => 'HashRef',
 103         predicate => 'has_cached_table',
 104         clearer => 'wipe_table',
 105         );
 106
 107 =head1 NAME
 108
 109 Text::Tradition::Collation - a software model for a text collation
 110
 111 =head1 SYNOPSIS
 112
 113   use Text::Tradition;
 114   my $t = Text::Tradition->new(
 115     'name' => 'this is a text',
 116     'input' => 'TEI',
 117     'file' => '/path/to/tei_parallel_seg_file.xml' );
 118
 119   my $c = $t->collation;
 120   my @readings = $c->readings;
 121   my @paths = $c->paths;
 122   my @relationships = $c->relationships;
 123
 124   my $svg_variant_graph = $t->collation->as_svg();
 125
 126 =head1 DESCRIPTION
 127
 128 Text::Tradition is a library for representation and analysis of collated
 129 texts, particularly medieval ones.  The Collation is the central feature of
 130 a Tradition, where the text, its sequence of readings, and its relationships
 131 between readings are actually kept.
 132
 133 =head1 CONSTRUCTOR
 134
 135 =head2 new
 136
 137 The constructor.  Takes a hash or hashref of the following arguments:
 138
 139 =over
 140
 141 =item * tradition - The Text::Tradition object to which the collation
 142 belongs. Required.
 143
 144 =item * linear - Whether the collation should be linear; that is, whether
 145 transposed readings should be treated as two linked readings rather than one,
 146 and therefore whether the collation graph is acyclic.  Defaults to true.
 147
 148 =item * baselabel - The default label for the path taken by a base text
 149 (if any). Defaults to 'base text'.
 150
 151 =item * wit_list_separator - The string to join a list of witnesses for
 152 purposes of making labels in display graphs.  Defaults to ', '.
 153
 154 =item * ac_label - The extra label to tack onto a witness sigil when
 155 representing another layer of path for the given witness - that is, when
 156 a text has more than one possible reading due to scribal corrections or
 157 the like.  Defaults to ' (a.c.)'.
 158
 159 =back
 160
 161 =head1 ACCESSORS
 162
 163 =head2 tradition
 164
 165 =head2 linear
 166
 167 =head2 wit_list_separator
 168
 169 =head2 baselabel
 170
 171 =head2 ac_label
 172
 173 Simple accessors for collation attributes.
 174
 175 =head2 start
 176
 177 The meta-reading at the start of every witness path.
 178
 179 =head2 end
 180
 181 The meta-reading at the end of every witness path.
 182
 183 =head2 readings
 184
 185 Returns all Reading objects in the graph.
 186
 187 =head2 reading( $id )
 188
 189 Returns the Reading object corresponding to the given ID.
 190
 191 =head2 add_reading( $reading_args )
 192
 193 Adds a new reading object to the collation.
 194 See L<Text::Tradition::Collation::Reading> for the available arguments.
 195
 196 =head2 del_reading( $object_or_id )
 197
 198 Removes the given reading from the collation, implicitly removing its
 199 paths and relationships.
 200
 201 =head2 merge_readings( $main, $second )
 202
 203 Merges the $second reading into the $main one.
 204 The arguments may be either readings or reading IDs.
 205
 206 =head2 has_reading( $id )
 207
 208 Predicate to see whether a given reading ID is in the graph.
 209
 210 =head2 reading_witnesses( $object_or_id )
 211
 212 Returns a list of sigils whose witnesses contain the reading.
 213
 214 =head2 paths
 215
 216 Returns all reading paths within the document - that is, all edges in the
 217 collation graph.  Each path is an arrayref of [ $source, $target ] reading IDs.
 218
 219 =head2 add_path( $source, $target, $sigil )
 220
 221 Links the given readings in the collation in sequence, under the given witness
 222 sigil.  The readings may be specified by object or ID.
 223
 224 =head2 del_path( $source, $target, $sigil )
 225
 226 Links the given readings in the collation in sequence, under the given witness
 227 sigil.  The readings may be specified by object or ID.
 228
 229 =head2 has_path( $source, $target );
 230
 231 Returns true if the two readings are linked in sequence in any witness.
 232 The readings may be specified by object or ID.
 233
 234 =head2 relationships
 235
 236 Returns all Relationship objects in the collation.
 237
 238 =head2 add_relationship( $reading, $other_reading, $options )
 239
 240 Adds a new relationship of the type given in $options between the two readings,
 241 which may be specified by object or ID.  Returns a value of ( $status, @vectors)
 242 where $status is true on success, and @vectors is a list of relationship edges
 243 that were ultimately added.
 244 See L<Text::Tradition::Collation::Relationship> for the available options.
 245
 246 =cut
 247
 248 sub BUILD {
 249     my $self = shift;
 250     $self->_set_relations( Text::Tradition::Collation::RelationshipStore->new( 'collation' => $self ) );
 251     $self->_set_start( $self->add_reading( { 'collation' => $self, 'is_start' => 1 } ) );
 252     $self->_set_end( $self->add_reading( { 'collation' => $self, 'is_end' => 1 } ) );
 253 }
 254
 255 ### Reading construct/destruct functions
 256
 257 sub _clear_cache {
 258         my $self = shift;
 259         $self->wipe_svg if $self->has_cached_svg;
 260         $self->wipe_table if $self->has_cached_table;
 261 }
 262
 263 sub add_reading {
 264         my( $self, $reading ) = @_;
 265         unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
 266                 my %args = %$reading;
 267                 $reading = Text::Tradition::Collation::Reading->new(
 268                         'collation' => $self,
 269                         %args );
 270         }
 271         # First check to see if a reading with this ID exists.
 272         if( $self->reading( $reading->id ) ) {
 273                 throw( "Collation already has a reading with id " . $reading->id );
 274         }
 275         $self->_clear_cache;
 276         $self->_add_reading( $reading->id => $reading );
 277         # Once the reading has been added, put it in both graphs.
 278         $self->sequence->add_vertex( $reading->id );
 279         $self->relations->add_reading( $reading->id );
 280         return $reading;
 281 };
 282
 283 around del_reading => sub {
 284         my $orig = shift;
 285         my $self = shift;
 286         my $arg = shift;
 287
 288         if( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
 289                 $arg = $arg->id;
 290         }
 291         # Remove the reading from the graphs.
 292         $self->_clear_cache;
 293         $self->sequence->delete_vertex( $arg );
 294         $self->relations->delete_reading( $arg );
 295
 296         # Carry on.
 297         $self->$orig( $arg );
 298 };
 299
 300 # merge_readings( $main, $to_be_deleted );
 301
 302 sub merge_readings {
 303         my $self = shift;
 304
 305         # We only need the IDs for adding paths to the graph, not the reading
 306         # objects themselves.
 307     my( $kept, $deleted, $combine_char ) = $self->_stringify_args( @_ );
 308         $self->_clear_cache;
 309
 310     # The kept reading should inherit the paths and the relationships
 311     # of the deleted reading.
 312         foreach my $path ( $self->sequence->edges_at( $deleted ) ) {
 313                 my @vector = ( $kept );
 314                 push( @vector, $path->[1] ) if $path->[0] eq $deleted;
 315                 unshift( @vector, $path->[0] ) if $path->[1] eq $deleted;
 316                 next if $vector[0] eq $vector[1]; # Don't add a self loop
 317                 my %wits = %{$self->sequence->get_edge_attributes( @$path )};
 318                 $self->sequence->add_edge( @vector );
 319                 my $fwits = $self->sequence->get_edge_attributes( @vector );
 320                 @wits{keys %$fwits} = values %$fwits;
 321                 $self->sequence->set_edge_attributes( @vector, \%wits );
 322         }
 323         $self->relations->merge_readings( $kept, $deleted, $combine_char );
 324
 325         # Do the deletion deed.
 326         if( $combine_char ) {
 327                 my $kept_obj = $self->reading( $kept );
 328                 my $new_text = join( $combine_char, $kept_obj->text,
 329                         $self->reading( $deleted )->text );
 330                 $kept_obj->alter_text( $new_text );
 331         }
 332         $self->del_reading( $deleted );
 333 }
 334
 335
 336 # Helper function for manipulating the graph.
 337 sub _stringify_args {
 338         my( $self, $first, $second, $arg ) = @_;
 339     $first = $first->id
 340         if ref( $first ) eq 'Text::Tradition::Collation::Reading';
 341     $second = $second->id
 342         if ref( $second ) eq 'Text::Tradition::Collation::Reading';
 343     return( $first, $second, $arg );
 344 }
 345
 346 # Helper function for manipulating the graph.
 347 sub _objectify_args {
 348         my( $self, $first, $second, $arg ) = @_;
 349     $first = $self->reading( $first )
 350         unless ref( $first ) eq 'Text::Tradition::Collation::Reading';
 351     $second = $self->reading( $second )
 352         unless ref( $second ) eq 'Text::Tradition::Collation::Reading';
 353     return( $first, $second, $arg );
 354 }
 355 ### Path logic
 356
 357 sub add_path {
 358         my $self = shift;
 359
 360         # We only need the IDs for adding paths to the graph, not the reading
 361         # objects themselves.
 362     my( $source, $target, $wit ) = $self->_stringify_args( @_ );
 363
 364         $self->_clear_cache;
 365         # Connect the readings
 366     $self->sequence->add_edge( $source, $target );
 367     # Note the witness in question
 368     $self->sequence->set_edge_attribute( $source, $target, $wit, 1 );
 369 };
 370
 371 sub del_path {
 372         my $self = shift;
 373         my @args;
 374         if( ref( $_[0] ) eq 'ARRAY' ) {
 375                 my $e = shift @_;
 376                 @args = ( @$e, @_ );
 377         } else {
 378                 @args = @_;
 379         }
 380
 381         # We only need the IDs for adding paths to the graph, not the reading
 382         # objects themselves.
 383     my( $source, $target, $wit ) = $self->_stringify_args( @args );
 384
 385         $self->_clear_cache;
 386         if( $self->sequence->has_edge_attribute( $source, $target, $wit ) ) {
 387                 $self->sequence->delete_edge_attribute( $source, $target, $wit );
 388         }
 389         unless( keys %{$self->sequence->get_edge_attributes( $source, $target )} ) {
 390                 $self->sequence->delete_edge( $source, $target );
 391         }
 392 }
 393
 394
 395 # Extra graph-alike utility
 396 sub has_path {
 397         my $self = shift;
 398     my( $source, $target, $wit ) = $self->_stringify_args( @_ );
 399         return undef unless $self->sequence->has_edge( $source, $target );
 400         return $self->sequence->has_edge_attribute( $source, $target, $wit );
 401 }
 402
 403 =head2 clear_witness( @sigil_list )
 404
 405 Clear the given witnesses out of the collation entirely, removing references
 406 to them in paths, and removing readings that belong only to them.  Should only
 407 be called via $tradition->del_witness.
 408
 409 =cut
 410
 411 sub clear_witness {
 412         my( $self, @sigils ) = @_;
 413
 414         $self->_clear_cache;
 415         # Clear the witness(es) out of the paths
 416         foreach my $e ( $self->paths ) {
 417                 foreach my $sig ( @sigils ) {
 418                         $self->del_path( $e, $sig );
 419                 }
 420         }
 421
 422         # Clear out the newly unused readings
 423         foreach my $r ( $self->readings ) {
 424                 unless( $self->reading_witnesses( $r ) ) {
 425                         $self->del_reading( $r );
 426                 }
 427         }
 428 }
 429
 430 sub add_relationship {
 431         my $self = shift;
 432     my( $source, $target, $opts ) = $self->_stringify_args( @_ );
 433     my( @vectors ) = $self->relations->add_relationship( $source,
 434         $self->reading( $source ), $target, $self->reading( $target ), $opts );
 435     # Force a full rank recalculation every time. Yuck.
 436     $self->calculate_ranks() if $self->end->has_rank;
 437         $self->_clear_cache;
 438     return @vectors;
 439 }
 440
 441 =head2 reading_witnesses( $reading )
 442
 443 Return a list of sigils corresponding to the witnesses in which the reading appears.
 444
 445 =cut
 446
 447 sub reading_witnesses {
 448         my( $self, $reading ) = @_;
 449         # We need only check either the incoming or the outgoing edges; I have
 450         # arbitrarily chosen "incoming".  Thus, special-case the start node.
 451         if( $reading eq $self->start ) {
 452                 return map { $_->sigil } $self->tradition->witnesses;
 453         }
 454         my %all_witnesses;
 455         foreach my $e ( $self->sequence->edges_to( $reading ) ) {
 456                 my $wits = $self->sequence->get_edge_attributes( @$e );
 457                 @all_witnesses{ keys %$wits } = 1;
 458         }
 459         my $acstr = $self->ac_label;
 460         foreach my $acwit ( grep { $_ =~ s/^(.*)\Q$acstr\E$/$1/ } keys %all_witnesses ) {
 461                 delete $all_witnesses{$acwit.$acstr} if exists $all_witnesses{$acwit};
 462         }
 463         return keys %all_witnesses;
 464 }
 465
 466 =head1 OUTPUT METHODS
 467
 468 =head2 as_svg( \%options )
 469
 470 Returns an SVG string that represents the graph, via as_dot and graphviz.
 471 See as_dot for a list of options.  Must have GraphViz (dot) installed to run.
 472
 473 =cut
 474
 475 sub as_svg {
 476     my( $self, $opts ) = @_;
 477     throw( "Need GraphViz installed to output SVG" )
 478         unless File::Which::which( 'dot' );
 479     my $want_subgraph = exists $opts->{'from'} || exists $opts->{'to'};
 480     if( !$self->has_cached_svg || $opts->{'recalc'}     || $want_subgraph ) {
 481                 my @cmd = qw/dot -Tsvg/;
 482                 my( $svg, $err );
 483                 my $dotfile = File::Temp->new();
 484                 ## USE FOR DEBUGGING
 485                 # $dotfile->unlink_on_destroy(0);
 486                 binmode $dotfile, ':utf8';
 487                 print $dotfile $self->as_dot( $opts );
 488                 push( @cmd, $dotfile->filename );
 489                 run( \@cmd, ">", binary(), \$svg );
 490                 $svg = decode_utf8( $svg );
 491                 $self->cached_svg( $svg ) unless $want_subgraph;
 492                 return $svg;
 493         } else {
 494                 return $self->cached_svg;
 495         }
 496 }
 497
 498
 499 =head2 as_dot( \%options )
 500
 501 Returns a string that is the collation graph expressed in dot
 502 (i.e. GraphViz) format.  Options include:
 503
 504 =over 4
 505
 506 =item * from
 507
 508 =item * to
 509
 510 =item * color_common
 511
 512 =back
 513
 514 =cut
 515
 516 sub as_dot {
 517     my( $self, $opts ) = @_;
 518     my $startrank = $opts->{'from'} if $opts;
 519     my $endrank = $opts->{'to'} if $opts;
 520     my $color_common = $opts->{'color_common'} if $opts;
 521     my $STRAIGHTENHACK = !$startrank && !$endrank && $self->end->rank
 522        && $self->end->rank > 100;
 523
 524     # Check the arguments
 525     if( $startrank ) {
 526         return if $endrank && $startrank > $endrank;
 527         return if $startrank > $self->end->rank;
 528         }
 529         if( defined $endrank ) {
 530                 return if $endrank < 0;
 531                 $endrank = undef if $endrank == $self->end->rank;
 532         }
 533
 534     my $graph_name = $self->tradition->name;
 535     $graph_name =~ s/[^\w\s]//g;
 536     $graph_name = join( '_', split( /\s+/, $graph_name ) );
 537
 538     my %graph_attrs = (
 539         'rankdir' => 'LR',
 540         'bgcolor' => 'none',
 541         );
 542     my %node_attrs = (
 543         'fontsize' => 14,
 544         'fillcolor' => 'white',
 545         'style' => 'filled',
 546         'shape' => 'ellipse'
 547         );
 548     my %edge_attrs = (
 549         'arrowhead' => 'open',
 550         'color' => '#000000',
 551         'fontcolor' => '#000000',
 552         );
 553
 554     my $dot = sprintf( "digraph %s {\n", $graph_name );
 555     $dot .= "\tgraph " . _dot_attr_string( \%graph_attrs ) . ";\n";
 556     $dot .= "\tnode " . _dot_attr_string( \%node_attrs ) . ";\n";
 557
 558         # Output substitute start/end readings if necessary
 559         if( $startrank ) {
 560                 $dot .= "\t\"#SUBSTART#\" [ label=\"...\" ];\n";
 561         }
 562         if( $endrank ) {
 563                 $dot .= "\t\"#SUBEND#\" [ label=\"...\" ];\n";
 564         }
 565         if( $STRAIGHTENHACK ) {
 566                 ## HACK part 1
 567                 $dot .= "\tsubgraph { rank=same \"#START#\" \"#SILENT#\" }\n";
 568                 $dot .= "\t\"#SILENT#\" [ shape=diamond,color=white,penwidth=0,label=\"\" ];"
 569         }
 570         my %used;  # Keep track of the readings that actually appear in the graph
 571         # Sort the readings by rank if we have ranks; this speeds layout.
 572         my @all_readings = $self->end->has_rank
 573                 ? sort { $a->rank <=> $b->rank } $self->readings
 574                 : $self->readings;
 575         # TODO Refrain from outputting lacuna nodes - just grey out the edges.
 576     foreach my $reading ( @all_readings ) {
 577         # Only output readings within our rank range.
 578         next if $startrank && $reading->rank < $startrank;
 579         next if $endrank && $reading->rank > $endrank;
 580         $used{$reading->id} = 1;
 581         # Need not output nodes without separate labels
 582         next if $reading->id eq $reading->text;
 583         my $rattrs;
 584         my $label = $reading->text;
 585         $label =~ s/\"/\\\"/g;
 586                 $rattrs->{'label'} = $label;
 587                 $rattrs->{'fillcolor'} = '#b3f36d' if $reading->is_common && $color_common;
 588         $dot .= sprintf( "\t\"%s\" %s;\n", $reading->id, _dot_attr_string( $rattrs ) );
 589     }
 590
 591         # Add the real edges. Need to weight one edge per rank jump, in a
 592         # continuous line.
 593         # my $weighted = $self->_add_edge_weights;
 594     my @edges = $self->paths;
 595         my( %substart, %subend );
 596     foreach my $edge ( @edges ) {
 597         # Do we need to output this edge?
 598         if( $used{$edge->[0]} && $used{$edge->[1]} ) {
 599                 my $label = $self->_path_display_label( $self->path_witnesses( $edge ) );
 600                         my $variables = { %edge_attrs, 'label' => $label };
 601
 602                         # Account for the rank gap if necessary
 603                         my $rank0 = $self->reading( $edge->[0] )->rank
 604                                 if $self->reading( $edge->[0] )->has_rank;
 605                         my $rank1 = $self->reading( $edge->[1] )->rank
 606                                 if $self->reading( $edge->[1] )->has_rank;
 607                         if( defined $rank0 && defined $rank1 && $rank1 - $rank0 > 1 ) {
 608                                 $variables->{'minlen'} = $rank1 - $rank0;
 609                         }
 610
 611                         # Add the calculated edge weights
 612                         # if( exists $weighted->{$edge->[0]}
 613                         #       && $weighted->{$edge->[0]} eq $edge->[1] ) {
 614                         #       # $variables->{'color'} = 'red';
 615                         #       $variables->{'weight'} = 3.0;
 616                         # }
 617
 618                         # EXPERIMENTAL: make edge width reflect no. of witnesses
 619                         my $extrawidth = scalar( $self->path_witnesses( $edge ) ) * 0.2;
 620                         $variables->{'penwidth'} = $extrawidth + 0.8; # gives 1 for a single wit
 621
 622                         my $varopts = _dot_attr_string( $variables );
 623                         $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
 624                                 $edge->[0], $edge->[1], $varopts );
 625         } elsif( $used{$edge->[0]} ) {
 626                 $subend{$edge->[0]} = 1;
 627         } elsif( $used{$edge->[1]} ) {
 628                 $substart{$edge->[1]} = 1;
 629         }
 630     }
 631     # Add substitute start and end edges if necessary
 632     foreach my $node ( keys %substart ) {
 633         my $witstr = $self->_path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) );
 634         my $variables = { %edge_attrs, 'label' => $witstr };
 635         my $varopts = _dot_attr_string( $variables );
 636         $dot .= "\t\"#SUBSTART#\" -> \"$node\" $varopts;";
 637         }
 638     foreach my $node ( keys %subend ) {
 639         my $witstr = $self->_path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) );
 640         my $variables = { %edge_attrs, 'label' => $witstr };
 641         my $varopts = _dot_attr_string( $variables );
 642         $dot .= "\t\"$node\" -> \"#SUBEND#\" $varopts;";
 643         }
 644         # HACK part 2
 645         if( $STRAIGHTENHACK ) {
 646                 $dot .= "\t\"#END#\" -> \"#SILENT#\" [ color=white,penwidth=0 ];\n";
 647         }
 648
 649     $dot .= "}\n";
 650     return $dot;
 651 }
 652
 653 sub _dot_attr_string {
 654         my( $hash ) = @_;
 655         my @attrs;
 656         foreach my $k ( sort keys %$hash ) {
 657                 my $v = $hash->{$k};
 658                 push( @attrs, $k.'="'.$v.'"' );
 659         }
 660         return( '[ ' . join( ', ', @attrs ) . ' ]' );
 661 }
 662
 663 sub _add_edge_weights {
 664         my $self = shift;
 665         # Walk the graph from START to END, choosing the successor node with
 666         # the largest number of witness paths each time.
 667         my $weighted = {};
 668         my $curr = $self->start->id;
 669         my $ranked = $self->end->has_rank;
 670         while( $curr ne $self->end->id ) {
 671                 my $rank = $ranked ? $self->reading( $curr )->rank : 0;
 672                 my @succ = sort { $self->path_witnesses( $curr, $a )
 673                                                         <=> $self->path_witnesses( $curr, $b ) }
 674                         $self->sequence->successors( $curr );
 675                 my $next = pop @succ;
 676                 my $nextrank = $ranked ? $self->reading( $next )->rank : 0;
 677                 # Try to avoid lacunae in the weighted path.
 678                 while( @succ &&
 679                            ( $self->reading( $next )->is_lacuna ||
 680                                  $nextrank - $rank > 1 ) ){
 681                         $next = pop @succ;
 682                 }
 683                 $weighted->{$curr} = $next;
 684                 $curr = $next;
 685         }
 686         return $weighted;
 687 }
 688
 689 =head2 path_witnesses( $edge )
 690
 691 Returns the list of sigils whose witnesses are associated with the given edge.
 692 The edge can be passed as either an array or an arrayref of ( $source, $target ).
 693
 694 =cut
 695
 696 sub path_witnesses {
 697         my( $self, @edge ) = @_;
 698         # If edge is an arrayref, cope.
 699         if( @edge == 1 && ref( $edge[0] ) eq 'ARRAY' ) {
 700                 my $e = shift @edge;
 701                 @edge = @$e;
 702         }
 703         my @wits = keys %{$self->sequence->get_edge_attributes( @edge )};
 704         return @wits;
 705 }
 706
 707 sub _path_display_label {
 708         my $self = shift;
 709         my @wits = sort @_;
 710         my $maj = scalar( $self->tradition->witnesses ) * 0.6;
 711         if( scalar @wits > $maj ) {
 712                 # TODO break out a.c. wits
 713                 return 'majority';
 714         } else {
 715                 return join( ', ', @wits );
 716         }
 717 }
 718
 719 =head2 witnesses_at_rank
 720
 721 Returns a list of witnesses that are not lacunose, for a given rank.
 722
 723 =cut
 724
 725 sub witnesses_at_rank {
 726         my( $self, $rank ) = @_;
 727 }
 728
 729 =head2 as_graphml
 730
 731 Returns a GraphML representation of the collation.  The GraphML will contain
 732 two graphs. The first expresses the attributes of the readings and the witness
 733 paths that link them; the second expresses the relationships that link the
 734 readings.  This is the native transfer format for a tradition.
 735
 736 =begin testing
 737
 738 use Text::Tradition;
 739
 740 my $READINGS = 311;
 741 my $PATHS = 361;
 742
 743 my $datafile = 't/data/florilegium_tei_ps.xml';
 744 my $tradition = Text::Tradition->new( 'input' => 'TEI',
 745                                       'name' => 'test0',
 746                                       'file' => $datafile,
 747                                       'linear' => 1 );
 748
 749 ok( $tradition, "Got a tradition object" );
 750 is( scalar $tradition->witnesses, 13, "Found all witnesses" );
 751 ok( $tradition->collation, "Tradition has a collation" );
 752
 753 my $c = $tradition->collation;
 754 is( scalar $c->readings, $READINGS, "Collation has all readings" );
 755 is( scalar $c->paths, $PATHS, "Collation has all paths" );
 756 is( scalar $c->relationships, 0, "Collation has all relationships" );
 757
 758 # Add a few relationships
 759 $c->add_relationship( 'w123', 'w125', { 'type' => 'collated' } );
 760 $c->add_relationship( 'w193', 'w196', { 'type' => 'collated' } );
 761 $c->add_relationship( 'w257', 'w262', { 'type' => 'transposition' } );
 762
 763 # Now write it to GraphML and parse it again.
 764
 765 my $graphml = $c->as_graphml;
 766 my $st = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml );
 767 is( scalar $st->collation->readings, $READINGS, "Reparsed collation has all readings" );
 768 is( scalar $st->collation->paths, $PATHS, "Reparsed collation has all paths" );
 769 is( scalar $st->collation->relationships, 3, "Reparsed collation has new relationships" );
 770
 771 =end testing
 772
 773 =cut
 774
 775 sub as_graphml {
 776     my( $self ) = @_;
 777
 778     # Some namespaces
 779     my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns';
 780     my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance';
 781     my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' .
 782         'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
 783
 784     # Create the document and root node
 785     my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
 786     my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
 787     $graphml->setDocumentElement( $root );
 788     $root->setNamespace( $xsi_ns, 'xsi', 0 );
 789     $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
 790
 791     # Add the data keys for the graph
 792     my %graph_data_keys;
 793     my $gdi = 0;
 794     my @graph_attributes = qw/ version wit_list_separator baselabel linear ac_label /;
 795     foreach my $datum ( @graph_attributes ) {
 796         $graph_data_keys{$datum} = 'dg'.$gdi++;
 797         my $key = $root->addNewChild( $graphml_ns, 'key' );
 798         $key->setAttribute( 'attr.name', $datum );
 799         $key->setAttribute( 'attr.type', $key eq 'linear' ? 'boolean' : 'string' );
 800         $key->setAttribute( 'for', 'graph' );
 801         $key->setAttribute( 'id', $graph_data_keys{$datum} );
 802     }
 803
 804     # Add the data keys for nodes
 805     my %node_data_keys;
 806     my $ndi = 0;
 807     my %node_data = (
 808         id => 'string',
 809         text => 'string',
 810         rank => 'string',
 811         is_start => 'boolean',
 812         is_end => 'boolean',
 813         is_lacuna => 'boolean',
 814         );
 815     foreach my $datum ( keys %node_data ) {
 816         $node_data_keys{$datum} = 'dn'.$ndi++;
 817         my $key = $root->addNewChild( $graphml_ns, 'key' );
 818         $key->setAttribute( 'attr.name', $datum );
 819         $key->setAttribute( 'attr.type', $node_data{$datum} );
 820         $key->setAttribute( 'for', 'node' );
 821         $key->setAttribute( 'id', $node_data_keys{$datum} );
 822     }
 823
 824     # Add the data keys for edges, i.e. witnesses
 825     my $edi = 0;
 826     my %edge_data_keys;
 827     my %edge_data = (
 828         class => 'string',                              # Class, deprecated soon
 829         witness => 'string',                    # ID/label for a path
 830         relationship => 'string',               # ID/label for a relationship
 831         extra => 'boolean',                             # Path key
 832         scope => 'string',                              # Relationship key
 833         non_correctable => 'boolean',   # Relationship key
 834         non_independent => 'boolean',   # Relationship key
 835         );
 836     foreach my $datum ( keys %edge_data ) {
 837         $edge_data_keys{$datum} = 'de'.$edi++;
 838         my $key = $root->addNewChild( $graphml_ns, 'key' );
 839         $key->setAttribute( 'attr.name', $datum );
 840         $key->setAttribute( 'attr.type', $edge_data{$datum} );
 841         $key->setAttribute( 'for', 'edge' );
 842         $key->setAttribute( 'id', $edge_data_keys{$datum} );
 843     }
 844
 845     # Add the collation graph itself
 846     my $sgraph = $root->addNewChild( $graphml_ns, 'graph' );
 847     $sgraph->setAttribute( 'edgedefault', 'directed' );
 848     $sgraph->setAttribute( 'id', $self->tradition->name );
 849     $sgraph->setAttribute( 'parse.edgeids', 'canonical' );
 850     $sgraph->setAttribute( 'parse.edges', scalar($self->paths) );
 851     $sgraph->setAttribute( 'parse.nodeids', 'canonical' );
 852     $sgraph->setAttribute( 'parse.nodes', scalar($self->readings) );
 853     $sgraph->setAttribute( 'parse.order', 'nodesfirst' );
 854
 855     # Collation attribute data
 856     foreach my $datum ( @graph_attributes ) {
 857         my $value = $datum eq 'version' ? '3.0' : $self->$datum;
 858                 _add_graphml_data( $sgraph, $graph_data_keys{$datum}, $value );
 859         }
 860
 861     my $node_ctr = 0;
 862     my %node_hash;
 863     # Add our readings to the graph
 864     foreach my $n ( sort { $a->id cmp $b->id } $self->readings ) {
 865         # Add to the main graph
 866         my $node_el = $sgraph->addNewChild( $graphml_ns, 'node' );
 867         my $node_xmlid = 'n' . $node_ctr++;
 868         $node_hash{ $n->id } = $node_xmlid;
 869         $node_el->setAttribute( 'id', $node_xmlid );
 870         foreach my $d ( keys %node_data ) {
 871                 my $nval = $n->$d;
 872                 _add_graphml_data( $node_el, $node_data_keys{$d}, $nval )
 873                         if defined $nval;
 874         }
 875     }
 876
 877     # Add the path edges to the sequence graph
 878     my $edge_ctr = 0;
 879     foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->sequence->edges() ) {
 880         # We add an edge in the graphml for every witness in $e.
 881         foreach my $wit ( sort $self->path_witnesses( $e ) ) {
 882                         my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
 883                                                                                 $node_hash{ $e->[0] },
 884                                                                                 $node_hash{ $e->[1] } );
 885                         my $edge_el = $sgraph->addNewChild( $graphml_ns, 'edge' );
 886                         $edge_el->setAttribute( 'source', $from );
 887                         $edge_el->setAttribute( 'target', $to );
 888                         $edge_el->setAttribute( 'id', $id );
 889
 890                         # It's a witness path, so add the witness
 891                         my $base = $wit;
 892                         my $key = $edge_data_keys{'witness'};
 893                         # Is this an ante-corr witness?
 894                         my $aclabel = $self->ac_label;
 895                         if( $wit =~ /^(.*)\Q$aclabel\E$/ ) {
 896                                 # Keep the base witness
 897                                 $base = $1;
 898                                 # ...and record that this is an 'extra' reading path
 899                                 _add_graphml_data( $edge_el, $edge_data_keys{'extra'}, $aclabel );
 900                         }
 901                         _add_graphml_data( $edge_el, $edge_data_keys{'witness'}, $base );
 902                         _add_graphml_data( $edge_el, $edge_data_keys{'class'}, 'path' );
 903                 }
 904         }
 905
 906         # Add the relationship graph to the XML
 907         $self->relations->_as_graphml( $graphml_ns, $root, \%node_hash,
 908                 $node_data_keys{'id'}, \%edge_data_keys );
 909
 910     # Save and return the thing
 911     my $result = decode_utf8( $graphml->toString(1) );
 912     return $result;
 913 }
 914
 915 sub _add_graphml_data {
 916     my( $el, $key, $value ) = @_;
 917     return unless defined $value;
 918     my $data_el = $el->addNewChild( $el->namespaceURI, 'data' );
 919     $data_el->setAttribute( 'key', $key );
 920     $data_el->appendText( $value );
 921 }
 922
 923 =head2 as_csv
 924
 925 Returns a CSV alignment table representation of the collation graph, one
 926 row per witness (or witness uncorrected.)
 927
 928 =cut
 929
 930 sub as_csv {
 931     my( $self ) = @_;
 932     my $table = $self->alignment_table;
 933     my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } );
 934     my @result;
 935     # Make the header row
 936     $csv->combine( map { $_->{'witness'} } @{$table->{'alignment'}} );
 937         push( @result, decode_utf8( $csv->string ) );
 938     # Make the rest of the rows
 939     foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
 940         my @rowobjs = map { $_->{'tokens'}->[$idx] } @{$table->{'alignment'}};
 941         my @row = map { $_ ? $_->{'t'}->text : $_ } @rowobjs;
 942         $csv->combine( @row );
 943         push( @result, decode_utf8( $csv->string ) );
 944     }
 945     return join( "\n", @result );
 946 }
 947
 948 =head2 alignment_table( $use_refs, $include_witnesses )
 949
 950 Return a reference to an alignment table, in a slightly enhanced CollateX
 951 format which looks like this:
 952
 953  $table = { alignment => [ { witness => "SIGIL",
 954                              tokens => [ { t => "TEXT" }, ... ] },
 955                            { witness => "SIG2",
 956                              tokens => [ { t => "TEXT" }, ... ] },
 957                            ... ],
 958             length => TEXTLEN };
 959
 960 If $use_refs is set to 1, the reading object is returned in the table
 961 instead of READINGTEXT; if not, the text of the reading is returned.
 962
 963 If $include_witnesses is set to a hashref, only the witnesses whose sigil
 964 keys have a true hash value will be included.
 965
 966 =cut
 967
 968 sub alignment_table {
 969     my( $self ) = @_;
 970     my $include; # see if we can ditch this
 971     return $self->cached_table if $self->has_cached_table;
 972
 973     # Make sure we can do this
 974         throw( "Need a linear graph in order to make an alignment table" )
 975                 unless $self->linear;
 976         $self->calculate_ranks unless $self->end->has_rank;
 977
 978     my $table = { 'alignment' => [], 'length' => $self->end->rank - 1 };
 979     my @all_pos = ( 1 .. $self->end->rank - 1 );
 980     foreach my $wit ( sort { $a->sigil cmp $b->sigil } $self->tradition->witnesses ) {
 981         if( $include ) {
 982                 next unless $include->{$wit->sigil};
 983         }
 984         # print STDERR "Making witness row(s) for " . $wit->sigil . "\n";
 985         my @wit_path = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
 986         my @row = _make_witness_row( \@wit_path, \@all_pos );
 987         push( @{$table->{'alignment'}},
 988                 { 'witness' => $wit->sigil, 'tokens' => \@row } );
 989         if( $wit->is_layered ) {
 990                 my @wit_ac_path = $self->reading_sequence( $self->start, $self->end,
 991                         $wit->sigil.$self->ac_label );
 992             my @ac_row = _make_witness_row( \@wit_ac_path, \@all_pos );
 993                         push( @{$table->{'alignment'}},
 994                                 { 'witness' => $wit->sigil.$self->ac_label, 'tokens' => \@ac_row } );
 995         }
 996     }
 997     $self->cached_table( $table );
 998     return $table;
 999 }
1000
1001 sub _make_witness_row {
1002     my( $path, $positions ) = @_;
1003     my %char_hash;
1004     map { $char_hash{$_} = undef } @$positions;
1005     my $debug = 0;
1006     foreach my $rdg ( @$path ) {
1007         my $rtext = $rdg->text;
1008         $rtext = '#LACUNA#' if $rdg->is_lacuna;
1009         print STDERR "rank " . $rdg->rank . "\n" if $debug;
1010         # print STDERR "No rank for " . $rdg->id . "\n" unless defined $rdg->rank;
1011         $char_hash{$rdg->rank} = { 't' => $rdg };
1012     }
1013     my @row = map { $char_hash{$_} } @$positions;
1014     # Fill in lacuna markers for undef spots in the row
1015     my $last_el = shift @row;
1016     my @filled_row = ( $last_el );
1017     foreach my $el ( @row ) {
1018         # If we are using node reference, make the lacuna node appear many times
1019         # in the table.  If not, use the lacuna tag.
1020         if( $last_el && $last_el->{'t'}->is_lacuna && !defined $el ) {
1021             $el = $last_el;
1022         }
1023         push( @filled_row, $el );
1024         $last_el = $el;
1025     }
1026     return @filled_row;
1027 }
1028
1029 =head1 NAVIGATION METHODS
1030
1031 =head2 reading_sequence( $first, $last, $sigil, $backup )
1032
1033 Returns the ordered list of readings, starting with $first and ending
1034 with $last, for the witness given in $sigil. If a $backup sigil is
1035 specified (e.g. when walking a layered witness), it will be used wherever
1036 no $sigil path exists.  If there is a base text reading, that will be
1037 used wherever no path exists for $sigil or $backup.
1038
1039 =cut
1040
1041 # TODO Think about returning some lazy-eval iterator.
1042 # TODO Get rid of backup; we should know from what witness is whether we need it.
1043
1044 sub reading_sequence {
1045     my( $self, $start, $end, $witness ) = @_;
1046
1047     $witness = $self->baselabel unless $witness;
1048     my @readings = ( $start );
1049     my %seen;
1050     my $n = $start;
1051     while( $n && $n->id ne $end->id ) {
1052         if( exists( $seen{$n->id} ) ) {
1053             throw( "Detected loop for $witness at " . $n->id );
1054         }
1055         $seen{$n->id} = 1;
1056
1057         my $next = $self->next_reading( $n, $witness );
1058         unless( $next ) {
1059             throw( "Did not find any path for $witness from reading " . $n->id );
1060         }
1061         push( @readings, $next );
1062         $n = $next;
1063     }
1064     # Check that the last reading is our end reading.
1065     my $last = $readings[$#readings];
1066     throw( "Last reading found from " . $start->text .
1067         " for witness $witness is not the end!" ) # TODO do we get this far?
1068         unless $last->id eq $end->id;
1069
1070     return @readings;
1071 }
1072
1073 =head2 next_reading( $reading, $sigil );
1074
1075 Returns the reading that follows the given reading along the given witness
1076 path.
1077
1078 =cut
1079
1080 sub next_reading {
1081     # Return the successor via the corresponding path.
1082     my $self = shift;
1083     my $answer = $self->_find_linked_reading( 'next', @_ );
1084         return undef unless $answer;
1085     return $self->reading( $answer );
1086 }
1087
1088 =head2 prior_reading( $reading, $sigil )
1089
1090 Returns the reading that precedes the given reading along the given witness
1091 path.
1092
1093 =cut
1094
1095 sub prior_reading {
1096     # Return the predecessor via the corresponding path.
1097     my $self = shift;
1098     my $answer = $self->_find_linked_reading( 'prior', @_ );
1099     return $self->reading( $answer );
1100 }
1101
1102 sub _find_linked_reading {
1103     my( $self, $direction, $node, $path ) = @_;
1104
1105     # Get a backup if we are dealing with a layered witness
1106     my $alt_path;
1107     my $aclabel = $self->ac_label;
1108     if( $path && $path =~ /^(.*)\Q$aclabel\E$/ ) {
1109         $alt_path = $1;
1110     }
1111
1112     my @linked_paths = $direction eq 'next'
1113         ? $self->sequence->edges_from( $node )
1114         : $self->sequence->edges_to( $node );
1115     return undef unless scalar( @linked_paths );
1116
1117     # We have to find the linked path that contains all of the
1118     # witnesses supplied in $path.
1119     my( @path_wits, @alt_path_wits );
1120     @path_wits = sort( $self->_witnesses_of_label( $path ) ) if $path;
1121     @alt_path_wits = sort( $self->_witnesses_of_label( $alt_path ) ) if $alt_path;
1122     my $base_le;
1123     my $alt_le;
1124     foreach my $le ( @linked_paths ) {
1125         if( $self->sequence->has_edge_attribute( @$le, $self->baselabel ) ) {
1126             $base_le = $le;
1127         }
1128                 my @le_wits = sort $self->path_witnesses( $le );
1129                 if( _is_within( \@path_wits, \@le_wits ) ) {
1130                         # This is the right path.
1131                         return $direction eq 'next' ? $le->[1] : $le->[0];
1132                 } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) {
1133                         $alt_le = $le;
1134                 }
1135     }
1136     # Got this far? Return the alternate path if it exists.
1137     return $direction eq 'next' ? $alt_le->[1] : $alt_le->[0]
1138         if $alt_le;
1139
1140     # Got this far? Return the base path if it exists.
1141     return $direction eq 'next' ? $base_le->[1] : $base_le->[0]
1142         if $base_le;
1143
1144     # Got this far? We have no appropriate path.
1145     warn "Could not find $direction node from " . $node->id
1146         . " along path $path";
1147     return undef;
1148 }
1149
1150 # Some set logic.
1151 sub _is_within {
1152     my( $set1, $set2 ) = @_;
1153     my $ret = @$set1; # will be 0, i.e. false, if set1 is empty
1154     foreach my $el ( @$set1 ) {
1155         $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
1156     }
1157     return $ret;
1158 }
1159
1160 # Return the string that joins together a list of witnesses for
1161 # display on a single path.
1162 sub _witnesses_of_label {
1163     my( $self, $label ) = @_;
1164     my $regex = $self->wit_list_separator;
1165     my @answer = split( /\Q$regex\E/, $label );
1166     return @answer;
1167 }
1168
1169 =head2 common_readings
1170
1171 Returns the list of common readings in the graph (i.e. those readings that are
1172 shared by all non-lacunose witnesses.)
1173
1174 =cut
1175
1176 sub common_readings {
1177         my $self = shift;
1178         my @common = grep { $_->is_common } $self->readings;
1179         return @common;
1180 }
1181
1182 =head2 path_text( $sigil, $mainsigil [, $start, $end ] )
1183
1184 Returns the text of a witness (plus its backup, if we are using a layer)
1185 as stored in the collation.  The text is returned as a string, where the
1186 individual readings are joined with spaces and the meta-readings (e.g.
1187 lacunae) are omitted.  Optional specification of $start and $end allows
1188 the generation of a subset of the witness text.
1189
1190 =cut
1191
1192 sub path_text {
1193         my( $self, $wit, $start, $end ) = @_;
1194         $start = $self->start unless $start;
1195         $end = $self->end unless $end;
1196         my @path = grep { !$_->is_meta } $self->reading_sequence( $start, $end, $wit );
1197         return join( ' ', map { $_->text } @path );
1198 }
1199
1200 =head1 INITIALIZATION METHODS
1201
1202 These are mostly for use by parsers.
1203
1204 =head2 make_witness_path( $witness )
1205
1206 Link the array of readings contained in $witness->path (and in
1207 $witness->uncorrected_path if it exists) into collation paths.
1208 Clear out the arrays when finished.
1209
1210 =head2 make_witness_paths
1211
1212 Call make_witness_path for all witnesses in the tradition.
1213
1214 =cut
1215
1216 # For use when a collation is constructed from a base text and an apparatus.
1217 # We have the sequences of readings and just need to add path edges.
1218 # When we are done, clear out the witness path attributes, as they are no
1219 # longer needed.
1220 # TODO Find a way to replace the witness path attributes with encapsulated functions?
1221
1222 sub make_witness_paths {
1223     my( $self ) = @_;
1224     foreach my $wit ( $self->tradition->witnesses ) {
1225         # print STDERR "Making path for " . $wit->sigil . "\n";
1226         $self->make_witness_path( $wit );
1227     }
1228 }
1229
1230 sub make_witness_path {
1231     my( $self, $wit ) = @_;
1232     my @chain = @{$wit->path};
1233     my $sig = $wit->sigil;
1234     foreach my $idx ( 0 .. $#chain-1 ) {
1235         $self->add_path( $chain[$idx], $chain[$idx+1], $sig );
1236     }
1237     if( $wit->is_layered ) {
1238         @chain = @{$wit->uncorrected_path};
1239         foreach my $idx( 0 .. $#chain-1 ) {
1240             my $source = $chain[$idx];
1241             my $target = $chain[$idx+1];
1242             $self->add_path( $source, $target, $sig.$self->ac_label )
1243                 unless $self->has_path( $source, $target, $sig );
1244         }
1245     }
1246     $wit->clear_path;
1247     $wit->clear_uncorrected_path;
1248 }
1249
1250 =head2 calculate_ranks
1251
1252 Calculate the reading ranks (that is, their aligned positions relative
1253 to each other) for the graph.  This can only be called on linear collations.
1254
1255 =begin testing
1256
1257 use Text::Tradition;
1258
1259 my $cxfile = 't/data/Collatex-16.xml';
1260 my $t = Text::Tradition->new(
1261     'name'  => 'inline',
1262     'input' => 'CollateX',
1263     'file'  => $cxfile,
1264     );
1265 my $c = $t->collation;
1266
1267 # Make an svg
1268 my $table = $c->alignment_table;
1269 ok( $c->has_cached_table, "Alignment table was cached" );
1270 is( $c->alignment_table, $table, "Cached table returned upon second call" );
1271 $c->calculate_ranks;
1272 is( $c->alignment_table, $table, "Cached table retained with no rank change" );
1273 $c->add_relationship( 'n9', 'n23', { 'type' => 'spelling' } );
1274 isnt( $c->alignment_table, $table, "Alignment table changed after relationship add" );
1275
1276 =end testing
1277
1278 =cut
1279
1280 sub calculate_ranks {
1281     my $self = shift;
1282     # Save the existing ranks, in case we need to invalidate the cached SVG.
1283     my %existing_ranks;
1284     # Walk a version of the graph where every node linked by a relationship
1285     # edge is fundamentally the same node, and do a topological ranking on
1286     # the nodes in this graph.
1287     my $topo_graph = Graph->new();
1288     my %rel_containers;
1289     my $rel_ctr = 0;
1290     # Add the nodes
1291     foreach my $r ( $self->readings ) {
1292         next if exists $rel_containers{$r->id};
1293         my @rels = $r->related_readings( 'colocated' );
1294         if( @rels ) {
1295             # Make a relationship container.
1296             push( @rels, $r );
1297             my $rn = 'rel_container_' . $rel_ctr++;
1298             $topo_graph->add_vertex( $rn );
1299             foreach( @rels ) {
1300                 $rel_containers{$_->id} = $rn;
1301             }
1302         } else {
1303             # Add a new node to mirror the old node.
1304             $rel_containers{$r->id} = $r->id;
1305             $topo_graph->add_vertex( $r->id );
1306         }
1307     }
1308
1309     # Add the edges.
1310     foreach my $r ( $self->readings ) {
1311                 $existing_ranks{$r} = $r->rank;
1312         foreach my $n ( $self->sequence->successors( $r->id ) ) {
1313                 my( $tfrom, $tto ) = ( $rel_containers{$r->id},
1314                         $rel_containers{$n} );
1315                 # $DB::single = 1 unless $tfrom && $tto;
1316             $topo_graph->add_edge( $tfrom, $tto );
1317         }
1318     }
1319
1320     # Now do the rankings, starting with the start node.
1321     my $topo_start = $rel_containers{$self->start->id};
1322     my $node_ranks = { $topo_start => 0 };
1323     my @curr_origin = ( $topo_start );
1324     # A little iterative function.
1325     while( @curr_origin ) {
1326         @curr_origin = _assign_rank( $topo_graph, $node_ranks, @curr_origin );
1327     }
1328     # Transfer our rankings from the topological graph to the real one.
1329     foreach my $r ( $self->readings ) {
1330         if( defined $node_ranks->{$rel_containers{$r->id}} ) {
1331             $r->rank( $node_ranks->{$rel_containers{$r->id}} );
1332         } else {
1333                 # Die. Find the last rank we calculated.
1334                 my @all_defined = sort { $node_ranks->{$rel_containers{$a->id}}
1335                                  <=> $node_ranks->{$rel_containers{$b->id}} }
1336                         $self->readings;
1337                 my $last = pop @all_defined;
1338             throw( "Ranks not calculated after $last - do you have a cycle in the graph?" );
1339         }
1340     }
1341     # Do we need to invalidate the cached data?
1342     if( $self->has_cached_svg || $self->has_cached_table ) {
1343         foreach my $r ( $self->readings ) {
1344                 next if $existing_ranks{$r} == $r->rank;
1345                 $self->_clear_cache;
1346                 last;
1347         }
1348     }
1349 }
1350
1351 sub _assign_rank {
1352     my( $graph, $node_ranks, @current_nodes ) = @_;
1353     # Look at each of the children of @current_nodes.  If all the child's
1354     # parents have a rank, assign it the highest rank + 1 and add it to
1355     # @next_nodes.  Otherwise skip it; we will return when the highest-ranked
1356     # parent gets a rank.
1357     my @next_nodes;
1358     foreach my $c ( @current_nodes ) {
1359         warn "Current reading $c has no rank!"
1360             unless exists $node_ranks->{$c};
1361         # print STDERR "Looking at child of node $c, rank "
1362         #     . $node_ranks->{$c} . "\n";
1363         foreach my $child ( $graph->successors( $c ) ) {
1364             next if exists $node_ranks->{$child};
1365             my $highest_rank = -1;
1366             my $skip = 0;
1367             foreach my $parent ( $graph->predecessors( $child ) ) {
1368                 if( exists $node_ranks->{$parent} ) {
1369                     $highest_rank = $node_ranks->{$parent}
1370                         if $highest_rank <= $node_ranks->{$parent};
1371                 } else {
1372                     $skip = 1;
1373                     last;
1374                 }
1375             }
1376             next if $skip;
1377             my $c_rank = $highest_rank + 1;
1378             # print STDERR "Assigning rank $c_rank to node $child \n";
1379             $node_ranks->{$child} = $c_rank;
1380             push( @next_nodes, $child );
1381         }
1382     }
1383     return @next_nodes;
1384 }
1385
1386 =head2 flatten_ranks
1387
1388 A convenience method for parsing collation data.  Searches the graph for readings
1389 with the same text at the same rank, and merges any that are found.
1390
1391 =cut
1392
1393 sub flatten_ranks {
1394     my $self = shift;
1395     my %unique_rank_rdg;
1396     foreach my $rdg ( $self->readings ) {
1397         next unless $rdg->has_rank;
1398         my $key = $rdg->rank . "||" . $rdg->text;
1399         if( exists $unique_rank_rdg{$key} ) {
1400             # Combine!
1401                 # print STDERR "Combining readings at same rank: $key\n";
1402             $self->merge_readings( $unique_rank_rdg{$key}, $rdg );
1403             # TODO see if this now makes a common point.
1404         } else {
1405             $unique_rank_rdg{$key} = $rdg;
1406         }
1407     }
1408 }
1409
1410 =head2 remove_collations
1411
1412 Another convenience method for parsing. Removes all 'collation' relationships
1413 that were defined in order to get the reading ranks to be correct.
1414
1415 =begin testing
1416
1417 use Text::Tradition;
1418
1419 my $cxfile = 't/data/Collatex-16.xml';
1420 my $t = Text::Tradition->new(
1421     'name'  => 'inline',
1422     'input' => 'CollateX',
1423     'file'  => $cxfile,
1424     );
1425 my $c = $t->collation;
1426
1427 isnt( $c->reading('n23')->rank, $c->reading('n9')->rank, "Rank skew exists" );
1428 $c->add_relationship( 'n23', 'n9', { 'type' => 'collated', 'scope' => 'local' } );
1429 is( scalar $c->relationships, 4, "Found all expected relationships" );
1430 $c->remove_collations;
1431 is( scalar $c->relationships, 3, "Collated relationships now gone" );
1432 is( $c->reading('n23')->rank, $c->reading('n9')->rank, "Aligned ranks were preserved" );
1433
1434 =end testing
1435
1436 =cut
1437
1438 sub remove_collations {
1439         my $self = shift;
1440         foreach my $reledge ( $self->relationships ) {
1441                 my $relobj = $self->relations->get_relationship( $reledge );
1442                 if( $relobj && $relobj->type eq 'collated' ) {
1443                         $self->relations->delete_relationship( $reledge );
1444                 }
1445         }
1446 }
1447
1448
1449 =head2 calculate_common_readings
1450
1451 Goes through the graph identifying the readings that appear in every witness
1452 (apart from those with lacunae at that spot.) Marks them as common and returns
1453 the list.
1454
1455 =begin testing
1456
1457 use Text::Tradition;
1458
1459 my $cxfile = 't/data/Collatex-16.xml';
1460 my $t = Text::Tradition->new(
1461     'name'  => 'inline',
1462     'input' => 'CollateX',
1463     'file'  => $cxfile,
1464     );
1465 my $c = $t->collation;
1466
1467 my @common = $c->calculate_common_readings();
1468 is( scalar @common, 8, "Found correct number of common readings" );
1469 my @marked = sort $c->common_readings();
1470 is( scalar @common, 8, "All common readings got marked as such" );
1471 my @expected = qw/ n1 n12 n16 n19 n20 n5 n6 n7 /;
1472 is_deeply( \@marked, \@expected, "Found correct list of common readings" );
1473
1474 =end testing
1475
1476 =cut
1477
1478 sub calculate_common_readings {
1479         my $self = shift;
1480         my @common;
1481         my $table = $self->alignment_table;
1482         foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
1483                 my @row = map { $_->{'tokens'}->[$idx]
1484                                                         ? $_->{'tokens'}->[$idx]->{'t'} : '' }
1485                                         @{$table->{'alignment'}};
1486                 my %hash;
1487                 foreach my $r ( @row ) {
1488                         if( $r ) {
1489                                 $hash{$r->id} = $r unless $r->is_meta;
1490                         } else {
1491                                 $hash{'UNDEF'} = $r;
1492                         }
1493                 }
1494                 if( keys %hash == 1 && !exists $hash{'UNDEF'} ) {
1495                         my( $r ) = values %hash;
1496                         $r->is_common( 1 );
1497                         push( @common, $r );
1498                 }
1499         }
1500         return @common;
1501 }
1502
1503 =head2 text_from_paths
1504
1505 Calculate the text array for all witnesses from the path, for later consistency
1506 checking.  Only to be used if there is no non-graph-based way to know the
1507 original texts.
1508
1509 =cut
1510
1511 sub text_from_paths {
1512         my $self = shift;
1513     foreach my $wit ( $self->tradition->witnesses ) {
1514         my @text = split( /\s+/,
1515                 $self->reading_sequence( $self->start, $self->end, $wit->sigil ) );
1516         $wit->text( \@text );
1517         if( $wit->is_layered ) {
1518                         my @uctext = split( /\s+/,
1519                                 $self->reading_sequence( $self->start, $self->end,
1520                                         $wit->sigil.$self->ac_label ) );
1521                         $wit->text( \@uctext );
1522         }
1523     }
1524 }
1525
1526 =head1 UTILITY FUNCTIONS
1527
1528 =head2 common_predecessor( $reading_a, $reading_b )
1529
1530 Find the last reading that occurs in sequence before both the given readings.
1531
1532 =head2 common_successor( $reading_a, $reading_b )
1533
1534 Find the first reading that occurs in sequence after both the given readings.
1535
1536 =begin testing
1537
1538 use Text::Tradition;
1539
1540 my $cxfile = 't/data/Collatex-16.xml';
1541 my $t = Text::Tradition->new(
1542     'name'  => 'inline',
1543     'input' => 'CollateX',
1544     'file'  => $cxfile,
1545     );
1546 my $c = $t->collation;
1547
1548 is( $c->common_predecessor( 'n9', 'n23' )->id,
1549     'n20', "Found correct common predecessor" );
1550 is( $c->common_successor( 'n9', 'n23' )->id,
1551     '#END#', "Found correct common successor" );
1552
1553 is( $c->common_predecessor( 'n19', 'n17' )->id,
1554     'n16', "Found correct common predecessor for readings on same path" );
1555 is( $c->common_successor( 'n21', 'n26' )->id,
1556     '#END#', "Found correct common successor for readings on same path" );
1557
1558 =end testing
1559
1560 =cut
1561
1562 ## Return the closest reading that is a predecessor of both the given readings.
1563 sub common_predecessor {
1564         my $self = shift;
1565         my( $r1, $r2 ) = $self->_objectify_args( @_ );
1566         return $self->_common_in_path( $r1, $r2, 'predecessors' );
1567 }
1568
1569 sub common_successor {
1570         my $self = shift;
1571         my( $r1, $r2 ) = $self->_objectify_args( @_ );
1572         return $self->_common_in_path( $r1, $r2, 'successors' );
1573 }
1574
1575 sub _common_in_path {
1576         my( $self, $r1, $r2, $dir ) = @_;
1577         my $iter = $r1->rank > $r2->rank ? $r1->rank : $r2->rank;
1578         $iter = $self->end->rank - $iter if $dir eq 'successors';
1579         my @candidates;
1580         my @last_checked = ( $r1, $r2 );
1581         my %all_seen;
1582         while( !@candidates ) {
1583                 my @new_lc;
1584                 foreach my $lc ( @last_checked ) {
1585                         foreach my $p ( $lc->$dir ) {
1586                                 if( $all_seen{$p->id} ) {
1587                                         push( @candidates, $p );
1588                                 } else {
1589                                         $all_seen{$p->id} = 1;
1590                                         push( @new_lc, $p );
1591                                 }
1592                         }
1593                 }
1594                 @last_checked = @new_lc;
1595         }
1596         my @answer = sort { $a->rank <=> $b->rank } @candidates;
1597         return $dir eq 'predecessors' ? pop( @answer ) : shift ( @answer );
1598 }
1599
1600 sub throw {
1601         Text::Tradition::Error->throw(
1602                 'ident' => 'Collation error',
1603                 'message' => $_[0],
1604                 );
1605 }
1606
1607 no Moose;
1608 __PACKAGE__->meta->make_immutable;
1609
1610 =head1 LICENSE
1611
1612 This package is free software and is provided "as is" without express
1613 or implied warranty.  You can redistribute it and/or modify it under
1614 the same terms as Perl itself.
1615
1616 =head1 AUTHOR
1617
1618 Tara L Andrews E<lt>aurum@cpan.orgE<gt>