lib/Text/Tradition/Collation.pm

   1 package Text::Tradition::Collation;
   2
   3 use Encode qw( decode_utf8 );
   4 use File::Temp;
   5 use File::Which;
   6 use Graph;
   7 use IPC::Run qw( run binary );
   8 use Text::CSV;
   9 use Text::Tradition::Collation::Reading;
  10 use Text::Tradition::Collation::RelationshipStore;
  11 use Text::Tradition::Error;
  12 use XML::Easy::Syntax qw( $xml10_namestartchar_rx $xml10_namechar_rx );
  13 use XML::LibXML;
  14 use XML::LibXML::XPathContext;
  15 use Moose;
  16
  17 has 'sequence' => (
  18     is => 'ro',
  19     isa => 'Graph',
  20     default => sub { Graph->new() },
  21     handles => {
  22         paths => 'edges',
  23     },
  24     );
  25
  26 has 'relations' => (
  27         is => 'ro',
  28         isa => 'Text::Tradition::Collation::RelationshipStore',
  29         handles => {
  30                 relationships => 'relationships',
  31                 related_readings => 'related_readings',
  32                 get_relationship => 'get_relationship',
  33                 del_relationship => 'del_relationship',
  34                 equivalence => 'equivalence',
  35                 equivalence_graph => 'equivalence_graph',
  36         },
  37         writer => '_set_relations',
  38         );
  39
  40 has 'tradition' => (
  41     is => 'ro',
  42     isa => 'Text::Tradition',
  43     writer => '_set_tradition',
  44     weak_ref => 1,
  45     );
  46
  47 has 'readings' => (
  48         isa => 'HashRef[Text::Tradition::Collation::Reading]',
  49         traits => ['Hash'],
  50     handles => {
  51         reading     => 'get',
  52         _add_reading => 'set',
  53         del_reading => 'delete',
  54         has_reading => 'exists',
  55         readings   => 'values',
  56     },
  57     default => sub { {} },
  58         );
  59
  60 has 'wit_list_separator' => (
  61     is => 'rw',
  62     isa => 'Str',
  63     default => ', ',
  64     );
  65
  66 has 'baselabel' => (
  67     is => 'rw',
  68     isa => 'Str',
  69     default => 'base text',
  70     );
  71
  72 has 'linear' => (
  73     is => 'rw',
  74     isa => 'Bool',
  75     default => 1,
  76     );
  77
  78 has 'ac_label' => (
  79     is => 'rw',
  80     isa => 'Str',
  81     default => ' (a.c.)',
  82     );
  83
  84 has 'wordsep' => (
  85         is => 'rw',
  86         isa => 'Str',
  87         default => ' ',
  88         );
  89
  90 has 'start' => (
  91         is => 'ro',
  92         isa => 'Text::Tradition::Collation::Reading',
  93         writer => '_set_start',
  94         weak_ref => 1,
  95         );
  96
  97 has 'end' => (
  98         is => 'ro',
  99         isa => 'Text::Tradition::Collation::Reading',
 100         writer => '_set_end',
 101         weak_ref => 1,
 102         );
 103
 104 has 'cached_svg' => (
 105         is => 'rw',
 106         isa => 'Str',
 107         predicate => 'has_cached_svg',
 108         clearer => 'wipe_svg',
 109         );
 110
 111 has 'cached_table' => (
 112         is => 'rw',
 113         isa => 'HashRef',
 114         predicate => 'has_cached_table',
 115         clearer => 'wipe_table',
 116         );
 117
 118 has '_graphcalc_done' => (
 119         is => 'rw',
 120         isa => 'Bool',
 121         default => undef,
 122         );
 123
 124 =head1 NAME
 125
 126 Text::Tradition::Collation - a software model for a text collation
 127
 128 =head1 SYNOPSIS
 129
 130   use Text::Tradition;
 131   my $t = Text::Tradition->new(
 132     'name' => 'this is a text',
 133     'input' => 'TEI',
 134     'file' => '/path/to/tei_parallel_seg_file.xml' );
 135
 136   my $c = $t->collation;
 137   my @readings = $c->readings;
 138   my @paths = $c->paths;
 139   my @relationships = $c->relationships;
 140
 141   my $svg_variant_graph = $t->collation->as_svg();
 142
 143 =head1 DESCRIPTION
 144
 145 Text::Tradition is a library for representation and analysis of collated
 146 texts, particularly medieval ones.  The Collation is the central feature of
 147 a Tradition, where the text, its sequence of readings, and its relationships
 148 between readings are actually kept.
 149
 150 =head1 CONSTRUCTOR
 151
 152 =head2 new
 153
 154 The constructor.  Takes a hash or hashref of the following arguments:
 155
 156 =over
 157
 158 =item * tradition - The Text::Tradition object to which the collation
 159 belongs. Required.
 160
 161 =item * linear - Whether the collation should be linear; that is, whether
 162 transposed readings should be treated as two linked readings rather than one,
 163 and therefore whether the collation graph is acyclic.  Defaults to true.
 164
 165 =item * baselabel - The default label for the path taken by a base text
 166 (if any). Defaults to 'base text'.
 167
 168 =item * wit_list_separator - The string to join a list of witnesses for
 169 purposes of making labels in display graphs.  Defaults to ', '.
 170
 171 =item * ac_label - The extra label to tack onto a witness sigil when
 172 representing another layer of path for the given witness - that is, when
 173 a text has more than one possible reading due to scribal corrections or
 174 the like.  Defaults to ' (a.c.)'.
 175
 176 =item * wordsep - The string used to separate words in the original text.
 177 Defaults to ' '.
 178
 179 =back
 180
 181 =head1 ACCESSORS
 182
 183 =head2 tradition
 184
 185 =head2 linear
 186
 187 =head2 wit_list_separator
 188
 189 =head2 baselabel
 190
 191 =head2 ac_label
 192
 193 =head2 wordsep
 194
 195 Simple accessors for collation attributes.
 196
 197 =head2 start
 198
 199 The meta-reading at the start of every witness path.
 200
 201 =head2 end
 202
 203 The meta-reading at the end of every witness path.
 204
 205 =head2 readings
 206
 207 Returns all Reading objects in the graph.
 208
 209 =head2 reading( $id )
 210
 211 Returns the Reading object corresponding to the given ID.
 212
 213 =head2 add_reading( $reading_args )
 214
 215 Adds a new reading object to the collation.
 216 See L<Text::Tradition::Collation::Reading> for the available arguments.
 217
 218 =head2 del_reading( $object_or_id )
 219
 220 Removes the given reading from the collation, implicitly removing its
 221 paths and relationships.
 222
 223 =head2 merge_readings( $main, $second, $concatenate, $with_str )
 224
 225 Merges the $second reading into the $main one. If $concatenate is true, then
 226 the merged node will carry the text of both readings, concatenated with either
 227 $with_str (if specified) or a sensible default (the empty string if the
 228 appropriate 'join_*' flag is set on either reading, or else $self->wordsep.)
 229
 230 The first two arguments may be either readings or reading IDs.
 231
 232 =head2 has_reading( $id )
 233
 234 Predicate to see whether a given reading ID is in the graph.
 235
 236 =head2 reading_witnesses( $object_or_id )
 237
 238 Returns a list of sigils whose witnesses contain the reading.
 239
 240 =head2 paths
 241
 242 Returns all reading paths within the document - that is, all edges in the
 243 collation graph.  Each path is an arrayref of [ $source, $target ] reading IDs.
 244
 245 =head2 add_path( $source, $target, $sigil )
 246
 247 Links the given readings in the collation in sequence, under the given witness
 248 sigil.  The readings may be specified by object or ID.
 249
 250 =head2 del_path( $source, $target, $sigil )
 251
 252 Links the given readings in the collation in sequence, under the given witness
 253 sigil.  The readings may be specified by object or ID.
 254
 255 =head2 has_path( $source, $target );
 256
 257 Returns true if the two readings are linked in sequence in any witness.
 258 The readings may be specified by object or ID.
 259
 260 =head2 relationships
 261
 262 Returns all Relationship objects in the collation.
 263
 264 =head2 add_relationship( $reading, $other_reading, $options )
 265
 266 Adds a new relationship of the type given in $options between the two readings,
 267 which may be specified by object or ID.  Returns a value of ( $status, @vectors)
 268 where $status is true on success, and @vectors is a list of relationship edges
 269 that were ultimately added.
 270 See L<Text::Tradition::Collation::Relationship> for the available options.
 271
 272 =cut
 273
 274 sub BUILD {
 275     my $self = shift;
 276     $self->_set_relations( Text::Tradition::Collation::RelationshipStore->new( 'collation' => $self ) );
 277     $self->_set_start( $self->add_reading(
 278         { 'collation' => $self, 'is_start' => 1, 'init' => 1 } ) );
 279     $self->_set_end( $self->add_reading(
 280         { 'collation' => $self, 'is_end' => 1, 'init' => 1 } ) );
 281 }
 282
 283 ### Reading construct/destruct functions
 284
 285 sub add_reading {
 286         my( $self, $reading ) = @_;
 287         unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
 288                 my %args = %$reading;
 289                 if( $args{'init'} ) {
 290                         # If we are initializing an empty collation, don't assume that we
 291                         # have set a tradition.
 292                         delete $args{'init'};
 293                 } elsif( $self->tradition->has_language && !exists $args{'language'} ) {
 294                         $args{'language'} = $self->tradition->language;
 295                 }
 296                 $reading = Text::Tradition::Collation::Reading->new(
 297                         'collation' => $self,
 298                         %args );
 299         }
 300         # First check to see if a reading with this ID exists.
 301         if( $self->reading( $reading->id ) ) {
 302                 throw( "Collation already has a reading with id " . $reading->id );
 303         }
 304         $self->_graphcalc_done(0);
 305         $self->_add_reading( $reading->id => $reading );
 306         # Once the reading has been added, put it in both graphs.
 307         $self->sequence->add_vertex( $reading->id );
 308         $self->relations->add_reading( $reading->id );
 309         return $reading;
 310 };
 311
 312 around del_reading => sub {
 313         my $orig = shift;
 314         my $self = shift;
 315         my $arg = shift;
 316
 317         if( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
 318                 $arg = $arg->id;
 319         }
 320         # Remove the reading from the graphs.
 321         $self->_graphcalc_done(0);
 322         $self->_clear_cache; # Explicitly clear caches to GC the reading
 323         $self->sequence->delete_vertex( $arg );
 324         $self->relations->delete_reading( $arg );
 325
 326         # Carry on.
 327         $self->$orig( $arg );
 328 };
 329
 330 =begin testing
 331
 332 use Text::Tradition;
 333
 334 my $cxfile = 't/data/Collatex-16.xml';
 335 my $t = Text::Tradition->new(
 336     'name'  => 'inline',
 337     'input' => 'CollateX',
 338     'file'  => $cxfile,
 339     );
 340 my $c = $t->collation;
 341
 342 my $rno = scalar $c->readings;
 343 # Split n21 for testing purposes
 344 my $new_r = $c->add_reading( { 'id' => 'n21p0', 'text' => 'un', 'join_next' => 1 } );
 345 my $old_r = $c->reading( 'n21' );
 346 $old_r->alter_text( 'to' );
 347 $c->del_path( 'n20', 'n21', 'A' );
 348 $c->add_path( 'n20', 'n21p0', 'A' );
 349 $c->add_path( 'n21p0', 'n21', 'A' );
 350 $c->flatten_ranks();
 351 ok( $c->reading( 'n21p0' ), "New reading exists" );
 352 is( scalar $c->readings, $rno, "Reading add offset by flatten_ranks" );
 353
 354 # Combine n3 and n4 ( with his )
 355 $c->merge_readings( 'n3', 'n4', 1 );
 356 ok( !$c->reading('n4'), "Reading n4 is gone" );
 357 is( $c->reading('n3')->text, 'with his', "Reading n3 has both words" );
 358
 359 # Collapse n9 and n10 ( rood / root )
 360 $c->merge_readings( 'n9', 'n10' );
 361 ok( !$c->reading('n10'), "Reading n10 is gone" );
 362 is( $c->reading('n9')->text, 'rood', "Reading n9 has an unchanged word" );
 363
 364 # Combine n21 and n21p0
 365 my $remaining = $c->reading('n21');
 366 $remaining ||= $c->reading('n22');  # one of these should still exist
 367 $c->merge_readings( 'n21p0', $remaining, 1 );
 368 ok( !$c->reading('n21'), "Reading $remaining is gone" );
 369 is( $c->reading('n21p0')->text, 'unto', "Reading n21p0 merged correctly" );
 370
 371 =end testing
 372
 373 =cut
 374
 375 sub merge_readings {
 376         my $self = shift;
 377
 378         # Sanity check
 379         my( $kept_obj, $del_obj, $combine, $combine_char ) = $self->_objectify_args( @_ );
 380         my $mergemeta = $kept_obj->is_meta;
 381         throw( "Cannot merge meta and non-meta reading" )
 382                 unless ( $mergemeta && $del_obj->is_meta )
 383                         || ( !$mergemeta && !$del_obj->is_meta );
 384         if( $mergemeta ) {
 385                 throw( "Cannot merge with start or end node" )
 386                         if( $kept_obj eq $self->start || $kept_obj eq $self->end
 387                                 || $del_obj eq $self->start || $del_obj eq $self->end );
 388         }
 389         # We only need the IDs for adding paths to the graph, not the reading
 390         # objects themselves.
 391         my $kept = $kept_obj->id;
 392         my $deleted = $del_obj->id;
 393         $self->_graphcalc_done(0);
 394
 395     # The kept reading should inherit the paths and the relationships
 396     # of the deleted reading.
 397         foreach my $path ( $self->sequence->edges_at( $deleted ) ) {
 398                 my @vector = ( $kept );
 399                 push( @vector, $path->[1] ) if $path->[0] eq $deleted;
 400                 unshift( @vector, $path->[0] ) if $path->[1] eq $deleted;
 401                 next if $vector[0] eq $vector[1]; # Don't add a self loop
 402                 my %wits = %{$self->sequence->get_edge_attributes( @$path )};
 403                 $self->sequence->add_edge( @vector );
 404                 my $fwits = $self->sequence->get_edge_attributes( @vector );
 405                 @wits{keys %$fwits} = values %$fwits;
 406                 $self->sequence->set_edge_attributes( @vector, \%wits );
 407         }
 408         $self->relations->merge_readings( $kept, $deleted, $combine );
 409
 410         # Do the deletion deed.
 411         if( $combine ) {
 412                 # Combine the text of the readings
 413                 my $joinstr = $combine_char;
 414                 unless( defined $joinstr ) {
 415                         $joinstr = '' if $kept_obj->join_next || $del_obj->join_prior;
 416                         $joinstr = $self->wordsep unless defined $joinstr;
 417                 }
 418                 $kept_obj->alter_text( join( $joinstr, $kept_obj->text, $del_obj->text ) );
 419                 $kept_obj->normal_form(
 420                         join( $joinstr, $kept_obj->normal_form, $del_obj->normal_form ) );
 421                 # Combine the lexemes present in the readings
 422                 if( $kept_obj->has_lexemes && $del_obj->has_lexemes ) {
 423                         $kept_obj->add_lexeme( $del_obj->lexemes );
 424                 }
 425         }
 426         $self->del_reading( $deleted );
 427 }
 428
 429
 430 # Helper function for manipulating the graph.
 431 sub _stringify_args {
 432         my( $self, $first, $second, @args ) = @_;
 433     $first = $first->id
 434         if ref( $first ) eq 'Text::Tradition::Collation::Reading';
 435     $second = $second->id
 436         if ref( $second ) eq 'Text::Tradition::Collation::Reading';
 437     return( $first, $second, @args );
 438 }
 439
 440 # Helper function for manipulating the graph.
 441 sub _objectify_args {
 442         my( $self, $first, $second, $arg ) = @_;
 443     $first = $self->reading( $first )
 444         unless ref( $first ) eq 'Text::Tradition::Collation::Reading';
 445     $second = $self->reading( $second )
 446         unless ref( $second ) eq 'Text::Tradition::Collation::Reading';
 447     return( $first, $second, $arg );
 448 }
 449 ### Path logic
 450
 451 sub add_path {
 452         my $self = shift;
 453
 454         # We only need the IDs for adding paths to the graph, not the reading
 455         # objects themselves.
 456     my( $source, $target, $wit ) = $self->_stringify_args( @_ );
 457
 458         $self->_graphcalc_done(0);
 459         # Connect the readings
 460         unless( $self->sequence->has_edge( $source, $target ) ) {
 461             $self->sequence->add_edge( $source, $target );
 462             $self->relations->add_equivalence_edge( $source, $target );
 463         }
 464     # Note the witness in question
 465     $self->sequence->set_edge_attribute( $source, $target, $wit, 1 );
 466 }
 467
 468 sub del_path {
 469         my $self = shift;
 470         my @args;
 471         if( ref( $_[0] ) eq 'ARRAY' ) {
 472                 my $e = shift @_;
 473                 @args = ( @$e, @_ );
 474         } else {
 475                 @args = @_;
 476         }
 477
 478         # We only need the IDs for adding paths to the graph, not the reading
 479         # objects themselves.
 480     my( $source, $target, $wit ) = $self->_stringify_args( @args );
 481
 482         $self->_graphcalc_done(0);
 483         if( $self->sequence->has_edge_attribute( $source, $target, $wit ) ) {
 484                 $self->sequence->delete_edge_attribute( $source, $target, $wit );
 485         }
 486         unless( keys %{$self->sequence->get_edge_attributes( $source, $target )} ) {
 487                 $self->sequence->delete_edge( $source, $target );
 488                 $self->relations->delete_equivalence_edge( $source, $target );
 489         }
 490 }
 491
 492
 493 # Extra graph-alike utility
 494 sub has_path {
 495         my $self = shift;
 496     my( $source, $target, $wit ) = $self->_stringify_args( @_ );
 497         return undef unless $self->sequence->has_edge( $source, $target );
 498         return $self->sequence->has_edge_attribute( $source, $target, $wit );
 499 }
 500
 501 =head2 clear_witness( @sigil_list )
 502
 503 Clear the given witnesses out of the collation entirely, removing references
 504 to them in paths, and removing readings that belong only to them.  Should only
 505 be called via $tradition->del_witness.
 506
 507 =cut
 508
 509 sub clear_witness {
 510         my( $self, @sigils ) = @_;
 511
 512         $self->_graphcalc_done(0);
 513         # Clear the witness(es) out of the paths
 514         foreach my $e ( $self->paths ) {
 515                 foreach my $sig ( @sigils ) {
 516                         $self->del_path( $e, $sig );
 517                 }
 518         }
 519
 520         # Clear out the newly unused readings
 521         foreach my $r ( $self->readings ) {
 522                 unless( $self->reading_witnesses( $r ) ) {
 523                         $self->del_reading( $r );
 524                 }
 525         }
 526 }
 527
 528 sub add_relationship {
 529         my $self = shift;
 530     my( $source, $target, $opts ) = $self->_stringify_args( @_ );
 531     my( @vectors ) = $self->relations->add_relationship( $source, $target, $opts );
 532         $self->_graphcalc_done(0);
 533     return @vectors;
 534 }
 535
 536 around qw/ get_relationship del_relationship / => sub {
 537         my $orig = shift;
 538         my $self = shift;
 539         my @args = @_;
 540         if( @args == 1 && ref( $args[0] ) eq 'ARRAY' ) {
 541                 @args = @{$_[0]};
 542         }
 543         my( $source, $target ) = $self->_stringify_args( @args );
 544         $self->$orig( $source, $target );
 545 };
 546
 547 =head2 reading_witnesses( $reading )
 548
 549 Return a list of sigils corresponding to the witnesses in which the reading appears.
 550
 551 =cut
 552
 553 sub reading_witnesses {
 554         my( $self, $reading ) = @_;
 555         # We need only check either the incoming or the outgoing edges; I have
 556         # arbitrarily chosen "incoming".  Thus, special-case the start node.
 557         if( $reading eq $self->start ) {
 558                 return map { $_->sigil } $self->tradition->witnesses;
 559         }
 560         my %all_witnesses;
 561         foreach my $e ( $self->sequence->edges_to( $reading ) ) {
 562                 my $wits = $self->sequence->get_edge_attributes( @$e );
 563                 @all_witnesses{ keys %$wits } = 1;
 564         }
 565         my $acstr = $self->ac_label;
 566         foreach my $acwit ( grep { $_ =~ s/^(.*)\Q$acstr\E$/$1/ } keys %all_witnesses ) {
 567                 delete $all_witnesses{$acwit.$acstr} if exists $all_witnesses{$acwit};
 568         }
 569         return keys %all_witnesses;
 570 }
 571
 572 =head1 OUTPUT METHODS
 573
 574 =head2 as_svg( \%options )
 575
 576 Returns an SVG string that represents the graph, via as_dot and graphviz.
 577 See as_dot for a list of options.  Must have GraphViz (dot) installed to run.
 578
 579 =cut
 580
 581 sub as_svg {
 582     my( $self, $opts ) = @_;
 583     throw( "Need GraphViz installed to output SVG" )
 584         unless File::Which::which( 'dot' );
 585     my $want_subgraph = exists $opts->{'from'} || exists $opts->{'to'};
 586     $self->calculate_ranks()
 587         unless( $self->_graphcalc_done || $opts->{'nocalc'} || !$self->linear );
 588     if( !$self->has_cached_svg || $opts->{'recalc'}     || $want_subgraph ) {
 589                 my @cmd = qw/dot -Tsvg/;
 590                 my( $svg, $err );
 591                 my $dotfile = File::Temp->new();
 592                 ## USE FOR DEBUGGING
 593                 # $dotfile->unlink_on_destroy(0);
 594                 binmode $dotfile, ':utf8';
 595                 print $dotfile $self->as_dot( $opts );
 596                 push( @cmd, $dotfile->filename );
 597                 run( \@cmd, ">", binary(), \$svg );
 598                 $svg = decode_utf8( $svg );
 599                 $self->cached_svg( $svg ) unless $want_subgraph;
 600                 return $svg;
 601         } else {
 602                 return $self->cached_svg;
 603         }
 604 }
 605
 606
 607 =head2 as_dot( \%options )
 608
 609 Returns a string that is the collation graph expressed in dot
 610 (i.e. GraphViz) format.  Options include:
 611
 612 =over 4
 613
 614 =item * from
 615
 616 =item * to
 617
 618 =item * color_common
 619
 620 =back
 621
 622 =cut
 623
 624 sub as_dot {
 625     my( $self, $opts ) = @_;
 626     my $startrank = $opts->{'from'} if $opts;
 627     my $endrank = $opts->{'to'} if $opts;
 628     my $color_common = $opts->{'color_common'} if $opts;
 629     my $STRAIGHTENHACK = !$startrank && !$endrank && $self->end->rank
 630        && $self->end->rank > 100;
 631     $STRAIGHTENHACK = 1 if $opts->{'straight'}; # even for subgraphs or small graphs
 632
 633     # Check the arguments
 634     if( $startrank ) {
 635         return if $endrank && $startrank > $endrank;
 636         return if $startrank > $self->end->rank;
 637         }
 638         if( defined $endrank ) {
 639                 return if $endrank < 0;
 640                 $endrank = undef if $endrank == $self->end->rank;
 641         }
 642
 643     my $graph_name = $self->tradition->name;
 644     $graph_name =~ s/[^\w\s]//g;
 645     $graph_name = join( '_', split( /\s+/, $graph_name ) );
 646
 647     my %graph_attrs = (
 648         'rankdir' => 'LR',
 649         'bgcolor' => 'none',
 650         );
 651     my %node_attrs = (
 652         'fontsize' => 14,
 653         'fillcolor' => 'white',
 654         'style' => 'filled',
 655         'shape' => 'ellipse'
 656         );
 657     my %edge_attrs = (
 658         'arrowhead' => 'open',
 659         'color' => '#000000',
 660         'fontcolor' => '#000000',
 661         );
 662
 663     my $dot = sprintf( "digraph %s {\n", $graph_name );
 664     $dot .= "\tgraph " . _dot_attr_string( \%graph_attrs ) . ";\n";
 665     $dot .= "\tnode " . _dot_attr_string( \%node_attrs ) . ";\n";
 666
 667         # Output substitute start/end readings if necessary
 668         if( $startrank ) {
 669                 $dot .= "\t\"__SUBSTART__\" [ label=\"...\",id=\"__START__\" ];\n";
 670         }
 671         if( $endrank ) {
 672                 $dot .= "\t\"__SUBEND__\" [ label=\"...\",id=\"__END__\" ];\n";
 673         }
 674         if( $STRAIGHTENHACK ) {
 675                 ## HACK part 1
 676                 my $startlabel = $startrank ? '__SUBSTART__' : '__START__';
 677                 $dot .= "\tsubgraph { rank=same \"$startlabel\" \"#SILENT#\" }\n";
 678                 $dot .= "\t\"#SILENT#\" [ shape=diamond,color=white,penwidth=0,label=\"\" ];"
 679         }
 680         my %used;  # Keep track of the readings that actually appear in the graph
 681         # Sort the readings by rank if we have ranks; this speeds layout.
 682         my @all_readings = $self->end->has_rank
 683                 ? sort { $a->rank <=> $b->rank } $self->readings
 684                 : $self->readings;
 685         # TODO Refrain from outputting lacuna nodes - just grey out the edges.
 686     foreach my $reading ( @all_readings ) {
 687         # Only output readings within our rank range.
 688         next if $startrank && $reading->rank < $startrank;
 689         next if $endrank && $reading->rank > $endrank;
 690         $used{$reading->id} = 1;
 691         # Need not output nodes without separate labels
 692         next if $reading->id eq $reading->text;
 693         my $rattrs;
 694         my $label = $reading->text;
 695         $label .= '-' if $reading->join_next;
 696         $label = "-$label" if $reading->join_prior;
 697         $label =~ s/\"/\\\"/g;
 698                 $rattrs->{'label'} = $label;
 699                 $rattrs->{'id'} = $reading->id;
 700                 $rattrs->{'fillcolor'} = '#b3f36d' if $reading->is_common && $color_common;
 701         $dot .= sprintf( "\t\"%s\" %s;\n", $reading->id, _dot_attr_string( $rattrs ) );
 702     }
 703
 704         # Add the real edges. Need to weight one edge per rank jump, in a
 705         # continuous line.
 706         # my $weighted = $self->_add_edge_weights;
 707     my @edges = $self->paths;
 708         my( %substart, %subend );
 709     foreach my $edge ( @edges ) {
 710         # Do we need to output this edge?
 711         if( $used{$edge->[0]} && $used{$edge->[1]} ) {
 712                 my $label = $self->_path_display_label( $self->path_witnesses( $edge ) );
 713                         my $variables = { %edge_attrs, 'label' => $label };
 714
 715                         # Account for the rank gap if necessary
 716                         my $rank0 = $self->reading( $edge->[0] )->rank
 717                                 if $self->reading( $edge->[0] )->has_rank;
 718                         my $rank1 = $self->reading( $edge->[1] )->rank
 719                                 if $self->reading( $edge->[1] )->has_rank;
 720                         if( defined $rank0 && defined $rank1 && $rank1 - $rank0 > 1 ) {
 721                                 $variables->{'minlen'} = $rank1 - $rank0;
 722                         }
 723
 724                         # Add the calculated edge weights
 725                         # if( exists $weighted->{$edge->[0]}
 726                         #       && $weighted->{$edge->[0]} eq $edge->[1] ) {
 727                         #       # $variables->{'color'} = 'red';
 728                         #       $variables->{'weight'} = 3.0;
 729                         # }
 730
 731                         # EXPERIMENTAL: make edge width reflect no. of witnesses
 732                         my $extrawidth = scalar( $self->path_witnesses( $edge ) ) * 0.2;
 733                         $variables->{'penwidth'} = $extrawidth + 0.8; # gives 1 for a single wit
 734
 735                         my $varopts = _dot_attr_string( $variables );
 736                         $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n",
 737                                 $edge->[0], $edge->[1], $varopts );
 738         } elsif( $used{$edge->[0]} ) {
 739                 $subend{$edge->[0]} = $edge->[1];
 740         } elsif( $used{$edge->[1]} ) {
 741                 $substart{$edge->[1]} = $edge->[0];
 742         }
 743     }
 744     # Add substitute start and end edges if necessary
 745     foreach my $node ( keys %substart ) {
 746         my $witstr = $self->_path_display_label ( $self->path_witnesses( $substart{$node}, $node ) );
 747         my $variables = { %edge_attrs, 'label' => $witstr };
 748         my $nrdg = $self->reading( $node );
 749         if( $nrdg->has_rank && $nrdg->rank > $startrank ) {
 750                 # Substart is actually one lower than $startrank
 751                 $variables->{'minlen'} = $nrdg->rank - ( $startrank - 1 );
 752         }
 753         my $varopts = _dot_attr_string( $variables );
 754         $dot .= "\t\"__SUBSTART__\" -> \"$node\" $varopts;\n";
 755         }
 756     foreach my $node ( keys %subend ) {
 757         my $witstr = $self->_path_display_label ( $self->path_witnesses( $node, $subend{$node} ) );
 758         my $variables = { %edge_attrs, 'label' => $witstr };
 759         my $varopts = _dot_attr_string( $variables );
 760         $dot .= "\t\"$node\" -> \"__SUBEND__\" $varopts;\n";
 761         }
 762         # HACK part 2
 763         if( $STRAIGHTENHACK ) {
 764                 my $endlabel = $endrank ? '__SUBEND__' : '__END__';
 765                 $dot .= "\t\"$endlabel\" -> \"#SILENT#\" [ color=white,penwidth=0 ];\n";
 766         }
 767
 768     $dot .= "}\n";
 769     return $dot;
 770 }
 771
 772 sub _dot_attr_string {
 773         my( $hash ) = @_;
 774         my @attrs;
 775         foreach my $k ( sort keys %$hash ) {
 776                 my $v = $hash->{$k};
 777                 push( @attrs, $k.'="'.$v.'"' );
 778         }
 779         return( '[ ' . join( ', ', @attrs ) . ' ]' );
 780 }
 781
 782 sub _add_edge_weights {
 783         my $self = shift;
 784         # Walk the graph from START to END, choosing the successor node with
 785         # the largest number of witness paths each time.
 786         my $weighted = {};
 787         my $curr = $self->start->id;
 788         my $ranked = $self->end->has_rank;
 789         while( $curr ne $self->end->id ) {
 790                 my $rank = $ranked ? $self->reading( $curr )->rank : 0;
 791                 my @succ = sort { $self->path_witnesses( $curr, $a )
 792                                                         <=> $self->path_witnesses( $curr, $b ) }
 793                         $self->sequence->successors( $curr );
 794                 my $next = pop @succ;
 795                 my $nextrank = $ranked ? $self->reading( $next )->rank : 0;
 796                 # Try to avoid lacunae in the weighted path.
 797                 while( @succ &&
 798                            ( $self->reading( $next )->is_lacuna ||
 799                                  $nextrank - $rank > 1 ) ){
 800                         $next = pop @succ;
 801                 }
 802                 $weighted->{$curr} = $next;
 803                 $curr = $next;
 804         }
 805         return $weighted;
 806 }
 807
 808 =head2 path_witnesses( $edge )
 809
 810 Returns the list of sigils whose witnesses are associated with the given edge.
 811 The edge can be passed as either an array or an arrayref of ( $source, $target ).
 812
 813 =cut
 814
 815 sub path_witnesses {
 816         my( $self, @edge ) = @_;
 817         # If edge is an arrayref, cope.
 818         if( @edge == 1 && ref( $edge[0] ) eq 'ARRAY' ) {
 819                 my $e = shift @edge;
 820                 @edge = @$e;
 821         }
 822         my @wits = keys %{$self->sequence->get_edge_attributes( @edge )};
 823         return @wits;
 824 }
 825
 826 # Helper function. Make a display label for the given witnesses, showing a.c.
 827 # witnesses only where the main witness is not also in the list.
 828 sub _path_display_label {
 829         my $self = shift;
 830         my %wits;
 831         map { $wits{$_} = 1 } @_;
 832
 833         # If an a.c. wit is listed, remove it if the main wit is also listed.
 834         # Otherwise keep it for explicit listing.
 835         my $aclabel = $self->ac_label;
 836         my @disp_ac;
 837         foreach my $w ( sort keys %wits ) {
 838                 if( $w =~ /^(.*)\Q$aclabel\E$/ ) {
 839                         if( exists $wits{$1} ) {
 840                                 delete $wits{$w};
 841                         } else {
 842                                 push( @disp_ac, $w );
 843                         }
 844                 }
 845         }
 846
 847         # See if we are in a majority situation.
 848         my $maj = scalar( $self->tradition->witnesses ) * 0.6;
 849         $maj = $maj > 5 ? $maj : 5;
 850         if( scalar keys %wits > $maj ) {
 851                 unshift( @disp_ac, 'majority' );
 852                 return join( ', ', @disp_ac );
 853         } else {
 854                 return join( ', ', sort keys %wits );
 855         }
 856 }
 857
 858 =head2 readings_at_rank( $rank )
 859
 860 Returns a list of readings at a given rank, taken from the alignment table.
 861
 862 =cut
 863
 864 sub readings_at_rank {
 865         my( $self, $rank ) = @_;
 866         my $table = $self->alignment_table;
 867         # Table rank is real rank - 1.
 868         my @elements = map { $_->{'tokens'}->[$rank-1] } @{$table->{'alignment'}};
 869         my %readings;
 870         foreach my $e ( @elements ) {
 871                 next unless ref( $e ) eq 'HASH';
 872                 next unless exists $e->{'t'};
 873                 $readings{$e->{'t'}->id} = $e->{'t'};
 874         }
 875         return values %readings;
 876 }
 877
 878 =head2 as_graphml
 879
 880 Returns a GraphML representation of the collation.  The GraphML will contain
 881 two graphs. The first expresses the attributes of the readings and the witness
 882 paths that link them; the second expresses the relationships that link the
 883 readings.  This is the native transfer format for a tradition.
 884
 885 =begin testing
 886
 887 use Text::Tradition;
 888
 889 my $READINGS = 311;
 890 my $PATHS = 361;
 891
 892 my $datafile = 't/data/florilegium_tei_ps.xml';
 893 my $tradition = Text::Tradition->new( 'input' => 'TEI',
 894                                       'name' => 'test0',
 895                                       'file' => $datafile,
 896                                       'linear' => 1 );
 897
 898 ok( $tradition, "Got a tradition object" );
 899 is( scalar $tradition->witnesses, 13, "Found all witnesses" );
 900 ok( $tradition->collation, "Tradition has a collation" );
 901
 902 my $c = $tradition->collation;
 903 is( scalar $c->readings, $READINGS, "Collation has all readings" );
 904 is( scalar $c->paths, $PATHS, "Collation has all paths" );
 905 is( scalar $c->relationships, 0, "Collation has all relationships" );
 906
 907 # Add a few relationships
 908 $c->add_relationship( 'w123', 'w125', { 'type' => 'collated' } );
 909 $c->add_relationship( 'w193', 'w196', { 'type' => 'collated' } );
 910 $c->add_relationship( 'w257', 'w262', { 'type' => 'transposition' } );
 911
 912 # Now write it to GraphML and parse it again.
 913
 914 my $graphml = $c->as_graphml;
 915 my $st = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml );
 916 is( scalar $st->collation->readings, $READINGS, "Reparsed collation has all readings" );
 917 is( scalar $st->collation->paths, $PATHS, "Reparsed collation has all paths" );
 918 is( scalar $st->collation->relationships, 3, "Reparsed collation has new relationships" );
 919
 920 # Now add a stemma, write to GraphML, and parse again.
 921 my $stemma = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
 922 is( ref( $stemma ), 'Text::Tradition::Stemma', "Parsed dotfile into stemma" );
 923 is( $tradition->stemmata, 1, "Tradition now has the stemma" );
 924 $graphml = $c->as_graphml;
 925 like( $graphml, qr/digraph/, "Digraph declaration exists in GraphML" );
 926
 927 =end testing
 928
 929 =cut
 930
 931 sub as_graphml {
 932     my( $self, $options ) = @_;
 933         $self->calculate_ranks unless $self->_graphcalc_done;
 934
 935         my $start = $options->{'from'}
 936                 ? $self->reading( $options->{'from'} ) : $self->start;
 937         my $end = $options->{'to'}
 938                 ? $self->reading( $options->{'to'} ) : $self->end;
 939         if( $start->has_rank && $end->has_rank && $end->rank < $start->rank ) {
 940                 throw( 'Start node must be before end node' );
 941         }
 942         # The readings need to be ranked for this to work.
 943         $start = $self->start unless $start->has_rank;
 944         $end = $self->end unless $end->has_rank;
 945         my $rankoffset = 0;
 946         unless( $start eq $self->start ) {
 947                 $rankoffset = $start->rank - 1;
 948         }
 949         my %use_readings;
 950
 951     # Some namespaces
 952     my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns';
 953     my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance';
 954     my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' .
 955         'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
 956
 957     # Create the document and root node
 958     my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
 959     my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
 960     $graphml->setDocumentElement( $root );
 961     $root->setNamespace( $xsi_ns, 'xsi', 0 );
 962     $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
 963
 964     # List of attribute types to save on our objects and their corresponding
 965     # GraphML types
 966     my %save_types = (
 967         'Str' => 'string',
 968         'Int' => 'int',
 969         'Bool' => 'boolean',
 970         'ReadingID' => 'string',
 971         'RelationshipType' => 'string',
 972         'RelationshipScope' => 'string',
 973     );
 974
 975     # List of attribute names *not* to save on our objects.
 976     # We will also not save any attribute beginning with _.
 977     my %skipsave;
 978     map { $skipsave{$_} = 1 } qw/ cached_svg /;
 979
 980     # Add the data keys for the graph. Include an extra key 'version' for the
 981     # GraphML output version.
 982     my %graph_data_keys;
 983     my $gdi = 0;
 984     my %graph_attributes = ( 'version' => 'string' );
 985         # Graph attributes include those of Tradition and those of Collation.
 986         my %gattr_from;
 987         my $tmeta = $self->tradition->meta;
 988         my $cmeta = $self->meta;
 989         map { $gattr_from{$_->name} = 'Tradition' } $tmeta->get_all_attributes;
 990         map { $gattr_from{$_->name} = 'Collation' } $cmeta->get_all_attributes;
 991         foreach my $attr ( ( $tmeta->get_all_attributes, $cmeta->get_all_attributes ) ) {
 992                 next if $attr->name =~ /^_/;
 993                 next if $skipsave{$attr->name};
 994                 next unless $save_types{$attr->type_constraint->name};
 995                 $graph_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
 996         }
 997     # Extra custom key for the tradition stemma(ta)
 998     $graph_attributes{'stemmata'} = 'string';
 999
1000     foreach my $datum ( sort keys %graph_attributes ) {
1001         $graph_data_keys{$datum} = 'dg'.$gdi++;
1002         my $key = $root->addNewChild( $graphml_ns, 'key' );
1003         $key->setAttribute( 'attr.name', $datum );
1004         $key->setAttribute( 'attr.type', $graph_attributes{$datum} );
1005         $key->setAttribute( 'for', 'graph' );
1006         $key->setAttribute( 'id', $graph_data_keys{$datum} );
1007     }
1008
1009     # Add the data keys for reading nodes
1010     my %reading_attributes;
1011     my $rmeta = Text::Tradition::Collation::Reading->meta;
1012     foreach my $attr( $rmeta->get_all_attributes ) {
1013                 next if $attr->name =~ /^_/;
1014                 next if $skipsave{$attr->name};
1015                 next unless $save_types{$attr->type_constraint->name};
1016                 $reading_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1017         }
1018         # Extra custom key for the reading morphology
1019         $reading_attributes{'lexemes'} = 'string';
1020
1021     my %node_data_keys;
1022     my $ndi = 0;
1023     foreach my $datum ( sort keys %reading_attributes ) {
1024         $node_data_keys{$datum} = 'dn'.$ndi++;
1025         my $key = $root->addNewChild( $graphml_ns, 'key' );
1026         $key->setAttribute( 'attr.name', $datum );
1027         $key->setAttribute( 'attr.type', $reading_attributes{$datum} );
1028         $key->setAttribute( 'for', 'node' );
1029         $key->setAttribute( 'id', $node_data_keys{$datum} );
1030     }
1031
1032     # Add the data keys for edges, that is, paths and relationships. Path
1033     # data does not come from a Moose class so is here manually.
1034     my $edi = 0;
1035     my %edge_data_keys;
1036     my %edge_attributes = (
1037         witness => 'string',                    # ID/label for a path
1038         extra => 'boolean',                             # Path key
1039         );
1040     my @path_attributes = keys %edge_attributes; # track our manual additions
1041     my $pmeta = Text::Tradition::Collation::Relationship->meta;
1042     foreach my $attr( $pmeta->get_all_attributes ) {
1043                 next if $attr->name =~ /^_/;
1044                 next if $skipsave{$attr->name};
1045                 next unless $save_types{$attr->type_constraint->name};
1046                 $edge_attributes{$attr->name} = $save_types{$attr->type_constraint->name};
1047         }
1048     foreach my $datum ( sort keys %edge_attributes ) {
1049         $edge_data_keys{$datum} = 'de'.$edi++;
1050         my $key = $root->addNewChild( $graphml_ns, 'key' );
1051         $key->setAttribute( 'attr.name', $datum );
1052         $key->setAttribute( 'attr.type', $edge_attributes{$datum} );
1053         $key->setAttribute( 'for', 'edge' );
1054         $key->setAttribute( 'id', $edge_data_keys{$datum} );
1055     }
1056
1057     # Add the collation graph itself. First, sanitize the name to a valid XML ID.
1058     my $xmlidname = $self->tradition->name;
1059     $xmlidname =~ s/(?!$xml10_namechar_rx)./_/g;
1060     if( $xmlidname !~ /^$xml10_namestartchar_rx/ ) {
1061         $xmlidname = '_'.$xmlidname;
1062     }
1063     my $sgraph = $root->addNewChild( $graphml_ns, 'graph' );
1064     $sgraph->setAttribute( 'edgedefault', 'directed' );
1065     $sgraph->setAttribute( 'id', $xmlidname );
1066     $sgraph->setAttribute( 'parse.edgeids', 'canonical' );
1067     $sgraph->setAttribute( 'parse.edges', 0 ); # fill in later
1068     $sgraph->setAttribute( 'parse.nodeids', 'canonical' );
1069     $sgraph->setAttribute( 'parse.nodes', 0 ); # fill in later
1070     $sgraph->setAttribute( 'parse.order', 'nodesfirst' );
1071
1072     # Tradition/collation attribute data
1073     foreach my $datum ( keys %graph_attributes ) {
1074         my $value;
1075         if( $datum eq 'version' ) {
1076                 $value = '3.2';
1077         } elsif( $datum eq 'stemmata' ) {
1078                 my @stemstrs;
1079                 map { push( @stemstrs, $_->editable( {linesep => ''} ) ) }
1080                         $self->tradition->stemmata;
1081                 $value = join( "\n", @stemstrs );
1082         } elsif( $gattr_from{$datum} eq 'Tradition' ) {
1083                 $value = $self->tradition->$datum;
1084         } else {
1085                 $value = $self->$datum;
1086         }
1087                 _add_graphml_data( $sgraph, $graph_data_keys{$datum}, $value );
1088         }
1089
1090     my $node_ctr = 0;
1091     my %node_hash;
1092     # Add our readings to the graph
1093     foreach my $n ( sort { $a->id cmp $b->id } $self->readings ) {
1094         next if $n->has_rank && $n ne $self->start && $n ne $self->end &&
1095                 ( $n->rank < $start->rank || $n->rank > $end->rank );
1096         $use_readings{$n->id} = 1;
1097         # Add to the main graph
1098         my $node_el = $sgraph->addNewChild( $graphml_ns, 'node' );
1099         my $node_xmlid = 'n' . $node_ctr++;
1100         $node_hash{ $n->id } = $node_xmlid;
1101         $node_el->setAttribute( 'id', $node_xmlid );
1102         foreach my $d ( keys %reading_attributes ) {
1103                 my $nval = $n->$d;
1104                 # Custom serialization
1105                 if( $d eq 'lexemes' ) {
1106                                 # If nval is a true value, we have lexemes so we need to
1107                                 # serialize them. Otherwise set nval to undef so that the
1108                                 # key is excluded from this reading.
1109                         $nval = $nval ? $n->_serialize_lexemes : undef;
1110                 } elsif( $d eq 'normal_form' && $n->normal_form eq $n->text ) {
1111                         $nval = undef;
1112                 }
1113                 if( $rankoffset && $d eq 'rank' && $n ne $self->start ) {
1114                         # Adjust the ranks within the subgraph.
1115                         $nval = $n eq $self->end ? $end->rank - $rankoffset + 1
1116                                 : $nval - $rankoffset;
1117                 }
1118                 _add_graphml_data( $node_el, $node_data_keys{$d}, $nval )
1119                         if defined $nval;
1120         }
1121     }
1122
1123     # Add the path edges to the sequence graph
1124     my $edge_ctr = 0;
1125     foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->sequence->edges() ) {
1126         # We add an edge in the graphml for every witness in $e.
1127         next unless( $use_readings{$e->[0]} || $use_readings{$e->[1]} );
1128         my @edge_wits = sort $self->path_witnesses( $e );
1129         $e->[0] = $self->start->id unless $use_readings{$e->[0]};
1130         $e->[1] = $self->end->id unless $use_readings{$e->[1]};
1131         # Skip any path from start to end; that witness is not in the subgraph.
1132         next if ( $e->[0] eq $self->start->id && $e->[1] eq $self->end->id );
1133         foreach my $wit ( @edge_wits ) {
1134                         my( $id, $from, $to ) = ( 'e'.$edge_ctr++,
1135                                                                                 $node_hash{ $e->[0] },
1136                                                                                 $node_hash{ $e->[1] } );
1137                         my $edge_el = $sgraph->addNewChild( $graphml_ns, 'edge' );
1138                         $edge_el->setAttribute( 'source', $from );
1139                         $edge_el->setAttribute( 'target', $to );
1140                         $edge_el->setAttribute( 'id', $id );
1141
1142                         # It's a witness path, so add the witness
1143                         my $base = $wit;
1144                         my $key = $edge_data_keys{'witness'};
1145                         # Is this an ante-corr witness?
1146                         my $aclabel = $self->ac_label;
1147                         if( $wit =~ /^(.*)\Q$aclabel\E$/ ) {
1148                                 # Keep the base witness
1149                                 $base = $1;
1150                                 # ...and record that this is an 'extra' reading path
1151                                 _add_graphml_data( $edge_el, $edge_data_keys{'extra'}, $aclabel );
1152                         }
1153                         _add_graphml_data( $edge_el, $edge_data_keys{'witness'}, $base );
1154                 }
1155         }
1156
1157         # Report the actual number of nodes and edges that went in
1158         $sgraph->setAttribute( 'parse.edges', $edge_ctr );
1159         $sgraph->setAttribute( 'parse.nodes', $node_ctr );
1160
1161         # Add the relationship graph to the XML
1162         map { delete $edge_data_keys{$_} } @path_attributes;
1163         $self->relations->_as_graphml( $graphml_ns, $root, \%node_hash,
1164                 $node_data_keys{'id'}, \%edge_data_keys );
1165
1166     # Save and return the thing
1167     my $result = decode_utf8( $graphml->toString(1) );
1168     return $result;
1169 }
1170
1171 sub _add_graphml_data {
1172     my( $el, $key, $value ) = @_;
1173     return unless defined $value;
1174     my $data_el = $el->addNewChild( $el->namespaceURI, 'data' );
1175     $data_el->setAttribute( 'key', $key );
1176     $data_el->appendText( $value );
1177 }
1178
1179 =head2 as_csv
1180
1181 Returns a CSV alignment table representation of the collation graph, one
1182 row per witness (or witness uncorrected.)
1183
1184 =cut
1185
1186 sub as_csv {
1187     my( $self ) = @_;
1188     my $table = $self->alignment_table;
1189     my $csv = Text::CSV->new( { binary => 1, quote_null => 0 } );
1190     my @result;
1191     # Make the header row
1192     $csv->combine( map { $_->{'witness'} } @{$table->{'alignment'}} );
1193         push( @result, decode_utf8( $csv->string ) );
1194     # Make the rest of the rows
1195     foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
1196         my @rowobjs = map { $_->{'tokens'}->[$idx] } @{$table->{'alignment'}};
1197         my @row = map { $_ ? $_->{'t'}->text : $_ } @rowobjs;
1198         $csv->combine( @row );
1199         push( @result, decode_utf8( $csv->string ) );
1200     }
1201     return join( "\n", @result );
1202 }
1203
1204 =head2 alignment_table( $use_refs, $include_witnesses )
1205
1206 Return a reference to an alignment table, in a slightly enhanced CollateX
1207 format which looks like this:
1208
1209  $table = { alignment => [ { witness => "SIGIL",
1210                              tokens => [ { t => "TEXT" }, ... ] },
1211                            { witness => "SIG2",
1212                              tokens => [ { t => "TEXT" }, ... ] },
1213                            ... ],
1214             length => TEXTLEN };
1215
1216 If $use_refs is set to 1, the reading object is returned in the table
1217 instead of READINGTEXT; if not, the text of the reading is returned.
1218
1219 If $include_witnesses is set to a hashref, only the witnesses whose sigil
1220 keys have a true hash value will be included.
1221
1222 =cut
1223
1224 sub alignment_table {
1225     my( $self ) = @_;
1226     $self->calculate_ranks() unless $self->_graphcalc_done;
1227     return $self->cached_table if $self->has_cached_table;
1228
1229     # Make sure we can do this
1230         throw( "Need a linear graph in order to make an alignment table" )
1231                 unless $self->linear;
1232         $self->calculate_ranks unless $self->end->has_rank;
1233
1234     my $table = { 'alignment' => [], 'length' => $self->end->rank - 1 };
1235     my @all_pos = ( 1 .. $self->end->rank - 1 );
1236     foreach my $wit ( sort { $a->sigil cmp $b->sigil } $self->tradition->witnesses ) {
1237         # print STDERR "Making witness row(s) for " . $wit->sigil . "\n";
1238         my @wit_path = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
1239         my @row = _make_witness_row( \@wit_path, \@all_pos );
1240         push( @{$table->{'alignment'}},
1241                 { 'witness' => $wit->sigil, 'tokens' => \@row } );
1242         if( $wit->is_layered ) {
1243                 my @wit_ac_path = $self->reading_sequence( $self->start, $self->end,
1244                         $wit->sigil.$self->ac_label );
1245             my @ac_row = _make_witness_row( \@wit_ac_path, \@all_pos );
1246                         push( @{$table->{'alignment'}},
1247                                 { 'witness' => $wit->sigil.$self->ac_label, 'tokens' => \@ac_row } );
1248         }
1249     }
1250     $self->cached_table( $table );
1251     return $table;
1252 }
1253
1254 sub _make_witness_row {
1255     my( $path, $positions ) = @_;
1256     my %char_hash;
1257     map { $char_hash{$_} = undef } @$positions;
1258     my $debug = 0;
1259     foreach my $rdg ( @$path ) {
1260         my $rtext = $rdg->text;
1261         $rtext = '#LACUNA#' if $rdg->is_lacuna;
1262         print STDERR "rank " . $rdg->rank . "\n" if $debug;
1263         # print STDERR "No rank for " . $rdg->id . "\n" unless defined $rdg->rank;
1264         $char_hash{$rdg->rank} = { 't' => $rdg };
1265     }
1266     my @row = map { $char_hash{$_} } @$positions;
1267     # Fill in lacuna markers for undef spots in the row
1268     my $last_el = shift @row;
1269     my @filled_row = ( $last_el );
1270     foreach my $el ( @row ) {
1271         # If we are using node reference, make the lacuna node appear many times
1272         # in the table.  If not, use the lacuna tag.
1273         if( $last_el && $last_el->{'t'}->is_lacuna && !defined $el ) {
1274             $el = $last_el;
1275         }
1276         push( @filled_row, $el );
1277         $last_el = $el;
1278     }
1279     return @filled_row;
1280 }
1281
1282 =head1 NAVIGATION METHODS
1283
1284 =head2 reading_sequence( $first, $last, $sigil, $backup )
1285
1286 Returns the ordered list of readings, starting with $first and ending
1287 with $last, for the witness given in $sigil. If a $backup sigil is
1288 specified (e.g. when walking a layered witness), it will be used wherever
1289 no $sigil path exists.  If there is a base text reading, that will be
1290 used wherever no path exists for $sigil or $backup.
1291
1292 =cut
1293
1294 # TODO Think about returning some lazy-eval iterator.
1295 # TODO Get rid of backup; we should know from what witness is whether we need it.
1296
1297 sub reading_sequence {
1298     my( $self, $start, $end, $witness ) = @_;
1299
1300     $witness = $self->baselabel unless $witness;
1301     my @readings = ( $start );
1302     my %seen;
1303     my $n = $start;
1304     while( $n && $n->id ne $end->id ) {
1305         if( exists( $seen{$n->id} ) ) {
1306             throw( "Detected loop for $witness at " . $n->id );
1307         }
1308         $seen{$n->id} = 1;
1309
1310         my $next = $self->next_reading( $n, $witness );
1311         unless( $next ) {
1312             throw( "Did not find any path for $witness from reading " . $n->id );
1313         }
1314         push( @readings, $next );
1315         $n = $next;
1316     }
1317     # Check that the last reading is our end reading.
1318     my $last = $readings[$#readings];
1319     throw( "Last reading found from " . $start->text .
1320         " for witness $witness is not the end!" ) # TODO do we get this far?
1321         unless $last->id eq $end->id;
1322
1323     return @readings;
1324 }
1325
1326 =head2 next_reading( $reading, $sigil );
1327
1328 Returns the reading that follows the given reading along the given witness
1329 path.
1330
1331 =cut
1332
1333 sub next_reading {
1334     # Return the successor via the corresponding path.
1335     my $self = shift;
1336     my $answer = $self->_find_linked_reading( 'next', @_ );
1337         return undef unless $answer;
1338     return $self->reading( $answer );
1339 }
1340
1341 =head2 prior_reading( $reading, $sigil )
1342
1343 Returns the reading that precedes the given reading along the given witness
1344 path.
1345
1346 =cut
1347
1348 sub prior_reading {
1349     # Return the predecessor via the corresponding path.
1350     my $self = shift;
1351     my $answer = $self->_find_linked_reading( 'prior', @_ );
1352     return $self->reading( $answer );
1353 }
1354
1355 sub _find_linked_reading {
1356     my( $self, $direction, $node, $path ) = @_;
1357
1358     # Get a backup if we are dealing with a layered witness
1359     my $alt_path;
1360     my $aclabel = $self->ac_label;
1361     if( $path && $path =~ /^(.*)\Q$aclabel\E$/ ) {
1362         $alt_path = $1;
1363     }
1364
1365     my @linked_paths = $direction eq 'next'
1366         ? $self->sequence->edges_from( $node )
1367         : $self->sequence->edges_to( $node );
1368     return undef unless scalar( @linked_paths );
1369
1370     # We have to find the linked path that contains all of the
1371     # witnesses supplied in $path.
1372     my( @path_wits, @alt_path_wits );
1373     @path_wits = sort( $self->_witnesses_of_label( $path ) ) if $path;
1374     @alt_path_wits = sort( $self->_witnesses_of_label( $alt_path ) ) if $alt_path;
1375     my $base_le;
1376     my $alt_le;
1377     foreach my $le ( @linked_paths ) {
1378         if( $self->sequence->has_edge_attribute( @$le, $self->baselabel ) ) {
1379             $base_le = $le;
1380         }
1381                 my @le_wits = sort $self->path_witnesses( $le );
1382                 if( _is_within( \@path_wits, \@le_wits ) ) {
1383                         # This is the right path.
1384                         return $direction eq 'next' ? $le->[1] : $le->[0];
1385                 } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) {
1386                         $alt_le = $le;
1387                 }
1388     }
1389     # Got this far? Return the alternate path if it exists.
1390     return $direction eq 'next' ? $alt_le->[1] : $alt_le->[0]
1391         if $alt_le;
1392
1393     # Got this far? Return the base path if it exists.
1394     return $direction eq 'next' ? $base_le->[1] : $base_le->[0]
1395         if $base_le;
1396
1397     # Got this far? We have no appropriate path.
1398     warn "Could not find $direction node from " . $node->id
1399         . " along path $path";
1400     return undef;
1401 }
1402
1403 # Some set logic.
1404 sub _is_within {
1405     my( $set1, $set2 ) = @_;
1406     my $ret = @$set1; # will be 0, i.e. false, if set1 is empty
1407     foreach my $el ( @$set1 ) {
1408         $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
1409     }
1410     return $ret;
1411 }
1412
1413 # Return the string that joins together a list of witnesses for
1414 # display on a single path.
1415 sub _witnesses_of_label {
1416     my( $self, $label ) = @_;
1417     my $regex = $self->wit_list_separator;
1418     my @answer = split( /\Q$regex\E/, $label );
1419     return @answer;
1420 }
1421
1422 =head2 common_readings
1423
1424 Returns the list of common readings in the graph (i.e. those readings that are
1425 shared by all non-lacunose witnesses.)
1426
1427 =cut
1428
1429 sub common_readings {
1430         my $self = shift;
1431         my @common = grep { $_->is_common } $self->readings;
1432         return @common;
1433 }
1434
1435 =head2 path_text( $sigil, [, $start, $end ] )
1436
1437 Returns the text of a witness (plus its backup, if we are using a layer)
1438 as stored in the collation.  The text is returned as a string, where the
1439 individual readings are joined with spaces and the meta-readings (e.g.
1440 lacunae) are omitted.  Optional specification of $start and $end allows
1441 the generation of a subset of the witness text.
1442
1443 =cut
1444
1445 sub path_text {
1446         my( $self, $wit, $start, $end ) = @_;
1447         $start = $self->start unless $start;
1448         $end = $self->end unless $end;
1449         my @path = grep { !$_->is_meta } $self->reading_sequence( $start, $end, $wit );
1450         my $pathtext = '';
1451         my $last;
1452         foreach my $r ( @path ) {
1453                 unless ( $r->join_prior || !$last || $last->join_next ) {
1454                         $pathtext .= ' ';
1455                 }
1456                 $pathtext .= $r->text;
1457                 $last = $r;
1458         }
1459         return $pathtext;
1460 }
1461
1462 =head1 INITIALIZATION METHODS
1463
1464 These are mostly for use by parsers.
1465
1466 =head2 make_witness_path( $witness )
1467
1468 Link the array of readings contained in $witness->path (and in
1469 $witness->uncorrected_path if it exists) into collation paths.
1470 Clear out the arrays when finished.
1471
1472 =head2 make_witness_paths
1473
1474 Call make_witness_path for all witnesses in the tradition.
1475
1476 =cut
1477
1478 # For use when a collation is constructed from a base text and an apparatus.
1479 # We have the sequences of readings and just need to add path edges.
1480 # When we are done, clear out the witness path attributes, as they are no
1481 # longer needed.
1482 # TODO Find a way to replace the witness path attributes with encapsulated functions?
1483
1484 sub make_witness_paths {
1485     my( $self ) = @_;
1486     foreach my $wit ( $self->tradition->witnesses ) {
1487         # print STDERR "Making path for " . $wit->sigil . "\n";
1488         $self->make_witness_path( $wit );
1489     }
1490 }
1491
1492 sub make_witness_path {
1493     my( $self, $wit ) = @_;
1494     my @chain = @{$wit->path};
1495     my $sig = $wit->sigil;
1496     # Add start and end if necessary
1497     unshift( @chain, $self->start ) unless $chain[0] eq $self->start;
1498     push( @chain, $self->end ) unless $chain[-1] eq $self->end;
1499     foreach my $idx ( 0 .. $#chain-1 ) {
1500         $self->add_path( $chain[$idx], $chain[$idx+1], $sig );
1501     }
1502     if( $wit->is_layered ) {
1503         @chain = @{$wit->uncorrected_path};
1504                 unshift( @chain, $self->start ) unless $chain[0] eq $self->start;
1505                 push( @chain, $self->end ) unless $chain[-1] eq $self->end;
1506         foreach my $idx( 0 .. $#chain-1 ) {
1507             my $source = $chain[$idx];
1508             my $target = $chain[$idx+1];
1509             $self->add_path( $source, $target, $sig.$self->ac_label )
1510                 unless $self->has_path( $source, $target, $sig );
1511         }
1512     }
1513     $wit->clear_path;
1514     $wit->clear_uncorrected_path;
1515 }
1516
1517 =head2 calculate_ranks
1518
1519 Calculate the reading ranks (that is, their aligned positions relative
1520 to each other) for the graph.  This can only be called on linear collations.
1521
1522 =begin testing
1523
1524 use Text::Tradition;
1525
1526 my $cxfile = 't/data/Collatex-16.xml';
1527 my $t = Text::Tradition->new(
1528     'name'  => 'inline',
1529     'input' => 'CollateX',
1530     'file'  => $cxfile,
1531     );
1532 my $c = $t->collation;
1533
1534 # Make an svg
1535 my $table = $c->alignment_table;
1536 ok( $c->has_cached_table, "Alignment table was cached" );
1537 is( $c->alignment_table, $table, "Cached table returned upon second call" );
1538 $c->calculate_ranks;
1539 is( $c->alignment_table, $table, "Cached table retained with no rank change" );
1540 $c->add_relationship( 'n24', 'n23', { 'type' => 'spelling' } );
1541 isnt( $c->alignment_table, $table, "Alignment table changed after relationship add" );
1542
1543 =end testing
1544
1545 =cut
1546
1547 sub calculate_ranks {
1548     my $self = shift;
1549     # Save the existing ranks, in case we need to invalidate the cached SVG.
1550     my %existing_ranks;
1551     map { $existing_ranks{$_} = $_->rank } $self->readings;
1552
1553     # Do the rankings based on the relationship equivalence graph, starting
1554     # with the start node.
1555     my ( $node_ranks, $rank_nodes ) = $self->relations->equivalence_ranks();
1556
1557     # Transfer our rankings from the topological graph to the real one.
1558     foreach my $r ( $self->readings ) {
1559         if( defined $node_ranks->{$self->equivalence( $r->id )} ) {
1560             $r->rank( $node_ranks->{$self->equivalence( $r->id )} );
1561         } else {
1562                 # Die. Find the last rank we calculated.
1563                 my @all_defined = sort { ( $node_ranks->{$self->equivalence( $a->id )}||-1 )
1564                                  <=> ( $node_ranks->{$self->equivalence( $b->id )}||-1 ) }
1565                         $self->readings;
1566                 my $last = pop @all_defined;
1567             throw( "Ranks not calculated after $last - do you have a cycle in the graph?" );
1568         }
1569     }
1570     # Do we need to invalidate the cached data?
1571     if( $self->has_cached_svg || $self->has_cached_table ) {
1572         foreach my $r ( $self->readings ) {
1573                 next if defined( $existing_ranks{$r} )
1574                         && $existing_ranks{$r} == $r->rank;
1575                 # Something has changed, so clear the cache
1576                 $self->_clear_cache;
1577                         # ...and recalculate the common readings.
1578                         $self->calculate_common_readings();
1579                 last;
1580         }
1581     }
1582         # The graph calculation information is now up to date.
1583         $self->_graphcalc_done(1);
1584 }
1585
1586 sub _clear_cache {
1587         my $self = shift;
1588         $self->wipe_svg if $self->has_cached_svg;
1589         $self->wipe_table if $self->has_cached_table;
1590 }
1591
1592
1593 =head2 flatten_ranks
1594
1595 A convenience method for parsing collation data.  Searches the graph for readings
1596 with the same text at the same rank, and merges any that are found.
1597
1598 =cut
1599
1600 sub flatten_ranks {
1601     my $self = shift;
1602     my %unique_rank_rdg;
1603     my $changed;
1604     foreach my $rdg ( $self->readings ) {
1605         next unless $rdg->has_rank;
1606         my $key = $rdg->rank . "||" . $rdg->text;
1607         if( exists $unique_rank_rdg{$key} ) {
1608                 # Make sure they don't have different grammatical forms
1609                         my $ur = $unique_rank_rdg{$key};
1610                         if( $rdg->disambiguated && $ur->disambiguated ) {
1611                                 my $rform = join( '//', map { $_->form->to_string } $rdg->lexemes );
1612                                 my $uform = join( '//', map { $_->form->to_string } $ur->lexemes );
1613                                 next unless $rform eq $uform;
1614                         } elsif( $rdg->disambiguated xor $ur->disambiguated ) {
1615                                 next;
1616                         }
1617             # Combine!
1618                 #print STDERR "Combining readings at same rank: $key\n";
1619                 $changed = 1;
1620             $self->merge_readings( $unique_rank_rdg{$key}, $rdg );
1621             # TODO see if this now makes a common point.
1622         } else {
1623             $unique_rank_rdg{$key} = $rdg;
1624         }
1625     }
1626     # If we merged readings, the ranks are still fine but the alignment
1627     # table is wrong. Wipe it.
1628     $self->wipe_table() if $changed;
1629 }
1630
1631
1632 =head2 calculate_common_readings
1633
1634 Goes through the graph identifying the readings that appear in every witness
1635 (apart from those with lacunae at that spot.) Marks them as common and returns
1636 the list.
1637
1638 =begin testing
1639
1640 use Text::Tradition;
1641
1642 my $cxfile = 't/data/Collatex-16.xml';
1643 my $t = Text::Tradition->new(
1644     'name'  => 'inline',
1645     'input' => 'CollateX',
1646     'file'  => $cxfile,
1647     );
1648 my $c = $t->collation;
1649
1650 my @common = $c->calculate_common_readings();
1651 is( scalar @common, 8, "Found correct number of common readings" );
1652 my @marked = sort $c->common_readings();
1653 is( scalar @common, 8, "All common readings got marked as such" );
1654 my @expected = qw/ n1 n11 n16 n19 n20 n5 n6 n7 /;
1655 is_deeply( \@marked, \@expected, "Found correct list of common readings" );
1656
1657 =end testing
1658
1659 =cut
1660
1661 sub calculate_common_readings {
1662         my $self = shift;
1663         my @common;
1664         map { $_->is_common( 0 ) } $self->readings;
1665         # Implicitly calls calculate_ranks
1666         my $table = $self->alignment_table;
1667         foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
1668                 my @row = map { $_->{'tokens'}->[$idx]
1669                                                         ? $_->{'tokens'}->[$idx]->{'t'} : '' }
1670                                         @{$table->{'alignment'}};
1671                 my %hash;
1672                 foreach my $r ( @row ) {
1673                         if( $r ) {
1674                                 $hash{$r->id} = $r unless $r->is_meta;
1675                         } else {
1676                                 $hash{'UNDEF'} = $r;
1677                         }
1678                 }
1679                 if( keys %hash == 1 && !exists $hash{'UNDEF'} ) {
1680                         my( $r ) = values %hash;
1681                         $r->is_common( 1 );
1682                         push( @common, $r );
1683                 }
1684         }
1685         return @common;
1686 }
1687
1688 =head2 text_from_paths
1689
1690 Calculate the text array for all witnesses from the path, for later consistency
1691 checking.  Only to be used if there is no non-graph-based way to know the
1692 original texts.
1693
1694 =cut
1695
1696 sub text_from_paths {
1697         my $self = shift;
1698     foreach my $wit ( $self->tradition->witnesses ) {
1699         my @readings = $self->reading_sequence( $self->start, $self->end, $wit->sigil );
1700         my @text;
1701         foreach my $r ( @readings ) {
1702                 next if $r->is_meta;
1703                 push( @text, $r->text );
1704         }
1705         $wit->text( \@text );
1706         if( $wit->is_layered ) {
1707                         my @ucrdgs = $self->reading_sequence( $self->start, $self->end,
1708                                                                                                   $wit->sigil.$self->ac_label );
1709                         my @uctext;
1710                         foreach my $r ( @ucrdgs ) {
1711                                 next if $r->is_meta;
1712                                 push( @uctext, $r->text );
1713                         }
1714                         $wit->layertext( \@uctext );
1715         }
1716     }
1717 }
1718
1719 =head1 UTILITY FUNCTIONS
1720
1721 =head2 common_predecessor( $reading_a, $reading_b )
1722
1723 Find the last reading that occurs in sequence before both the given readings.
1724 At the very least this should be $self->start.
1725
1726 =head2 common_successor( $reading_a, $reading_b )
1727
1728 Find the first reading that occurs in sequence after both the given readings.
1729 At the very least this should be $self->end.
1730
1731 =begin testing
1732
1733 use Text::Tradition;
1734
1735 my $cxfile = 't/data/Collatex-16.xml';
1736 my $t = Text::Tradition->new(
1737     'name'  => 'inline',
1738     'input' => 'CollateX',
1739     'file'  => $cxfile,
1740     );
1741 my $c = $t->collation;
1742
1743 is( $c->common_predecessor( 'n24', 'n23' )->id,
1744     'n20', "Found correct common predecessor" );
1745 is( $c->common_successor( 'n24', 'n23' )->id,
1746     '__END__', "Found correct common successor" );
1747
1748 is( $c->common_predecessor( 'n19', 'n17' )->id,
1749     'n16', "Found correct common predecessor for readings on same path" );
1750 is( $c->common_successor( 'n21', 'n10' )->id,
1751     '__END__', "Found correct common successor for readings on same path" );
1752
1753 =end testing
1754
1755 =cut
1756
1757 ## Return the closest reading that is a predecessor of both the given readings.
1758 sub common_predecessor {
1759         my $self = shift;
1760         my( $r1, $r2 ) = $self->_objectify_args( @_ );
1761         return $self->_common_in_path( $r1, $r2, 'predecessors' );
1762 }
1763
1764 sub common_successor {
1765         my $self = shift;
1766         my( $r1, $r2 ) = $self->_objectify_args( @_ );
1767         return $self->_common_in_path( $r1, $r2, 'successors' );
1768 }
1769
1770
1771 # TODO think about how to do this without ranks...
1772 sub _common_in_path {
1773         my( $self, $r1, $r2, $dir ) = @_;
1774         my $iter = $self->end->rank;
1775         my @candidates;
1776         my @last_r1 = ( $r1 );
1777         my @last_r2 = ( $r2 );
1778         # my %all_seen = ( $r1 => 'r1', $r2 => 'r2' );
1779         my %all_seen;
1780         # print STDERR "Finding common $dir for $r1, $r2\n";
1781         while( !@candidates ) {
1782                 last unless $iter--;  # Avoid looping infinitely
1783                 # Iterate separately down the graph from r1 and r2
1784                 my( @new_lc1, @new_lc2 );
1785                 foreach my $lc ( @last_r1 ) {
1786                         foreach my $p ( $lc->$dir ) {
1787                                 if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r1' ) {
1788                                         # print STDERR "Path candidate $p from $lc\n";
1789                                         push( @candidates, $p );
1790                                 } elsif( !$all_seen{$p->id} ) {
1791                                         $all_seen{$p->id} = 'r1';
1792                                         push( @new_lc1, $p );
1793                                 }
1794                         }
1795                 }
1796                 foreach my $lc ( @last_r2 ) {
1797                         foreach my $p ( $lc->$dir ) {
1798                                 if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r2' ) {
1799                                         # print STDERR "Path candidate $p from $lc\n";
1800                                         push( @candidates, $p );
1801                                 } elsif( !$all_seen{$p->id} ) {
1802                                         $all_seen{$p->id} = 'r2';
1803                                         push( @new_lc2, $p );
1804                                 }
1805                         }
1806                 }
1807                 @last_r1 = @new_lc1;
1808                 @last_r2 = @new_lc2;
1809         }
1810         my @answer = sort { $a->rank <=> $b->rank } @candidates;
1811         return $dir eq 'predecessors' ? pop( @answer ) : shift ( @answer );
1812 }
1813
1814 sub throw {
1815         Text::Tradition::Error->throw(
1816                 'ident' => 'Collation error',
1817                 'message' => $_[0],
1818                 );
1819 }
1820
1821 no Moose;
1822 __PACKAGE__->meta->make_immutable;
1823
1824 =head1 LICENSE
1825
1826 This package is free software and is provided "as is" without express
1827 or implied warranty.  You can redistribute it and/or modify it under
1828 the same terms as Perl itself.
1829
1830 =head1 AUTHOR
1831
1832 Tara L Andrews E<lt>aurum@cpan.orgE<gt>