lib/Text/Tradition/Parser/BaseText.pm

   1 package Text::Tradition::Parser::BaseText;
   2
   3 use strict;
   4 use warnings;
   5 use Module::Load;
   6 use Algorithm::Diff;
   7
   8 =head1 NAME
   9
  10 Text::Tradition::Parser::BaseText
  11
  12 =head1 SYNOPSIS
  13
  14 use Text::Tradition::Parser::BaseText qw( merge_base );
  15 merge_base( $graph, 'reference.txt', @apparatus_entries )
  16
  17 =head1 DESCRIPTION
  18
  19 For an overview of the package, see the documentation for the
  20 Text::Tradition::Graph module.
  21
  22 This module is meant for use with certain of the other Parser classes
  23 - whenever a list of variants is given with reference to a base text,
  24 these must be joined into a single collation.  The parser should
  25 therefore make a list of variants and their locations, and BaseText
  26 will join those listed variants onto the reference text.
  27
  28 =head1 SUBROUTINES
  29
  30 =over
  31
  32 =item B<parse>
  33
  34 parse( $graph, %opts );
  35
  36 Takes an initialized graph and a set of options, which must include:
  37 - 'base' - the base text referenced by the variants
  38 - 'format' - the format of the variant list
  39 - 'data' - the variants, in the given format.
  40
  41 =cut
  42
  43 sub parse {
  44     my( $tradition, %opts ) = @_;
  45
  46     my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'};
  47     load( $format_mod );
  48     my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} );
  49     merge_base( $tradition->collation, $opts{'base'}, @apparatus_entries );
  50 }
  51
  52 =item B<merge_base>
  53
  54 merge_base( $graph, 'reference.txt', @apparatus_entries )
  55
  56 Takes three arguments: a newly-initialized Text::Tradition::Graph
  57 object, a text file containing the reference text, and a list of
  58 variants (apparatus entries).  Adds the base text to the graph, and
  59 joins the variants to that.
  60
  61 The list of variants is an array of hash references; each hash takes
  62 the form
  63  { '_id' => line reference,
  64    'rdg_0' => lemma reading,
  65    'rdg_1' => first variant,
  66    ...  # and so on until all distinct readings are listed
  67    'WitnessA' => 'rdg_0',
  68    'WitnessB' => 'rdg_1',
  69    ...  # and so on until all witnesses are listed with their readings
  70  }
  71
  72 Any hash key that is not of the form /^rdg_\d+$/ and that does not
  73 begin with an underscore is assumed to be a witness name.  Any 'meta'
  74 information to be passed must be passed in a key with a leading
  75 underscore in its name.
  76
  77 =cut
  78
  79 my $SHORTEND; # Debug var - set this to limit the number of lines parsed
  80
  81 my %base_text_index;
  82 my $edits_required = {};
  83
  84 # edits_required -> wit -> [ { start_idx, end_idx, items } ]
  85
  86 sub merge_base {
  87     my( $collation, $base_file, @app_entries ) = @_;
  88     my @base_line_starts = read_base( $base_file, $collation );
  89
  90     my %all_witnesses;
  91     my @unwitnessed_lemma_nodes;
  92     foreach my $app ( @app_entries ) {
  93         my( $line, $num ) = split( /\./, $app->{_id} );
  94         # DEBUG with a short graph
  95         last if $SHORTEND && $line > $SHORTEND;
  96         # DEBUG for problematic entries
  97         my $scrutinize = '';
  98         my $first_line_reading = $base_line_starts[ $line ];
  99         my $too_far = $base_line_starts[ $line+1 ];
 100
 101         my $lemma = $app->{rdg_0};
 102         my $seq = 1;
 103         # Is this the Nth occurrence of this reading in the line?
 104         if( $lemma =~ s/(_)?(\d)$// ) {
 105             $seq = $2;
 106         }
 107         my @lemma_words = split( /\s+/, $lemma );
 108
 109         # Now search for the lemma words within this line.
 110         my $lemma_start = $first_line_reading;
 111         my $lemma_end;
 112         my %seen;
 113         while( $lemma_start ne $too_far ) {
 114             # Loop detection
 115             if( $seen{ $lemma_start->name() } ) {
 116                 warn "Detected loop at " . $lemma_start->name() .
 117                     ", ref $line,$num";
 118                 last;
 119             }
 120             $seen{ $lemma_start->name() } = 1;
 121
 122             # Try to match the lemma.
 123             my $unmatch = 0;
 124             print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
 125                 $lemma_words[0] . "...\n"
 126                 if "$line.$num" eq $scrutinize;
 127             if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
 128                 # Skip it if we need a match that is not the first.
 129                 if( --$seq < 1 ) {
 130                     # Now we have to compare the rest of the words here.
 131                     if( scalar( @lemma_words ) > 1 ) {
 132                         my $next_reading =
 133                             $collation->next_reading( $lemma_start );
 134                         foreach my $w ( @lemma_words[1..$#lemma_words] ) {
 135                             printf STDERR "Now matching %s against %s\n",
 136                                     cmp_str($next_reading), $w
 137                                 if "$line.$num" eq $scrutinize;
 138                             if( $w ne cmp_str($next_reading) ) {
 139                                 $unmatch = 1;
 140                                 last;
 141                             } else {
 142                                 $lemma_end = $next_reading;
 143                                 $next_reading =
 144                                     $collation->next_reading( $lemma_end );
 145                             }
 146                         }
 147                     } else {
 148                         $lemma_end = $lemma_start;
 149                     }
 150                 } else {
 151                     $unmatch = 1;
 152                 }
 153             }
 154             last unless ( $unmatch || !defined( $lemma_end ) );
 155             $lemma_end = undef;
 156             $lemma_start = $collation->next_reading( $lemma_start );
 157         }
 158
 159         unless( $lemma_end ) {
 160             warn "No match found for @lemma_words at $line.$num";
 161             next;
 162         }
 163
 164         # Now we have found the lemma; we will record an 'edit', in
 165         # terms of a splice operation, for each subsequent reading.
 166         # We also note which witnesses take the given edit.
 167
 168         my @lemma_set = $collation->reading_sequence( $lemma_start,
 169                                                       $lemma_end );
 170         my @reading_sets = [ @lemma_set ];
 171
 172         # For each reading that is not rdg_0, we create the variant
 173         # reading nodes, and store the range as an edit operation on
 174         # the base text.
 175         my $variant_objects;
 176         my %pc_seen; # Keep track of mss with explicit post-corr data
 177         foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
 178             my @mss = grep { $app->{$_} eq $k } keys( %$app );
 179             push( @unwitnessed_lemma_nodes, @lemma_set )
 180                 if !@mss && $k eq 'rdg_0';
 181
 182             # Keep track of what witnesses we have seen.
 183             @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
 184             # Keep track of which witnesses bear corrected readings here.
 185             foreach my $m ( @mss ) {
 186                 my $base = _is_post_corr( $m );
 187                 next unless $base;
 188                 $pc_seen{$base} = 1;
 189             }
 190             next if $k eq 'rdg_0';
 191
 192             # TODO don't hardcode the reading split operation
 193             my @variant = split( /\s+/, $app->{$k} );
 194             @variant = () if $app->{$k} eq '/'; # This is an omission.
 195
 196             # Make the variant into a set of readings.
 197             my @variant_readings;
 198             my $ctr = 0;
 199             foreach my $vw ( @variant ) {
 200                 my $vwname = "$k/$line.$num.$ctr"; $ctr++;
 201                 my $vwreading = $collation->add_reading( $vwname );
 202                 $vwreading->text( $vw );
 203                 push( @variant_readings, $vwreading );
 204             }
 205
 206             $variant_objects->{$k} = { 'mss' => \@mss,
 207                                        'reading' => \@variant_readings,
 208             };
 209             push( @reading_sets, \@variant_readings );
 210         }
 211
 212         # Now collate and collapse the identical readings within the
 213         # collated sets.  Modifies the reading sets that were passed.
 214         collate_variants( $collation, @reading_sets );
 215
 216         # TODO Here would be a very good place to set up relationships
 217         # between the nodes and the lemma.
 218         set_relationships( $app, \@lemma_set, $variant_objects );
 219
 220         # Now create the splice-edit objects that will be used
 221         # to reconstruct each witness.
 222
 223         foreach my $rkey ( keys %$variant_objects ) {
 224             # Object is argument list for splice, so:
 225             # offset, length, replacements
 226             my $edit_object = [ $lemma_start->name,
 227                                 scalar( @lemma_set ),
 228                                 $variant_objects->{$rkey}->{reading} ];
 229             foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
 230                 # Is this a p.c. entry?
 231                 my $base = _is_post_corr( $ms );
 232                 if( $base ) { # this is a post-corr witness
 233                     my $pc_key = $base . "_post";
 234                     _add_hash_entry( $edits_required, $pc_key, $edit_object );
 235                 } else { # this is an ante-corr witness
 236                     my $pc_key = $ms . "_post";
 237                     _add_hash_entry( $edits_required, $ms, $edit_object );
 238                     unless( $pc_seen{$ms} ) {
 239                         # If this witness carries no correction, add this
 240                         # same object to its post-corrected state.
 241                         _add_hash_entry( $edits_required, $pc_key,
 242                                          $edit_object );
 243                     }
 244                 }
 245             }
 246         }
 247     } # Finished going through the apparatus entries
 248
 249     # Now make the witness objects, and create their text sequences
 250     foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) {
 251         print STDERR "Creating witness $w\n";
 252         my $witness_obj = $collation->tradition->add_witness( sigil => $w );
 253         my $debug = $w eq 'Vb10';
 254         my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w}, $debug );
 255         my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"}, $debug )
 256             if exists( $edits_required->{$w."_post"} );
 257
 258         my @repeated = _check_for_repeated( @ante_corr_seq );
 259         warn "Repeated elements @repeated in $w a.c."
 260             if @repeated;
 261         @repeated = _check_for_repeated( @post_corr_seq );
 262         warn "Repeated elements @repeated in $w p.c."
 263             if @repeated;
 264
 265         # Now save these paths in my witness object
 266         if( @post_corr_seq ) {
 267             $witness_obj->path( \@post_corr_seq );
 268             $witness_obj->uncorrected_path( \@ante_corr_seq );
 269         } else {
 270             $witness_obj->path( \@ante_corr_seq );
 271         }
 272     }
 273
 274     # Now remove our 'base text' edges, which is to say, the only
 275     # ones we have created so far.  Also remove any nodes that didn't
 276     # appear in any witnesses.
 277     foreach ( $collation->paths() ) {
 278         $collation->del_path( $_ );
 279     }
 280     foreach( @unwitnessed_lemma_nodes ) {
 281         $collation->del_reading( $_ );
 282     }
 283
 284     # Now walk paths and calculate positions.
 285     my @common_readings =
 286         $collation->make_witness_paths();
 287     $collation->calculate_positions( @common_readings );
 288 }
 289
 290 sub _check_for_repeated {
 291     my @seq = @_;
 292     my %unique;
 293     my @repeated;
 294     foreach ( @seq ) {
 295         if( exists $unique{$_->name} ) {
 296             push( @repeated, $_->name );
 297         } else {
 298             $unique{$_->name} = 1;
 299         }
 300     }
 301     return @repeated;
 302 }
 303
 304 =item B<read_base>
 305
 306 my @line_beginnings = read_base( 'reference.txt', $collation );
 307
 308 Takes a text file and a (presumed empty) collation object, adds the
 309 words as simple linear readings to the collation, and returns a
 310 list of readings that represent the beginning of lines. This collation
 311 is now the starting point for application of apparatus entries in
 312 merge_base, e.g. from a CSV file or a Classical Text Editor file.
 313
 314 =cut
 315
 316 sub read_base {
 317     my( $base_file, $collation ) = @_;
 318
 319     # This array gives the first reading for each line.  We put the
 320     # common starting point in line zero.
 321     my $last_reading = $collation->start();
 322     $base_text_index{$last_reading->name} = 0;
 323     my $lineref_array = [ $last_reading ]; # There is no line zero.
 324
 325     open( BASE, $base_file ) or die "Could not open file $base_file: $!";
 326     my $i = 1;
 327     while(<BASE>) {
 328         # Make the readings, and connect them up for the base, but
 329         # also save the first reading of each line in an array for the
 330         # purpose.
 331         # TODO use configurable reading separator
 332         chomp;
 333         my @words = split;
 334         my $started = 0;
 335         my $wordref = 0;
 336         my $lineref = scalar @$lineref_array;
 337         last if $SHORTEND && $lineref > $SHORTEND;
 338         foreach my $w ( @words ) {
 339             my $readingref = join( ',', $lineref, ++$wordref );
 340             my $reading = $collation->add_reading( $readingref );
 341             $reading->text( $w );
 342             unless( $started ) {
 343                 push( @$lineref_array, $reading );
 344                 $started = 1;
 345             }
 346             # Add edge paths in the graph, for easier tracking when
 347             # we start applying corrections.  These paths will be
 348             # removed when we're done.
 349             my $path = $collation->add_path( $last_reading, $reading,
 350                                              $collation->baselabel );
 351             $last_reading = $reading;
 352
 353             # Note an array index for the reading, for later correction splices.
 354             $base_text_index{$readingref} = $i++;
 355         }
 356     }
 357     close BASE;
 358     # Ending point for all texts
 359     my $endpoint = $collation->add_reading( '#END#' );
 360     $collation->add_path( $last_reading, $endpoint, $collation->baselabel );
 361     push( @$lineref_array, $endpoint );
 362     $base_text_index{$endpoint->name} = $i;
 363
 364     return( @$lineref_array );
 365 }
 366
 367 =item B<collate_variants>
 368
 369 collate_variants( $collation, @reading_ranges )
 370
 371 Given a set of readings in the form
 372 ( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
 373 walks through each to identify those readings that are identical.  The
 374 collation is a Text::Tradition::Collation object; the elements of
 375 @readings are Text::Tradition::Collation::Reading objects that appear
 376 on the collation graph.
 377
 378 TODO: Handle collapsed and non-collapsed transpositions.
 379
 380 =cut
 381
 382 sub collate_variants {
 383     my( $collation, @reading_sets ) = @_;
 384
 385     # Two different ways to do this, depending on whether we want
 386     # transposed reading nodes to be merged into one (producing a
 387     # nonlinear, bidirectional graph) or not (producing a relatively
 388     # linear, unidirectional graph.)
 389     return $collation->linear ? collate_linearly( @_ )
 390         : collate_nonlinearly( @_ );
 391 }
 392
 393 sub collate_linearly {
 394     my( $collation, $lemma_set, @variant_sets ) = @_;
 395
 396     my @unique;
 397     push( @unique, @$lemma_set );
 398     while( @variant_sets ) {
 399         my $variant_set = shift @variant_sets;
 400         # Use diff to do this job
 401         my $diff = Algorithm::Diff->new( \@unique, $variant_set,
 402                                          {'keyGen' => \&_collation_hash} );
 403         my @new_unique;
 404         my %merged;
 405         while( $diff->Next ) {
 406             if( $diff->Same ) {
 407                 # merge the nodes
 408                 my @l = $diff->Items( 1 );
 409                 my @v = $diff->Items( 2 );
 410                 foreach my $i ( 0 .. $#l ) {
 411                     if( !$merged{$l[$i]->name} ) {
 412                         $collation->merge_readings( $l[$i], $v[$i] );
 413                         $merged{$l[$i]->name} = 1;
 414                     } else {
 415                         print STDERR "Would have double merged " . $l[$i]->name . "\n";
 416                     }
 417                 }
 418                 # splice the lemma nodes into the variant set
 419                 my( $offset ) = $diff->Get( 'min2' );
 420                 splice( @$variant_set, $offset, scalar( @l ), @l );
 421                 push( @new_unique, @l );
 422             } else {
 423                 # Keep the old unique readings
 424                 push( @new_unique, $diff->Items( 1 ) ) if $diff->Items( 1 );
 425                 # Add the new readings to the 'unique' list
 426                 push( @new_unique, $diff->Items( 2 ) ) if $diff->Items( 2 );
 427             }
 428         }
 429         @unique = @new_unique;
 430     }
 431 }
 432
 433 sub collate_nonlinearly {
 434     my( $collation, $lemma_set, @variant_sets ) = @_;
 435
 436     my @unique;
 437     push( @unique, @$lemma_set );
 438     while( @variant_sets ) {
 439         my $variant_set = shift @variant_sets;
 440         # Simply match the first reading that carries the same word, so
 441         # long as that reading has not yet been used to match another
 442         # word in this variant. That way lies loopy madness.
 443         my @distinct;
 444         my %merged;
 445         foreach my $idx ( 0 .. $#{$variant_set} ) {
 446             my $vw = $variant_set->[$idx];
 447             my @same = grep { cmp_str( $_ ) eq $vw->label } @unique;
 448             my $matched;
 449             if( @same ) {
 450                 foreach my $i ( 0 .. $#same ) {
 451                     unless( $merged{$same[$i]->name} ) {
 452                         print STDERR sprintf( "Merging %s into %s\n",
 453                                               $vw->name,
 454                                               $same[$i]->name );
 455                         $collation->merge_readings( $same[$i], $vw );
 456                         $merged{$same[$i]->name} = 1;
 457                         $matched = $i;
 458                         $variant_set->[$idx] = $same[$i];
 459                     }
 460                 }
 461             }
 462             unless( @same && defined($matched) ) {
 463                 push( @distinct, $vw );
 464             }
 465         }
 466         push( @unique, @distinct );
 467     }
 468 }
 469
 470
 471
 472 sub _collation_hash {
 473     my $node = shift;
 474     return cmp_str( $node );
 475 }
 476
 477 sub set_relationships {
 478     my( $app, $lemma, $variants ) = @_;
 479     foreach my $rkey ( keys %$variants ) {
 480         my $var = $variants->{$rkey}->{'reading'};
 481         my $typekey = sprintf( "_%s_type", $rkey );
 482         my $type = $app->{$typekey};
 483
 484         # Transposition: look for nodes with the same label but different IDs
 485         # and mark them as transposed-identical.
 486
 487         # Lexical / Grammatical / Spelling: look for non-identical nodes.
 488         # Need to work out how to handle many-to-many mapping.
 489     }
 490 }
 491
 492
 493
 494 sub apply_edits {
 495     my( $collation, $edit_sequence, $debug ) = @_;
 496     my @lemma_text = $collation->reading_sequence( $collation->start,
 497                                            $collation->reading( '#END#' ) );
 498     my $drift = 0;
 499     foreach my $correction ( @$edit_sequence ) {
 500         my( $lemma_start, $length, $items ) = @$correction;
 501         my $offset = $base_text_index{$lemma_start};
 502         my $realoffset = $offset + $drift;
 503         if( $debug ||
 504             $lemma_text[$realoffset]->name ne $lemma_start ) {
 505             my @this_phrase = @lemma_text[$realoffset..$realoffset+$length-1];
 506             my @base_phrase;
 507             my $i = $realoffset;
 508             my $l = $collation->reading( $lemma_start );
 509             while( $i < $realoffset+$length ) {
 510                 push( @base_phrase, $l );
 511                 $l = $collation->next_reading( $l );
 512                 $i++;
 513             }
 514
 515             print STDERR sprintf( "Trying to replace %s (%s) starting at %d " .
 516                                   "with %s (%s) with drift %d\n",
 517                                   join( ' ', map {$_->label} @base_phrase ),
 518                                   join( ' ', map {$_->name} @base_phrase ),
 519                                   $realoffset,
 520                                   join( ' ', map {$_->label} @$items ),
 521                                   join( ' ', map {$_->name} @$items ),
 522                                   $drift,
 523                                   ) if $debug;
 524
 525             warn( sprintf( "Should be replacing %s (%s) with %s (%s) " .
 526                            "but %s (%s) is there instead",
 527                            join( ' ', map {$_->label} @base_phrase ),
 528                            join( ' ', map {$_->name} @base_phrase ),
 529                            join( ' ', map {$_->label} @$items ),
 530                            join( ' ', map {$_->name} @$items ),
 531                            join( ' ', map {$_->label} @this_phrase ),
 532                            join( ' ', map {$_->name} @this_phrase ),
 533                            ) )
 534                 if $lemma_text[$realoffset]->name ne $lemma_start;
 535         }
 536         splice( @lemma_text, $realoffset, $length, @$items );
 537         $drift += @$items - $length;
 538     }
 539     return @lemma_text;
 540 }
 541
 542
 543 # Helper function. Given a witness sigil, if it is a post-correctione
 544 # sigil,return the base witness.  If not, return a false value.
 545 sub _is_post_corr {
 546     my( $sigil ) = @_;
 547     if( $sigil =~ /^(.*?)(\s*\(?p\.\s*c\.\)?)$/ ) {
 548         return $1;
 549     }
 550     return undef;
 551 }
 552
 553 sub _add_hash_entry {
 554     my( $hash, $key, $entry ) = @_;
 555     if( exists $hash->{$key} ) {
 556         push( @{$hash->{$key}}, $entry );
 557     } else {
 558         $hash->{$key} = [ $entry ];
 559     }
 560 }
 561
 562
 563 =item B<cmp_str>
 564
 565 Pretend you never saw this method.  Really it needs to not be hardcoded.
 566
 567 =cut
 568
 569 sub cmp_str {
 570     my( $reading ) = @_;
 571     my $word = $reading->label();
 572     $word = lc( $word );
 573     $word =~ s/\W//g;
 574     $word =~ s/v/u/g;
 575     $word =~ s/j/i/g;
 576     $word =~ s/cha/ca/g;
 577     $word =~ s/quatuor/quattuor/g;
 578     $word =~ s/ioannes/iohannes/g;
 579     return $word;
 580 }
 581
 582 =back
 583
 584 =head1 LICENSE
 585
 586 This package is free software and is provided "as is" without express
 587 or implied warranty.  You can redistribute it and/or modify it under
 588 the same terms as Perl itself.
 589
 590 =head1 AUTHOR
 591
 592 Tara L Andrews, aurum@cpan.org
 593
 594 =cut
 595
 596 1;