lib/Text/Tradition/Parser/BaseText.pm

   1 package Text::Tradition::Parser::BaseText;
   2
   3 use strict;
   4 use warnings;
   5 use Module::Load;
   6 use Algorithm::Diff;
   7
   8 =head1 NAME
   9
  10 Text::Tradition::Parser::BaseText
  11
  12 =head1 SYNOPSIS
  13
  14 use Text::Tradition::Parser::BaseText qw( merge_base );
  15 merge_base( $graph, 'reference.txt', @apparatus_entries )
  16
  17 =head1 DESCRIPTION
  18
  19 For an overview of the package, see the documentation for the
  20 Text::Tradition::Graph module.
  21
  22 This module is meant for use with certain of the other Parser classes
  23 - whenever a list of variants is given with reference to a base text,
  24 these must be joined into a single collation.  The parser should
  25 therefore make a list of variants and their locations, and BaseText
  26 will join those listed variants onto the reference text.
  27
  28 =head1 SUBROUTINES
  29
  30 =over
  31
  32 =item B<parse>
  33
  34 parse( $graph, %opts );
  35
  36 Takes an initialized graph and a set of options, which must include:
  37 - 'base' - the base text referenced by the variants
  38 - 'format' - the format of the variant list
  39 - 'data' - the variants, in the given format.
  40
  41 =cut
  42
  43 sub parse {
  44     my( $tradition, %opts ) = @_;
  45
  46     my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'};
  47     load( $format_mod );
  48     my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} );
  49     merge_base( $tradition->collation, $opts{'base'}, @apparatus_entries );
  50 }
  51
  52 =item B<merge_base>
  53
  54 merge_base( $graph, 'reference.txt', @apparatus_entries )
  55
  56 Takes three arguments: a newly-initialized Text::Tradition::Graph
  57 object, a text file containing the reference text, and a list of
  58 variants (apparatus entries).  Adds the base text to the graph, and
  59 joins the variants to that.
  60
  61 The list of variants is an array of hash references; each hash takes
  62 the form
  63  { '_id' => line reference,
  64    'rdg_0' => lemma reading,
  65    'rdg_1' => first variant,
  66    ...  # and so on until all distinct readings are listed
  67    'WitnessA' => 'rdg_0',
  68    'WitnessB' => 'rdg_1',
  69    ...  # and so on until all witnesses are listed with their readings
  70  }
  71
  72 Any hash key that is not of the form /^rdg_\d+$/ and that does not
  73 begin with an underscore is assumed to be a witness name.  Any 'meta'
  74 information to be passed must be passed in a key with a leading
  75 underscore in its name.
  76
  77 =cut
  78
  79 my $SHORTEND = ''; # Debug var - set this to limit the number of lines parsed
  80
  81 my %base_text_index;
  82 my $edits_required = {};
  83
  84 # edits_required -> wit -> [ { start_idx, end_idx, items } ]
  85
  86 sub merge_base {
  87     my( $collation, $base_file, @app_entries ) = @_;
  88     my @base_line_starts = read_base( $base_file, $collation );
  89
  90     my %all_witnesses;
  91     my @unwitnessed_lemma_nodes;
  92     foreach my $app ( @app_entries ) {
  93         my( $line, $num ) = split( /\./, $app->{_id} );
  94         # DEBUG with a short graph
  95         last if $SHORTEND && $line > $SHORTEND;
  96         # DEBUG for problematic entries
  97         my $scrutinize = '';
  98         my $first_line_reading = $base_line_starts[ $line ];
  99         my $too_far = $base_line_starts[ $line+1 ];
 100
 101         my $lemma = $app->{rdg_0};
 102         my $seq = 1;
 103         # Is this the Nth occurrence of this reading in the line?
 104         if( $lemma =~ s/(_)?(\d)$// ) {
 105             $seq = $2;
 106         }
 107         my @lemma_words = split( /\s+/, $lemma );
 108
 109         # Now search for the lemma words within this line.
 110         my $lemma_start = $first_line_reading;
 111         my $lemma_end;
 112         my %seen;
 113         while( $lemma_start ne $too_far ) {
 114             # Loop detection
 115             if( $seen{ $lemma_start->name() } ) {
 116                 warn "Detected loop at " . $lemma_start->name() .
 117                     ", ref $line,$num";
 118                 last;
 119             }
 120             $seen{ $lemma_start->name() } = 1;
 121
 122             # Try to match the lemma.
 123             my $unmatch = 0;
 124             print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
 125                 $lemma_words[0] . "...\n"
 126                 if "$line.$num" eq $scrutinize;
 127             if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
 128                 # Skip it if we need a match that is not the first.
 129                 if( --$seq < 1 ) {
 130                     # Now we have to compare the rest of the words here.
 131                     if( scalar( @lemma_words ) > 1 ) {
 132                         my $next_reading =
 133                             $collation->next_reading( $lemma_start );
 134                         foreach my $w ( @lemma_words[1..$#lemma_words] ) {
 135                             printf STDERR "Now matching %s against %s\n",
 136                                     cmp_str($next_reading), $w
 137                                 if "$line.$num" eq $scrutinize;
 138                             if( $w ne cmp_str($next_reading) ) {
 139                                 $unmatch = 1;
 140                                 last;
 141                             } else {
 142                                 $lemma_end = $next_reading;
 143                                 $next_reading =
 144                                     $collation->next_reading( $lemma_end );
 145                             }
 146                         }
 147                     } else {
 148                         $lemma_end = $lemma_start;
 149                     }
 150                 } else {
 151                     $unmatch = 1;
 152                 }
 153             }
 154             last unless ( $unmatch || !defined( $lemma_end ) );
 155             $lemma_end = undef;
 156             $lemma_start = $collation->next_reading( $lemma_start );
 157         }
 158
 159         unless( $lemma_end ) {
 160             warn "No match found for @lemma_words at $line.$num";
 161             next;
 162         }
 163
 164         # Now we have found the lemma; we will record an 'edit', in
 165         # terms of a splice operation, for each subsequent reading.
 166         # We also note which witnesses take the given edit.
 167
 168         my @lemma_set = $collation->reading_sequence( $lemma_start,
 169                                                       $lemma_end );
 170         my @reading_sets = [ @lemma_set ];
 171
 172         # For each reading that is not rdg_0, we create the variant
 173         # reading nodes, and store the range as an edit operation on
 174         # the base text.
 175         my $variant_objects;
 176         my %pc_seen; # Keep track of mss with explicit post-corr data
 177         foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
 178             my @mss = grep { $app->{$_} eq $k } keys( %$app );
 179
 180             # Keep track of lemma nodes that don't actually appear in
 181             # any MSS; we will want to remove them from the collation.
 182             push( @unwitnessed_lemma_nodes, @lemma_set )
 183                 if !@mss && $k eq 'rdg_0';
 184
 185             # Keep track of what witnesses we have seen.
 186             @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
 187             # Keep track of which witnesses bear corrected readings here.
 188             foreach my $m ( @mss ) {
 189                 my $base = _is_post_corr( $m );
 190                 next unless $base;
 191                 $pc_seen{$base} = 1;
 192             }
 193             next if $k eq 'rdg_0';
 194
 195             # Parse the variant into reading tokens.
 196             # TODO don't hardcode the reading split operation
 197             my @variant = split( /\s+/, $app->{$k} );
 198             @variant = () if $app->{$k} eq '/'; # This is an omission.
 199
 200             my @variant_readings;
 201             my $ctr = 0;
 202             foreach my $vw ( @variant ) {
 203                 my $vwname = "$k/$line.$num.$ctr"; $ctr++;
 204                 my $vwreading = $collation->add_reading( $vwname );
 205                 $vwreading->text( $vw );
 206                 push( @variant_readings, $vwreading );
 207             }
 208
 209             $variant_objects->{$k} = { 'mss' => \@mss,
 210                                        'reading' => \@variant_readings,
 211             };
 212             push( @reading_sets, \@variant_readings );
 213         }
 214
 215         # Now collate and collapse the identical readings within the
 216         # collated sets.  Modifies the reading sets that were passed.
 217         collate_variants( $collation, @reading_sets );
 218
 219         # Record any stated relationships between the nodes and the lemma.
 220         set_relationships( $collation, $app, \@lemma_set, $variant_objects );
 221
 222         # Now create the splice-edit objects that will be used
 223         # to reconstruct each witness.
 224
 225         foreach my $rkey ( keys %$variant_objects ) {
 226             # Object is argument list for splice, so:
 227             # offset, length, replacements
 228             my $edit_object = [ $lemma_start->name,
 229                                 scalar( @lemma_set ),
 230                                 $variant_objects->{$rkey}->{reading} ];
 231             foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
 232                 # Is this a p.c. entry?
 233                 my $base = _is_post_corr( $ms );
 234                 if( $base ) { # this is a post-corr witness
 235                     my $pc_key = $base . "_post";
 236                     _add_hash_entry( $edits_required, $pc_key, $edit_object );
 237                 } else { # this is an ante-corr witness
 238                     my $pc_key = $ms . "_post";
 239                     _add_hash_entry( $edits_required, $ms, $edit_object );
 240                     unless( $pc_seen{$ms} ) {
 241                         # If this witness carries no correction, add this
 242                         # same object to its post-corrected state.
 243                         _add_hash_entry( $edits_required, $pc_key,
 244                                          $edit_object );
 245                     }
 246                 }
 247             }
 248         }
 249     } # Finished going through the apparatus entries
 250
 251     # Now make the witness objects, and create their text sequences
 252     foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) {
 253         print STDERR "Creating witness $w\n";
 254         my $witness_obj = $collation->tradition->add_witness( sigil => $w );
 255         my $debug; #  = $w eq 'Vb11';
 256         my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w}, $debug );
 257         my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"}, $debug )
 258             if exists( $edits_required->{$w."_post"} );
 259
 260         my @repeated = _check_for_repeated( @ante_corr_seq );
 261         warn "Repeated elements @repeated in $w a.c."
 262             if @repeated;
 263         @repeated = _check_for_repeated( @post_corr_seq );
 264         warn "Repeated elements @repeated in $w p.c."
 265             if @repeated;
 266
 267         # Now save these paths in my witness object
 268         if( @post_corr_seq ) {
 269             $witness_obj->path( \@post_corr_seq );
 270             $witness_obj->uncorrected_path( \@ante_corr_seq );
 271         } else {
 272             $witness_obj->path( \@ante_corr_seq );
 273         }
 274     }
 275
 276     # Now remove our 'base text' edges, which is to say, the only
 277     # ones we have created so far.  Also remove any unwitnessed
 278     # lemma nodes (TODO unless we are treating base as witness)
 279     foreach ( $collation->paths() ) {
 280         $collation->del_path( $_ );
 281     }
 282     foreach( @unwitnessed_lemma_nodes ) {
 283         $collation->del_reading( $_ );
 284     }
 285
 286     ### HACKY HACKY Do some one-off path corrections here.
 287     if( $collation->linear ) {
 288         my $c = $collation;
 289         my $end = $SHORTEND ? $SHORTEND : 155;
 290         my $path = $c->tradition->witness('Vb11')->path;
 291         if( $end > 16 ) {
 292             $c->merge_readings( $c->reading('rdg_1/16.3.0'), $c->reading('rdg_1/16.2.1') );
 293             splice( @$path, 209, 2, $c->reading( 'rdg_1/16.3.0' ), $c->reading( 'rdg_1/16.2.2' ) );
 294         }
 295         # What else?
 296     } else {
 297         my $c = $collation;
 298         my $end = $SHORTEND ? $SHORTEND : 155;
 299         # Vb5:
 300         my $path = $c->tradition->witness('Vb5')->path;
 301         splice( @$path, 1436, 0, $c->reading('106,14') ) if $end > 106;
 302         # Vb11:
 303         $path = $c->tradition->witness('Vb11')->path;
 304         if( $end > 16 ) {
 305             $c->merge_readings( $c->reading('rdg_1/16.3.0'), $c->reading('rdg_1/16.2.1') );
 306             splice( @$path, 209, 2, $c->reading( 'rdg_1/16.3.0' ), $c->reading( '16,1' ) );
 307         }
 308         # Vb12 a.c.:
 309         $path = $c->tradition->witness('Vb12')->uncorrected_path;
 310         splice( @$path, 1828, 1, $c->reading('rdg_2/137.5.0') ) if $end > 137;
 311         # Vb13:
 312         $path = $c->tradition->witness('Vb13')->path;
 313         splice( @$path, 782, 0, $c->reading( '58,5' ) ) if $end > 58;
 314         # Vb20 a.c.:
 315         $path = $c->tradition->witness('Vb20')->uncorrected_path;
 316         splice( @$path, 1251, 1, $c->reading( '94,6' ) ) if $end > 94;
 317         # Vb26:
 318         $path = $c->tradition->witness('Vb26')->path;
 319         splice( @$path, 618, 0, $c->reading('46,2') ) if $end > 46;
 320     }
 321
 322     # Now walk paths and calculate positions.
 323     my @common_readings =
 324         $collation->make_witness_paths();
 325     $collation->calculate_positions( @common_readings );
 326 }
 327
 328 sub _check_for_repeated {
 329     my @seq = @_;
 330     my %unique;
 331     my @repeated;
 332     foreach ( @seq ) {
 333         if( exists $unique{$_->name} ) {
 334             push( @repeated, $_->name );
 335         } else {
 336             $unique{$_->name} = 1;
 337         }
 338     }
 339     return @repeated;
 340 }
 341
 342 =item B<read_base>
 343
 344 my @line_beginnings = read_base( 'reference.txt', $collation );
 345
 346 Takes a text file and a (presumed empty) collation object, adds the
 347 words as simple linear readings to the collation, and returns a
 348 list of readings that represent the beginning of lines. This collation
 349 is now the starting point for application of apparatus entries in
 350 merge_base, e.g. from a CSV file or a Classical Text Editor file.
 351
 352 =cut
 353
 354 sub read_base {
 355     my( $base_file, $collation ) = @_;
 356
 357     # This array gives the first reading for each line.  We put the
 358     # common starting point in line zero.
 359     my $last_reading = $collation->start();
 360     $base_text_index{$last_reading->name} = 0;
 361     my $lineref_array = [ $last_reading ]; # There is no line zero.
 362
 363     open( BASE, $base_file ) or die "Could not open file $base_file: $!";
 364     my $i = 1;
 365     while(<BASE>) {
 366         # Make the readings, and connect them up for the base, but
 367         # also save the first reading of each line in an array for the
 368         # purpose.
 369         # TODO use configurable reading separator
 370         chomp;
 371         my @words = split;
 372         my $started = 0;
 373         my $wordref = 0;
 374         my $lineref = scalar @$lineref_array;
 375         last if $SHORTEND && $lineref > $SHORTEND;
 376         foreach my $w ( @words ) {
 377             my $readingref = join( ',', $lineref, ++$wordref );
 378             my $reading = $collation->add_reading( $readingref );
 379             $reading->text( $w );
 380             unless( $started ) {
 381                 push( @$lineref_array, $reading );
 382                 $started = 1;
 383             }
 384             # Add edge paths in the graph, for easier tracking when
 385             # we start applying corrections.  These paths will be
 386             # removed when we're done.
 387             my $path = $collation->add_path( $last_reading, $reading,
 388                                              $collation->baselabel );
 389             $last_reading = $reading;
 390
 391             # Note an array index for the reading, for later correction splices.
 392             $base_text_index{$readingref} = $i++;
 393         }
 394     }
 395     close BASE;
 396     # Ending point for all texts
 397     my $endpoint = $collation->add_reading( '#END#' );
 398     $collation->add_path( $last_reading, $endpoint, $collation->baselabel );
 399     push( @$lineref_array, $endpoint );
 400     $base_text_index{$endpoint->name} = $i;
 401
 402     return( @$lineref_array );
 403 }
 404
 405 =item B<collate_variants>
 406
 407 collate_variants( $collation, @reading_ranges )
 408
 409 Given a set of readings in the form
 410 ( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
 411 walks through each to identify those readings that are identical.  The
 412 collation is a Text::Tradition::Collation object; the elements of
 413 @readings are Text::Tradition::Collation::Reading objects that appear
 414 on the collation graph.
 415
 416 TODO: Handle collapsed and non-collapsed transpositions.
 417
 418 =cut
 419
 420 sub collate_variants {
 421     my( $collation, @reading_sets ) = @_;
 422
 423     # Two different ways to do this, depending on whether we want
 424     # transposed reading nodes to be merged into one (producing a
 425     # nonlinear, bidirectional graph) or not (producing a relatively
 426     # linear, unidirectional graph.)
 427     return $collation->linear ? collate_linearly( @_ )
 428         : collate_nonlinearly( @_ );
 429 }
 430
 431 sub collate_linearly {
 432     my( $collation, $lemma_set, @variant_sets ) = @_;
 433
 434     my @unique;
 435     push( @unique, @$lemma_set );
 436     while( @variant_sets ) {
 437         my $variant_set = shift @variant_sets;
 438         # Use diff to do this job
 439         my $diff = Algorithm::Diff->new( \@unique, $variant_set,
 440                                          {'keyGen' => \&_collation_hash} );
 441         my @new_unique;
 442         my %merged;
 443         while( $diff->Next ) {
 444             if( $diff->Same ) {
 445                 # merge the nodes
 446                 my @l = $diff->Items( 1 );
 447                 my @v = $diff->Items( 2 );
 448                 foreach my $i ( 0 .. $#l ) {
 449                     if( !$merged{$l[$i]->name} ) {
 450                         $collation->merge_readings( $l[$i], $v[$i] );
 451                         $merged{$l[$i]->name} = 1;
 452                     } else {
 453                         print STDERR "Would have double merged " . $l[$i]->name . "\n";
 454                     }
 455                 }
 456                 # splice the lemma nodes into the variant set
 457                 my( $offset ) = $diff->Get( 'min2' );
 458                 splice( @$variant_set, $offset, scalar( @l ), @l );
 459                 push( @new_unique, @l );
 460             } else {
 461                 # Keep the old unique readings
 462                 push( @new_unique, $diff->Items( 1 ) ) if $diff->Items( 1 );
 463                 # Add the new readings to the 'unique' list
 464                 push( @new_unique, $diff->Items( 2 ) ) if $diff->Items( 2 );
 465             }
 466         }
 467         @unique = @new_unique;
 468     }
 469 }
 470
 471 sub collate_nonlinearly {
 472     my( $collation, $lemma_set, @variant_sets ) = @_;
 473
 474     my @unique;
 475     push( @unique, @$lemma_set );
 476     while( @variant_sets ) {
 477         my $variant_set = shift @variant_sets;
 478         # Simply match the first reading that carries the same word, so
 479         # long as that reading has not yet been used to match another
 480         # word in this variant. That way lies loopy madness.
 481         my @distinct;
 482         my %merged;
 483         foreach my $idx ( 0 .. $#{$variant_set} ) {
 484             my $vw = $variant_set->[$idx];
 485             my @same = grep { cmp_str( $_ ) eq $vw->label } @unique;
 486             my $matched;
 487             if( @same ) {
 488                 foreach my $i ( 0 .. $#same ) {
 489                     unless( $merged{$same[$i]->name} ) {
 490                         #print STDERR sprintf( "Merging %s into %s\n",
 491                         #                     $vw->name,
 492                         #                     $same[$i]->name );
 493                         $collation->merge_readings( $same[$i], $vw );
 494                         $merged{$same[$i]->name} = 1;
 495                         $matched = $i;
 496                         $variant_set->[$idx] = $same[$i];
 497                     }
 498                 }
 499             }
 500             unless( @same && defined($matched) ) {
 501                 push( @distinct, $vw );
 502             }
 503         }
 504         push( @unique, @distinct );
 505     }
 506 }
 507
 508
 509
 510 sub _collation_hash {
 511     my $node = shift;
 512     return cmp_str( $node );
 513 }
 514
 515 sub set_relationships {
 516     my( $collation, $app, $lemma, $variants ) = @_;
 517     foreach my $rkey ( keys %$variants ) {
 518         my $var = $variants->{$rkey}->{'reading'};
 519         my $type = $app->{sprintf( "_%s_type", $rkey )};
 520         my $noncorr = $app->{sprintf( "_%s_non_corr", $rkey )};
 521         my $nonindep = $app->{sprintf( "_%s_non_indep", $rkey )};
 522
 523         my %rel_options = ();
 524         $rel_options{'non_correctable'} = $noncorr if $noncorr && $noncorr =~ /^\d$/;
 525         $rel_options{'non_indep'} = $nonindep if $nonindep && $nonindep =~ /^\d$/;
 526
 527         if( $type =~ /^(inv|tr|rep)$/i ) {
 528             # Transposition or repetition: look for nodes with the
 529             # same label but different IDs and mark them.
 530             $type = 'repetition' if $type =~ /^rep/i;
 531             $rel_options{'type'} = $type;
 532             my %labels;
 533             foreach my $r ( @$lemma ) {
 534                 $labels{cmp_str( $r )} = $r;
 535             }
 536             foreach my $r( @$var ) {
 537                 if( exists $labels{$r->label} &&
 538                     $r->name ne $labels{$r->label}->name ) {
 539                     if( $type eq 'repetition' ) {
 540                         # Repetition
 541                         $collation->add_relationship( $r, $labels{$r->label}, \%rel_options );
 542                     } else {
 543                         # Transposition
 544                         $r->set_identical( $labels{$r->label} );
 545                     }
 546                 }
 547             }
 548         } elsif( $type =~ /^(gr|lex|sp(el)?)$/i ) {
 549
 550             # Grammar/spelling/lexical: this can be a one-to-one or
 551             # one-to-many mapping.  We should think about merging
 552             # readings if it is one-to-many.
 553
 554             $type = 'grammatical' if $type =~ /gr/i;
 555             $type = 'spelling' if $type =~ /sp/i;
 556             $type = 'repetition' if $type =~ /rep/i;
 557             $type = 'lexical' if $type =~ /lex/i;
 558             $rel_options{'type'} = $type;
 559             if( @$lemma == @$var ) {
 560                 foreach my $i ( 0 .. $#{$lemma} ) {
 561                     $collation->add_relationship( $var->[$i], $lemma->[$i],
 562                         \%rel_options );
 563                 }
 564             } else {
 565                 # An uneven many-to-many mapping.  Make a segment out of
 566                 # whatever we have.
 567                 my $lemseg = @$lemma > 1 ? $collation->add_segment( @$lemma ) : $lemma->[0];
 568                 my $varseg = @$var > 1 ? $collation->add_segment( @$var ) : $var->[0];
 569                 $collation->add_relationship( $varseg, $lemseg, \%rel_options );
 570             }
 571         } elsif( $type !~ /^(add|om)$/i ) {
 572             warn "Unrecognized type $type";
 573         }
 574     }
 575 }
 576
 577
 578
 579 sub apply_edits {
 580     my( $collation, $edit_sequence, $debug ) = @_;
 581     my @lemma_text = $collation->reading_sequence( $collation->start,
 582                                            $collation->reading( '#END#' ) );
 583     my $drift = 0;
 584     foreach my $correction ( @$edit_sequence ) {
 585         my( $lemma_start, $length, $items ) = @$correction;
 586         my $offset = $base_text_index{$lemma_start};
 587         my $realoffset = $offset + $drift;
 588         if( $debug ||
 589             $lemma_text[$realoffset]->name ne $lemma_start ) {
 590             my @this_phrase = @lemma_text[$realoffset..$realoffset+$length-1];
 591             my @base_phrase;
 592             my $i = $realoffset;
 593             my $l = $collation->reading( $lemma_start );
 594             while( $i < $realoffset+$length ) {
 595                 push( @base_phrase, $l );
 596                 $l = $collation->next_reading( $l );
 597                 $i++;
 598             }
 599
 600             print STDERR sprintf( "Trying to replace %s (%s) starting at %d " .
 601                                   "with %s (%s) with drift %d\n",
 602                                   join( ' ', map {$_->label} @base_phrase ),
 603                                   join( ' ', map {$_->name} @base_phrase ),
 604                                   $realoffset,
 605                                   join( ' ', map {$_->label} @$items ),
 606                                   join( ' ', map {$_->name} @$items ),
 607                                   $drift,
 608                                   ) if $debug;
 609
 610             if( $lemma_text[$realoffset]->name ne $lemma_start ) {
 611                 warn( sprintf( "Should be replacing %s (%s) with %s (%s) " .
 612                                "but %s (%s) is there instead",
 613                                join( ' ', map {$_->label} @base_phrase ),
 614                                join( ' ', map {$_->name} @base_phrase ),
 615                                join( ' ', map {$_->label} @$items ),
 616                                join( ' ', map {$_->name} @$items ),
 617                                join( ' ', map {$_->label} @this_phrase ),
 618                                join( ' ', map {$_->name} @this_phrase ),
 619                       ) );
 620                 # next;
 621             }
 622         }
 623         splice( @lemma_text, $realoffset, $length, @$items );
 624         $drift += @$items - $length;
 625     }
 626     return @lemma_text;
 627 }
 628
 629
 630 # Helper function. Given a witness sigil, if it is a post-correctione
 631 # sigil,return the base witness.  If not, return a false value.
 632 sub _is_post_corr {
 633     my( $sigil ) = @_;
 634     if( $sigil =~ /^(.*?)(\s*\(?p\.\s*c\.\)?)$/ ) {
 635         return $1;
 636     }
 637     return undef;
 638 }
 639
 640 sub _add_hash_entry {
 641     my( $hash, $key, $entry ) = @_;
 642     if( exists $hash->{$key} ) {
 643         push( @{$hash->{$key}}, $entry );
 644     } else {
 645         $hash->{$key} = [ $entry ];
 646     }
 647 }
 648
 649
 650 =item B<cmp_str>
 651
 652 Pretend you never saw this method.  Really it needs to not be hardcoded.
 653
 654 =cut
 655
 656 sub cmp_str {
 657     my( $reading ) = @_;
 658     my $word = $reading->label();
 659     $word = lc( $word );
 660     $word =~ s/\W//g;
 661     $word =~ s/v/u/g;
 662     $word =~ s/j/i/g;
 663     $word =~ s/cha/ca/g;
 664     $word =~ s/quatuor/quattuor/g;
 665     $word =~ s/ioannes/iohannes/g;
 666     return $word;
 667 }
 668
 669 =back
 670
 671 =head1 LICENSE
 672
 673 This package is free software and is provided "as is" without express
 674 or implied warranty.  You can redistribute it and/or modify it under
 675 the same terms as Perl itself.
 676
 677 =head1 AUTHOR
 678
 679 Tara L Andrews, aurum@cpan.org
 680
 681 =cut
 682
 683 1;