lib/Text/Tradition/Parser/BaseText.pm

   1 package Text::Tradition::Parser::BaseText;
   2
   3 use strict;
   4 use warnings;
   5 use Module::Load;
   6 use Algorithm::Diff;
   7
   8 =head1 NAME
   9
  10 Text::Tradition::Parser::BaseText
  11
  12 =head1 SYNOPSIS
  13
  14 use Text::Tradition::Parser::BaseText qw( merge_base );
  15 merge_base( $graph, 'reference.txt', @apparatus_entries )
  16
  17 =head1 DESCRIPTION
  18
  19 For an overview of the package, see the documentation for the
  20 Text::Tradition::Graph module.
  21
  22 This module is meant for use with certain of the other Parser classes
  23 - whenever a list of variants is given with reference to a base text,
  24 these must be joined into a single collation.  The parser should
  25 therefore make a list of variants and their locations, and BaseText
  26 will join those listed variants onto the reference text.
  27
  28 =head1 SUBROUTINES
  29
  30 =over
  31
  32 =item B<parse>
  33
  34 parse( $graph, %opts );
  35
  36 Takes an initialized graph and a set of options, which must include:
  37 - 'base' - the base text referenced by the variants
  38 - 'format' - the format of the variant list
  39 - 'data' - the variants, in the given format.
  40
  41 =cut
  42
  43 sub parse {
  44     my( $tradition, %opts ) = @_;
  45
  46     my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'};
  47     load( $format_mod );
  48     my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} );
  49     merge_base( $tradition->collation, $opts{'base'}, @apparatus_entries );
  50 }
  51
  52 =item B<merge_base>
  53
  54 merge_base( $graph, 'reference.txt', @apparatus_entries )
  55
  56 Takes three arguments: a newly-initialized Text::Tradition::Graph
  57 object, a text file containing the reference text, and a list of
  58 variants (apparatus entries).  Adds the base text to the graph, and
  59 joins the variants to that.
  60
  61 The list of variants is an array of hash references; each hash takes
  62 the form
  63  { '_id' => line reference,
  64    'rdg_0' => lemma reading,
  65    'rdg_1' => first variant,
  66    ...  # and so on until all distinct readings are listed
  67    'WitnessA' => 'rdg_0',
  68    'WitnessB' => 'rdg_1',
  69    ...  # and so on until all witnesses are listed with their readings
  70  }
  71
  72 Any hash key that is not of the form /^rdg_\d+$/ and that does not
  73 begin with an underscore is assumed to be a witness name.  Any 'meta'
  74 information to be passed must be passed in a key with a leading
  75 underscore in its name.
  76
  77 =cut
  78
  79 my $SHORTEND; # Debug var - set this to limit the number of lines parsed
  80
  81 my %base_text_index;
  82 my $edits_required = {};
  83
  84 # edits_required -> wit -> [ { start_idx, end_idx, items } ]
  85
  86 sub merge_base {
  87     my( $collation, $base_file, @app_entries ) = @_;
  88     my @base_line_starts = read_base( $base_file, $collation );
  89
  90     my %all_witnesses;
  91     my @unwitnessed_lemma_nodes;
  92     foreach my $app ( @app_entries ) {
  93         my( $line, $num ) = split( /\./, $app->{_id} );
  94         # DEBUG with a short graph
  95         last if $SHORTEND && $line > $SHORTEND;
  96         # DEBUG for problematic entries
  97         my $scrutinize = '';
  98         my $first_line_reading = $base_line_starts[ $line ];
  99         my $too_far = $base_line_starts[ $line+1 ];
 100
 101         my $lemma = $app->{rdg_0};
 102         my $seq = 1;
 103         # Is this the Nth occurrence of this reading in the line?
 104         if( $lemma =~ s/(_)?(\d)$// ) {
 105             $seq = $2;
 106         }
 107         my @lemma_words = split( /\s+/, $lemma );
 108
 109         # Now search for the lemma words within this line.
 110         my $lemma_start = $first_line_reading;
 111         my $lemma_end;
 112         my %seen;
 113         while( $lemma_start ne $too_far ) {
 114             # Loop detection
 115             if( $seen{ $lemma_start->name() } ) {
 116                 warn "Detected loop at " . $lemma_start->name() .
 117                     ", ref $line,$num";
 118                 last;
 119             }
 120             $seen{ $lemma_start->name() } = 1;
 121
 122             # Try to match the lemma.
 123             my $unmatch = 0;
 124             print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
 125                 $lemma_words[0] . "...\n"
 126                 if "$line.$num" eq $scrutinize;
 127             if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
 128                 # Skip it if we need a match that is not the first.
 129                 if( --$seq < 1 ) {
 130                     # Now we have to compare the rest of the words here.
 131                     if( scalar( @lemma_words ) > 1 ) {
 132                         my $next_reading =
 133                             $collation->next_reading( $lemma_start );
 134                         foreach my $w ( @lemma_words[1..$#lemma_words] ) {
 135                             printf STDERR "Now matching %s against %s\n",
 136                                     cmp_str($next_reading), $w
 137                                 if "$line.$num" eq $scrutinize;
 138                             if( $w ne cmp_str($next_reading) ) {
 139                                 $unmatch = 1;
 140                                 last;
 141                             } else {
 142                                 $lemma_end = $next_reading;
 143                                 $next_reading =
 144                                     $collation->next_reading( $lemma_end );
 145                             }
 146                         }
 147                     } else {
 148                         $lemma_end = $lemma_start;
 149                     }
 150                 } else {
 151                     $unmatch = 1;
 152                 }
 153             }
 154             last unless ( $unmatch || !defined( $lemma_end ) );
 155             $lemma_end = undef;
 156             $lemma_start = $collation->next_reading( $lemma_start );
 157         }
 158
 159         unless( $lemma_end ) {
 160             warn "No match found for @lemma_words at $line.$num";
 161             next;
 162         }
 163
 164         # Now we have found the lemma; we will record an 'edit', in
 165         # terms of a splice operation, for each subsequent reading.
 166         # We also note which witnesses take the given edit.
 167
 168         my @lemma_set = $collation->reading_sequence( $lemma_start,
 169                                                       $lemma_end );
 170         my @reading_sets = [ @lemma_set ];
 171
 172         # For each reading that is not rdg_0, we create the variant
 173         # reading nodes, and store the range as an edit operation on
 174         # the base text.
 175         my $variant_objects;
 176         my %pc_seen; # Keep track of mss with explicit post-corr data
 177         foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
 178             my @mss = grep { $app->{$_} eq $k } keys( %$app );
 179             push( @unwitnessed_lemma_nodes, @lemma_set )
 180                 if !@mss && $k eq 'rdg_0';
 181
 182             # Keep track of what witnesses we have seen.
 183             @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
 184             # Keep track of which witnesses bear corrected readings here.
 185             foreach my $m ( @mss ) {
 186                 my $base = _is_post_corr( $m );
 187                 next unless $base;
 188                 $pc_seen{$base} = 1;
 189             }
 190             next if $k eq 'rdg_0';
 191
 192             # TODO don't hardcode the reading split operation
 193             my @variant = split( /\s+/, $app->{$k} );
 194             @variant = () if $app->{$k} eq '/'; # This is an omission.
 195
 196             # Make the variant into a set of readings.
 197             my @variant_readings;
 198             my $ctr = 0;
 199             foreach my $vw ( @variant ) {
 200                 my $vwname = "$k/$line.$num.$ctr"; $ctr++;
 201                 my $vwreading = $collation->add_reading( $vwname );
 202                 $vwreading->text( $vw );
 203                 push( @variant_readings, $vwreading );
 204             }
 205
 206             $variant_objects->{$k} = { 'mss' => \@mss,
 207                                        'reading' => \@variant_readings,
 208             };
 209             push( @reading_sets, \@variant_readings );
 210         }
 211
 212         # Now collate and collapse the identical readings within the
 213         # collated sets.  Modifies the reading sets that were passed.
 214         collate_variants( $collation, @reading_sets );
 215
 216         # TODO Here would be a very good place to set up relationships
 217         # between the nodes and the lemma.
 218         set_relationships( $app, \@lemma_set, $variant_objects );
 219
 220         # Now create the splice-edit objects that will be used
 221         # to reconstruct each witness.
 222
 223         foreach my $rkey ( keys %$variant_objects ) {
 224             # Object is argument list for splice, so:
 225             # offset, length, replacements
 226             my $edit_object = [ $base_text_index{$lemma_start->name},
 227                                 scalar( @lemma_set ),
 228                                 $variant_objects->{$rkey}->{reading} ];
 229             foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
 230                 # Is this a p.c. entry?
 231                 my $base = _is_post_corr( $ms );
 232                 if( $base ) { # this is a post-corr witness
 233                     my $pc_key = $base . "_post";
 234                     _add_hash_entry( $edits_required, $pc_key, $edit_object );
 235                 } else { # this is an ante-corr witness
 236                     my $pc_key = $ms . "_post";
 237                     _add_hash_entry( $edits_required, $ms, $edit_object );
 238                     unless( $pc_seen{$ms} ) {
 239                         # If this witness carries no correction, add this
 240                         # same object to its post-corrected state.
 241                         _add_hash_entry( $edits_required, $pc_key,
 242                                          $edit_object );
 243                     }
 244                 }
 245             }
 246         }
 247     } # Finished going through the apparatus entries
 248
 249     # Now make the witness objects, and create their text sequences
 250     foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) {
 251         print STDERR "Creating witness $w\n";
 252         my $witness_obj = $collation->tradition->add_witness( sigil => $w );
 253         my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w} );
 254         my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"} )
 255             if exists( $edits_required->{$w."_post"} );
 256
 257         my @repeated = _check_for_repeated( @ante_corr_seq );
 258         warn "Repeated elements @repeated in $w a.c."
 259             if @repeated;
 260         @repeated = _check_for_repeated( @post_corr_seq );
 261         warn "Repeated elements @repeated in $w p.c."
 262             if @repeated;
 263
 264         # Now save these paths in my witness object
 265         if( @post_corr_seq ) {
 266             $witness_obj->path( \@post_corr_seq );
 267             $witness_obj->uncorrected_path( \@ante_corr_seq );
 268         } else {
 269             $witness_obj->path( \@ante_corr_seq );
 270         }
 271     }
 272
 273     # Now remove our 'base text' edges, which is to say, the only
 274     # ones we have created so far.  Also remove any nodes that didn't
 275     # appear in any witnesses.
 276     foreach ( $collation->paths() ) {
 277         $collation->del_path( $_ );
 278     }
 279     foreach( @unwitnessed_lemma_nodes ) {
 280         $collation->del_reading( $_ );
 281     }
 282
 283     # Now walk paths and calculate positions.
 284     my @common_readings =
 285         $collation->make_witness_paths();
 286     $collation->calculate_positions( @common_readings );
 287 }
 288
 289 sub _check_for_repeated {
 290     my @seq = @_;
 291     my %unique;
 292     my @repeated;
 293     foreach ( @seq ) {
 294         if( exists $unique{$_->name} ) {
 295             push( @repeated, $_->name );
 296         } else {
 297             $unique{$_->name} = 1;
 298         }
 299     }
 300     return @repeated;
 301 }
 302
 303 =item B<read_base>
 304
 305 my @line_beginnings = read_base( 'reference.txt', $collation );
 306
 307 Takes a text file and a (presumed empty) collation object, adds the
 308 words as simple linear readings to the collation, and returns a
 309 list of readings that represent the beginning of lines. This collation
 310 is now the starting point for application of apparatus entries in
 311 merge_base, e.g. from a CSV file or a Classical Text Editor file.
 312
 313 =cut
 314
 315 sub read_base {
 316     my( $base_file, $collation ) = @_;
 317
 318     # This array gives the first reading for each line.  We put the
 319     # common starting point in line zero.
 320     my $last_reading = $collation->start();
 321     $base_text_index{$last_reading->name} = 0;
 322     my $lineref_array = [ $last_reading ]; # There is no line zero.
 323
 324     open( BASE, $base_file ) or die "Could not open file $base_file: $!";
 325     my $i = 1;
 326     while(<BASE>) {
 327         # Make the readings, and connect them up for the base, but
 328         # also save the first reading of each line in an array for the
 329         # purpose.
 330         # TODO use configurable reading separator
 331         chomp;
 332         my @words = split;
 333         my $started = 0;
 334         my $wordref = 0;
 335         my $lineref = scalar @$lineref_array;
 336         last if $SHORTEND && $lineref > $SHORTEND;
 337         foreach my $w ( @words ) {
 338             my $readingref = join( ',', $lineref, ++$wordref );
 339             my $reading = $collation->add_reading( $readingref );
 340             $reading->text( $w );
 341             unless( $started ) {
 342                 push( @$lineref_array, $reading );
 343                 $started = 1;
 344             }
 345             # Add edge paths in the graph, for easier tracking when
 346             # we start applying corrections.  These paths will be
 347             # removed when we're done.
 348             my $path = $collation->add_path( $last_reading, $reading,
 349                                              $collation->baselabel );
 350             $last_reading = $reading;
 351
 352             # Note an array index for the reading, for later correction splices.
 353             $base_text_index{$readingref} = $i++;
 354         }
 355     }
 356     close BASE;
 357     # Ending point for all texts
 358     my $endpoint = $collation->add_reading( '#END#' );
 359     $collation->add_path( $last_reading, $endpoint, $collation->baselabel );
 360     push( @$lineref_array, $endpoint );
 361     $base_text_index{$endpoint->name} = $i;
 362
 363     return( @$lineref_array );
 364 }
 365
 366 =item B<collate_variants>
 367
 368 collate_variants( $collation, @reading_ranges )
 369
 370 Given a set of readings in the form
 371 ( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
 372 walks through each to identify those readings that are identical.  The
 373 collation is a Text::Tradition::Collation object; the elements of
 374 @readings are Text::Tradition::Collation::Reading objects that appear
 375 on the collation graph.
 376
 377 TODO: Handle collapsed and non-collapsed transpositions.
 378
 379 =cut
 380
 381 sub collate_variants {
 382     my( $collation, @reading_sets ) = @_;
 383
 384     # Merge the nodes across the sets so that there is only one node
 385     # for any given reading.  Use diff to identify the 'same' nodes.
 386
 387     my $lemma_set = shift @reading_sets;
 388
 389     my @unique;
 390     push( @unique, @$lemma_set );
 391
 392     while( @reading_sets ) {
 393         my $variant_set = shift @reading_sets;
 394         if( $collation->linear ) {
 395             # Use diff to do this job
 396             my $diff = Algorithm::Diff->new( \@unique, $variant_set,
 397                                              {'keyGen' => \&_collation_hash} );
 398             my @new_unique;
 399             my %merged;
 400             while( $diff->Next ) {
 401                 if( $diff->Same ) {
 402                     # merge the nodes
 403                     my @l = $diff->Items( 1 );
 404                     my @v = $diff->Items( 2 );
 405                     foreach my $i ( 0 .. $#l ) {
 406                         if( !$merged{$l[$i]->name} ) {
 407                             $collation->merge_readings( $l[$i], $v[$i] );
 408                             $merged{$l[$i]->name} = 1;
 409                         } else {
 410                             print STDERR "Would have double merged " . $l[$i]->name . "\n";
 411                         }
 412                     }
 413                     # splice the lemma nodes into the variant set
 414                     my( $offset ) = $diff->Get( 'min2' );
 415                     splice( @$variant_set, $offset, scalar( @l ), @l );
 416                     push( @new_unique, @l );
 417                 } else {
 418                     # Keep the old unique readings
 419                     push( @new_unique, $diff->Items( 1 ) ) if $diff->Items( 1 );
 420                     # Add the new readings to the 'unique' list
 421                     push( @new_unique, $diff->Items( 2 ) ) if $diff->Items( 2 );
 422                 }
 423             }
 424             @unique = @new_unique;
 425         } else {
 426             # It becomes a much simpler job
 427             $DB::single = 1;
 428             my @distinct;
 429             my %merged;
 430             foreach my $idx ( 0 .. $#{$variant_set} ) {
 431                 my $vw = $variant_set->[$idx];
 432                 my @same = grep { cmp_str( $_ ) eq $vw->label } @unique;
 433                 my $matched;
 434                 if( @same ) {
 435                     foreach my $i ( 0 .. $#same ) {
 436                         unless( $merged{$same[$i]->name} ) {
 437                             print STDERR sprintf( "Merging %s into %s\n",
 438                                                   $vw->name,
 439                                                   $same[$i]->name );
 440                             $collation->merge_readings( $same[$i], $vw );
 441                             $merged{$same[$i]->name} = 1;
 442                             $matched = $i;
 443                             $variant_set->[$idx] = $same[$i];
 444                         }
 445                     }
 446                 }
 447                 unless( @same && defined($matched) ) {
 448                     push( @distinct, $vw );
 449                 }
 450             }
 451             push( @unique, @distinct );
 452         }
 453     }
 454
 455     return;
 456 }
 457
 458
 459 sub _collation_hash {
 460     my $node = shift;
 461     return cmp_str( $node );
 462 }
 463
 464 sub set_relationships {
 465     my( $app, $lemma, $variants ) = @_;
 466     foreach my $rkey ( keys %$variants ) {
 467         my $var = $variants->{$rkey}->{'reading'};
 468         my $typekey = sprintf( "_%s_type", $rkey );
 469         my $type = $app->{$typekey};
 470
 471         # Transposition: look for nodes with the same label but different IDs
 472         # and mark them as transposed-identical.
 473
 474         # Lexical / Grammatical / Spelling: look for non-identical nodes.
 475         # Need to work out how to handle many-to-many mapping.
 476     }
 477 }
 478
 479
 480
 481 sub apply_edits {
 482     my( $collation, $edit_sequence ) = @_;
 483     my @lemma_names = sort { $base_text_index{$a} <=> $base_text_index{$b} }
 484         keys %base_text_index;
 485     my @lemma_text = map { $collation->reading( $_ ) } @lemma_names;
 486
 487     my $drift = 0;
 488     foreach my $correction ( @$edit_sequence ) {
 489         my( $offset, $length, $items ) = @$correction;
 490         my $realoffset = $offset + $drift;
 491         splice( @lemma_text, $realoffset, $length, @$items );
 492         $drift += @$items - $length;
 493     }
 494     return @lemma_text;
 495 }
 496
 497
 498 # Helper function. Given a witness sigil, if it is a post-correctione
 499 # sigil,return the base witness.  If not, return a false value.
 500 sub _is_post_corr {
 501     my( $sigil ) = @_;
 502     if( $sigil =~ /^(.*?)(\s*\(?p\.\s*c\.\)?)$/ ) {
 503         return $1;
 504     }
 505     return undef;
 506 }
 507
 508 sub _add_hash_entry {
 509     my( $hash, $key, $entry ) = @_;
 510     if( exists $hash->{$key} ) {
 511         push( @{$hash->{$key}}, $entry );
 512     } else {
 513         $hash->{$key} = [ $entry ];
 514     }
 515 }
 516
 517
 518 =item B<cmp_str>
 519
 520 Pretend you never saw this method.  Really it needs to not be hardcoded.
 521
 522 =cut
 523
 524 sub cmp_str {
 525     my( $reading ) = @_;
 526     my $word = $reading->label();
 527     $word = lc( $word );
 528     $word =~ s/\W//g;
 529     $word =~ s/v/u/g;
 530     $word =~ s/j/i/g;
 531     $word =~ s/cha/ca/g;
 532     $word =~ s/quatuor/quattuor/g;
 533     $word =~ s/ioannes/iohannes/g;
 534     return $word;
 535 }
 536
 537 =back
 538
 539 =head1 LICENSE
 540
 541 This package is free software and is provided "as is" without express
 542 or implied warranty.  You can redistribute it and/or modify it under
 543 the same terms as Perl itself.
 544
 545 =head1 AUTHOR
 546
 547 Tara L Andrews, aurum@cpan.org
 548
 549 =cut
 550
 551 1;