lib/Text/Tradition/Parser/BaseText.pm

   1 package Text::Tradition::Parser::BaseText;
   2
   3 use strict;
   4 use warnings;
   5 use Module::Load;
   6 use Text::Tradition::Parser::Util qw( collate_variants cmp_str check_for_repeated add_hash_entry );
   7
   8 =head1 NAME
   9
  10 Text::Tradition::Parser::BaseText
  11
  12 =head1 SYNOPSIS
  13
  14 use Text::Tradition::Parser::BaseText qw( merge_base );
  15 merge_base( $graph, 'reference.txt', @apparatus_entries )
  16
  17 =head1 DESCRIPTION
  18
  19 For an overview of the package, see the documentation for the
  20 Text::Tradition module.
  21
  22 This module is meant for use with certain of the other Parser classes
  23 - whenever a list of variants is given with reference to a base text,
  24 these must be joined into a single collation.  The parser should
  25 therefore make a list of variants and their locations, and BaseText
  26 will join those listed variants onto the reference text.
  27
  28 =head1 SUBROUTINES
  29
  30 =over
  31
  32 =item B<parse>
  33
  34 parse( $graph, $opts );
  35
  36 Takes an initialized graph and a hashref of options, which must include:
  37 - 'base' - the base text referenced by the variants
  38 - 'format' - the format of the variant list
  39 - 'data' - the variants, in the given format.
  40
  41 =cut
  42
  43 sub parse {
  44     my( $tradition, $opts ) = @_;
  45
  46     my $format_mod = 'Text::Tradition::Parser::' . $opts->{'input'};
  47     load( $format_mod );
  48     # TODO Handle a string someday if we ever have a format other than KUL
  49     my @apparatus_entries = $format_mod->can('read')->( $opts );
  50     merge_base( $tradition->collation, $opts->{'base'}, @apparatus_entries );
  51 }
  52
  53 =item B<merge_base>
  54
  55 merge_base( $graph, 'reference.txt', @apparatus_entries )
  56
  57 Takes three arguments: a newly-initialized Text::Tradition::Graph
  58 object, a text file containing the reference text, and a list of
  59 variants (apparatus entries).  Adds the base text to the graph, and
  60 joins the variants to that.
  61
  62 The list of variants is an array of hash references; each hash takes
  63 the form
  64  { '_id' => line reference,
  65    'rdg_0' => lemma reading,
  66    'rdg_1' => first variant,
  67    ...  # and so on until all distinct readings are listed
  68    'WitnessA' => 'rdg_0',
  69    'WitnessB' => 'rdg_1',
  70    ...  # and so on until all witnesses are listed with their readings
  71  }
  72
  73 Any hash key that is not of the form /^rdg_\d+$/ and that does not
  74 begin with an underscore is assumed to be a witness name.  Any 'meta'
  75 information to be passed must be passed in a key with a leading
  76 underscore in its name.
  77
  78 =cut
  79
  80 my $SHORTEND = ''; # Debug var - set this to limit the number of lines parsed
  81
  82 my %base_text_index;
  83 my $edits_required = {};
  84
  85 # edits_required -> wit -> [ { start_idx, end_idx, items } ]
  86
  87 sub merge_base {
  88     my( $collation, $base_file, @app_entries ) = @_;
  89     my @base_line_starts = read_base( $base_file, $collation );
  90
  91     my %all_witnesses;
  92     foreach my $app ( @app_entries ) {
  93         my( $line, $num ) = split( /\./, $app->{_id} );
  94         # DEBUG with a short graph
  95         last if $SHORTEND && $line > $SHORTEND;
  96         # DEBUG for problematic entries
  97         my $scrutinize = '';
  98         my $first_line_reading = $base_line_starts[ $line ];
  99         my $too_far = $base_line_starts[ $line+1 ];
 100
 101         my $lemma = $app->{rdg_0};
 102         my $seq = 1;
 103         # Is this the Nth occurrence of this reading in the line?
 104         if( $lemma =~ s/(_)?(\d)$// ) {
 105             $seq = $2;
 106         }
 107         my @lemma_words = split( /\s+/, $lemma );
 108
 109         # Now search for the lemma words within this line.
 110         my $lemma_start = $first_line_reading;
 111         my $lemma_end;
 112         my %seen;
 113         while( $lemma_start ne $too_far ) {
 114             # Loop detection
 115             if( $seen{ $lemma_start->id() } ) {
 116                 warn "Detected loop at " . $lemma_start->id() .
 117                     ", ref $line,$num";
 118                 last;
 119             }
 120             $seen{ $lemma_start->id() } = 1;
 121
 122             # Try to match the lemma.
 123             my $unmatch = 0;
 124             print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
 125                 $lemma_words[0] . "...\n"
 126                 if "$line.$num" eq $scrutinize;
 127             if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
 128                 # Skip it if we need a match that is not the first.
 129                 if( --$seq < 1 ) {
 130                     # Now we have to compare the rest of the words here.
 131                     if( scalar( @lemma_words ) > 1 ) {
 132                         my $next_reading =
 133                             $collation->next_reading( $lemma_start );
 134                         foreach my $w ( @lemma_words[1..$#lemma_words] ) {
 135                             printf STDERR "Now matching %s against %s\n",
 136                                     cmp_str($next_reading), $w
 137                                 if "$line.$num" eq $scrutinize;
 138                             if( $w ne cmp_str($next_reading) ) {
 139                                 $unmatch = 1;
 140                                 last;
 141                             } else {
 142                                 $lemma_end = $next_reading;
 143                                 $next_reading =
 144                                     $collation->next_reading( $lemma_end );
 145                             }
 146                         }
 147                     } else {
 148                         $lemma_end = $lemma_start;
 149                     }
 150                 } else {
 151                     $unmatch = 1;
 152                 }
 153             }
 154             last unless ( $unmatch || !defined( $lemma_end ) );
 155             $lemma_end = undef;
 156             $lemma_start = $collation->next_reading( $lemma_start );
 157         }
 158
 159         unless( $lemma_end ) {
 160             warn "No match found for @lemma_words at $line.$num";
 161             next;
 162         }
 163
 164         # Now we have found the lemma; we will record an 'edit', in
 165         # terms of a splice operation, for each subsequent reading.
 166         # We also note which witnesses take the given edit.
 167
 168         my @lemma_set = $collation->reading_sequence( $lemma_start,
 169                                                       $lemma_end );
 170         my @reading_sets = [ @lemma_set ];
 171
 172         # For each reading that is not rdg_0, we create the variant
 173         # reading nodes, and store the range as an edit operation on
 174         # the base text.
 175         my $variant_objects;
 176         my %pc_seen; # Keep track of mss with explicit post-corr data
 177         foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
 178             my @mss = grep { $app->{$_} eq $k } keys( %$app );
 179
 180             # Keep track of what witnesses we have seen.
 181             @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
 182             # Keep track of which witnesses bear corrected readings here.
 183             foreach my $m ( @mss ) {
 184                 my $base = _is_post_corr( $m );
 185                 next unless $base;
 186                 $pc_seen{$base} = 1;
 187             }
 188             next if $k eq 'rdg_0';
 189
 190             # Parse the variant into reading tokens.
 191             # TODO don't hardcode the reading split operation
 192             my @variant = split( /\s+/, $app->{$k} );
 193             @variant = () if $app->{$k} eq '/'; # This is an omission.
 194
 195             my @variant_readings;
 196             my $ctr = 0;
 197             foreach my $vw ( @variant ) {
 198                 my $vwname = "$k/$line.$num.$ctr"; $ctr++;
 199                 my $vwreading = $collation->add_reading( {
 200                         'id' => $vwname,
 201                         'text' => $vw } );
 202                 push( @variant_readings, $vwreading );
 203             }
 204
 205             $variant_objects->{$k} = { 'mss' => \@mss,
 206                                        'reading' => \@variant_readings,
 207             };
 208             push( @reading_sets, \@variant_readings );
 209         }
 210
 211         # Now collate and collapse the identical readings within the
 212         # collated sets.  Modifies the reading sets that were passed.
 213         collate_variants( $collation, @reading_sets );
 214
 215         # Record any stated relationships between the nodes and the lemma.
 216         set_relationships( $collation, $app, \@lemma_set, $variant_objects );
 217
 218         # Now create the splice-edit objects that will be used
 219         # to reconstruct each witness.
 220
 221         foreach my $rkey ( keys %$variant_objects ) {
 222             # Object is argument list for splice, so:
 223             # offset, length, replacements
 224             my $edit_object = [ $lemma_start->id,
 225                                 scalar( @lemma_set ),
 226                                 $variant_objects->{$rkey}->{reading} ];
 227             foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
 228                 # Is this a p.c. entry?
 229                 my $base = _is_post_corr( $ms );
 230                 if( $base ) { # this is a post-corr witness
 231                     my $pc_key = $base . "_post";
 232                     add_hash_entry( $edits_required, $pc_key, $edit_object );
 233                 } else { # this is an ante-corr witness
 234                     my $pc_key = $ms . "_post";
 235                     add_hash_entry( $edits_required, $ms, $edit_object );
 236                     unless( $pc_seen{$ms} ) {
 237                         # If this witness carries no correction, add this
 238                         # same object to its post-corrected state.
 239                         add_hash_entry( $edits_required, $pc_key,
 240                                          $edit_object );
 241                     }
 242                 }
 243             }
 244         }
 245     } # Finished going through the apparatus entries
 246
 247     # Now make the witness objects, and create their text sequences
 248     foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) {
 249         print STDERR "Creating witness $w\n";
 250         my $witness_obj = $collation->tradition->add_witness( sigil => $w );
 251         my $debug; #  = $w eq 'Vb11';
 252         my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w}, $debug );
 253         my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"}, $debug )
 254             if exists( $edits_required->{$w."_post"} );
 255
 256         my @repeated = check_for_repeated( @ante_corr_seq );
 257         warn "Repeated elements @repeated in $w a.c."
 258             if @repeated;
 259         @repeated = check_for_repeated( @post_corr_seq );
 260         warn "Repeated elements @repeated in $w p.c."
 261             if @repeated;
 262
 263         # Now save these paths in my witness object
 264         if( @post_corr_seq ) {
 265             $witness_obj->path( \@post_corr_seq );
 266             $witness_obj->uncorrected_path( \@ante_corr_seq );
 267         } else {
 268             $witness_obj->path( \@ante_corr_seq );
 269         }
 270     }
 271
 272     # Now remove our 'base text' edges, which is to say, the only
 273     # ones we have created so far.  Also remove any unwitnessed
 274     # lemma nodes (TODO unless we are treating base as witness)
 275     foreach ( $collation->paths() ) {
 276         $collation->del_path( $_, $collation->baselabel );
 277     }
 278
 279     ### HACKY HACKY Do some one-off path corrections here.
 280     require( 'data/boodts/s158.HACK' );
 281     KUL::HACK::pre_path_hack( $collation );
 282
 283     # Now walk paths and calculate positional rank.
 284     $collation->make_witness_paths();
 285     # Now delete any orphaned readings.
 286         foreach my $r ( $collation->sequence->isolated_vertices ) {
 287                 print STDERR "Deleting unconnected reading $r / " .
 288                         $collation->reading( $r )->text . "\n";
 289                 $collation->del_reading( $r );
 290         }
 291
 292     KUL::HACK::post_path_hack( $collation );
 293     # Have to check relationship validity at this point, because before that
 294     # we had no paths.
 295 #     foreach my $rel ( $collation->relationships ) {
 296 #         next unless $rel->equal_rank;
 297 #         unless( Text::Tradition::Collation::relationship_valid( $rel->from, $rel->to ) ) {
 298 #             warn sprintf( "Relationship type %s between %s and %s is invalid, deleting",
 299 #                             $rel->type, $rel->from->id, $rel->to->id );
 300 #         }
 301 #     }
 302     $collation->calculate_ranks();
 303 }
 304
 305 =item B<read_base>
 306
 307 my @line_beginnings = read_base( 'reference.txt', $collation );
 308
 309 Takes a text file and a (presumed empty) collation object, adds the
 310 words as simple linear readings to the collation, and returns a
 311 list of readings that represent the beginning of lines. This collation
 312 is now the starting point for application of apparatus entries in
 313 merge_base, e.g. from a CSV file or a Classical Text Editor file.
 314
 315 =cut
 316
 317 sub read_base {
 318     my( $base_file, $collation ) = @_;
 319
 320     # This array gives the first reading for each line.  We put the
 321     # common starting point in line zero.
 322     my $last_reading = $collation->start;
 323     $base_text_index{$last_reading->id} = 0;
 324     my $lineref_array = [ $last_reading ]; # There is no line zero.
 325
 326     open( BASE, $base_file ) or die "Could not open file $base_file: $!";
 327     my $i = 1;
 328     while(<BASE>) {
 329         # Make the readings, and connect them up for the base, but
 330         # also save the first reading of each line in an array for the
 331         # purpose.
 332         # TODO use configurable reading separator
 333         chomp;
 334         my @words = split;
 335         my $started = 0;
 336         my $wordref = 0;
 337         my $lineref = scalar @$lineref_array;
 338         last if $SHORTEND && $lineref > $SHORTEND;
 339         foreach my $w ( @words ) {
 340             my $readingref = join( ',', $lineref, ++$wordref );
 341             my $reading = $collation->add_reading( { id => $readingref, text => $w } );
 342             unless( $started ) {
 343                 push( @$lineref_array, $reading );
 344                 $started = 1;
 345             }
 346             # Add edge paths in the graph, for easier tracking when
 347             # we start applying corrections.  These paths will be
 348             # removed when we're done.
 349             my $path = $collation->add_path( $last_reading, $reading,
 350                                              $collation->baselabel );
 351             $last_reading = $reading;
 352
 353             # Note an array index for the reading, for later correction splices.
 354             $base_text_index{$readingref} = $i++;
 355         }
 356     }
 357     close BASE;
 358     # Ending point for all texts
 359     $collation->add_path( $last_reading, $collation->end, $collation->baselabel );
 360     push( @$lineref_array, $collation->end );
 361     $base_text_index{$collation->end->id} = $i;
 362
 363     return( @$lineref_array );
 364 }
 365
 366 sub set_relationships {
 367     my( $collation, $app, $lemma, $variants ) = @_;
 368     foreach my $rkey ( keys %$variants ) {
 369         my $var = $variants->{$rkey}->{'reading'};
 370         my $type = $app->{sprintf( "_%s_type", $rkey )};
 371         my $noncorr = $app->{sprintf( "_%s_non_corr", $rkey )};
 372         my $nonindep = $app->{sprintf( "_%s_non_indep", $rkey )};
 373
 374         my %rel_options = ();
 375         $rel_options{'non_correctable'} = $noncorr if $noncorr && $noncorr =~ /^\d$/;
 376         $rel_options{'non_indep'} = $nonindep if $nonindep && $nonindep =~ /^\d$/;
 377
 378         if( $type =~ /^(inv|tr|rep)$/i ) {
 379             # Transposition or repetition: look for nodes with the
 380             # same label but different IDs and mark them.
 381             $type = 'repetition' if $type =~ /^rep/i;
 382             $rel_options{'type'} = $type;
 383             $rel_options{'equal_rank'} = undef;
 384             my %labels;
 385             foreach my $r ( @$lemma ) {
 386                 $labels{cmp_str( $r )} = $r;
 387             }
 388             foreach my $r( @$var ) {
 389                 if( exists $labels{$r->text} &&
 390                     $r->id ne $labels{$r->text}->id ) {
 391                     if( $type eq 'repetition' ) {
 392                         # Repetition
 393                         $collation->add_relationship( $r, $labels{$r->text}, \%rel_options );
 394                     } else {
 395                         # Transposition
 396                         $r->set_identical( $labels{$r->text} );
 397                     }
 398                 }
 399             }
 400         } elsif( $type =~ /^(gr|sp(el)?)$/i ) {
 401
 402             # Grammar/spelling/lexical: this can be a one-to-one or
 403             # one-to-many mapping.  We should think about merging
 404             # readings if it is one-to-many.
 405
 406             $type = 'grammatical' if $type =~ /gr/i;
 407             $type = 'spelling' if $type =~ /sp/i;
 408             $type = 'repetition' if $type =~ /rep/i;
 409             # $type = 'lexical' if $type =~ /lex/i;
 410             $rel_options{'type'} = $type;
 411             $rel_options{'equal_rank'} = 1;
 412             if( @$lemma == @$var ) {
 413                 foreach my $i ( 0 .. $#{$lemma} ) {
 414                     $collation->add_relationship( $var->[$i], $lemma->[$i],
 415                         \%rel_options );
 416                 }
 417             } else {
 418                 # An uneven many-to-many mapping.  Skip for now.
 419                 # We really want to make a segment out of whatever we have.
 420                 # my $lemseg = @$lemma > 1 ? $collation->add_segment( @$lemma ) : $lemma->[0];
 421                 # my $varseg = @$var > 1 ? $collation->add_segment( @$var ) : $var->[0];
 422                 # $collation->add_relationship( $varseg, $lemseg, \%rel_options );
 423                 if( @$lemma == 1 && @$var == 1 ) {
 424                     $collation->add_relationship( $lemma->[0], $var->[0], \%rel_options );
 425                 }
 426             }
 427         } elsif( $type !~ /^(add|om|lex)$/i ) {
 428             warn "Unrecognized type $type";
 429         }
 430     }
 431 }
 432
 433
 434
 435 sub apply_edits {
 436     my( $collation, $edit_sequence, $debug ) = @_;
 437     my @lemma_text = $collation->reading_sequence(
 438         $collation->start, $collation->end );
 439     my $drift = 0;
 440     foreach my $correction ( @$edit_sequence ) {
 441         my( $lemma_start, $length, $items ) = @$correction;
 442         my $offset = $base_text_index{$lemma_start};
 443         my $realoffset = $offset + $drift;
 444         if( $debug ||
 445             $lemma_text[$realoffset]->id ne $lemma_start ) {
 446             my @this_phrase = @lemma_text[$realoffset..$realoffset+$length-1];
 447             my @base_phrase;
 448             my $i = $realoffset;
 449             my $l = $collation->reading( $lemma_start );
 450             while( $i < $realoffset+$length ) {
 451                 push( @base_phrase, $l );
 452                 $l = $collation->next_reading( $l );
 453                 $i++;
 454             }
 455
 456             print STDERR sprintf( "Trying to replace %s (%s) starting at %d " .
 457                                   "with %s (%s) with drift %d\n",
 458                                   join( ' ', map {$_->text} @base_phrase ),
 459                                   join( ' ', map {$_->id} @base_phrase ),
 460                                   $realoffset,
 461                                   join( ' ', map {$_->text} @$items ),
 462                                   join( ' ', map {$_->id} @$items ),
 463                                   $drift,
 464                                   ) if $debug;
 465
 466             if( $lemma_text[$realoffset]->id ne $lemma_start ) {
 467                 warn( sprintf( "Should be replacing %s (%s) with %s (%s) " .
 468                                "but %s (%s) is there instead",
 469                                join( ' ', map {$_->text} @base_phrase ),
 470                                join( ' ', map {$_->id} @base_phrase ),
 471                                join( ' ', map {$_->text} @$items ),
 472                                join( ' ', map {$_->id} @$items ),
 473                                join( ' ', map {$_->text} @this_phrase ),
 474                                join( ' ', map {$_->id} @this_phrase ),
 475                       ) );
 476                 # next;
 477             }
 478         }
 479         splice( @lemma_text, $realoffset, $length, @$items );
 480         $drift += @$items - $length;
 481     }
 482     return @lemma_text;
 483 }
 484
 485
 486 # Helper function. Given a witness sigil, if it is a post-correctione
 487 # sigil,return the base witness.  If not, return a false value.
 488 sub _is_post_corr {
 489     my( $sigil ) = @_;
 490     if( $sigil =~ /^(.*?)(\s*\(?p\.\s*c\.\)?)$/ ) {
 491         return $1;
 492     }
 493     return undef;
 494 }
 495
 496
 497 =back
 498
 499 =head1 LICENSE
 500
 501 This package is free software and is provided "as is" without express
 502 or implied warranty.  You can redistribute it and/or modify it under
 503 the same terms as Perl itself.
 504
 505 =head1 AUTHOR
 506
 507 Tara L Andrews, aurum@cpan.org
 508
 509 =cut
 510
 511 1;