lib/Text/Tradition/Parser/BaseText.pm

   1 package Text::Tradition::Parser::BaseText;
   2
   3 use strict;
   4 use warnings;
   5 use Module::Load;
   6 use Text::Tradition::Parser::Util qw( collate_variants cmp_str check_for_repeated add_hash_entry );
   7
   8 =head1 NAME
   9
  10 Text::Tradition::Parser::BaseText
  11
  12 =head1 SYNOPSIS
  13
  14 use Text::Tradition::Parser::BaseText qw( merge_base );
  15 merge_base( $graph, 'reference.txt', @apparatus_entries )
  16
  17 =head1 DESCRIPTION
  18
  19 For an overview of the package, see the documentation for the
  20 Text::Tradition module.
  21
  22 This module is meant for use with certain of the other Parser classes
  23 - whenever a list of variants is given with reference to a base text,
  24 these must be joined into a single collation.  The parser should
  25 therefore make a list of variants and their locations, and BaseText
  26 will join those listed variants onto the reference text.
  27
  28 =head1 SUBROUTINES
  29
  30 =over
  31
  32 =item B<parse>
  33
  34 parse( $graph, $opts );
  35
  36 Takes an initialized graph and a hashref of options, which must include:
  37 - 'base' - the base text referenced by the variants
  38 - 'format' - the format of the variant list
  39 - 'data' - the variants, in the given format.
  40
  41 =cut
  42
  43 sub parse {
  44     my( $tradition, $opts ) = @_;
  45
  46     my $format_mod = 'Text::Tradition::Parser::' . $opts->{'input'};
  47     load( $format_mod );
  48     # TODO Handle a string someday if we ever have a format other than KUL
  49     my @apparatus_entries = $format_mod->can('read')->( $opts );
  50     merge_base( $tradition->collation, $opts, @apparatus_entries );
  51 }
  52
  53 =item B<merge_base>
  54
  55 merge_base( $graph, 'reference.txt', @apparatus_entries )
  56
  57 Takes three arguments: a newly-initialized Text::Tradition::Graph
  58 object, a text file containing the reference text, and a list of
  59 variants (apparatus entries).  Adds the base text to the graph, and
  60 joins the variants to that.
  61
  62 The list of variants is an array of hash references; each hash takes
  63 the form
  64  { '_id' => line reference,
  65    'rdg_0' => lemma reading,
  66    'rdg_1' => first variant,
  67    ...  # and so on until all distinct readings are listed
  68    'WitnessA' => 'rdg_0',
  69    'WitnessB' => 'rdg_1',
  70    ...  # and so on until all witnesses are listed with their readings
  71  }
  72
  73 Any hash key that is not of the form /^rdg_\d+$/ and that does not
  74 begin with an underscore is assumed to be a witness name.  Any 'meta'
  75 information to be passed must be passed in a key with a leading
  76 underscore in its name.
  77
  78 =cut
  79
  80 my $SHORTEND = ''; # Debug var - set this to limit the number of lines parsed
  81
  82 my %base_text_index;
  83 my $edits_required = {};
  84
  85 # edits_required -> wit -> [ { start_idx, end_idx, items } ]
  86
  87 sub merge_base {
  88     my( $collation, $opts, @app_entries ) = @_;
  89     my @base_line_starts = read_base( $opts->{'base'}, $collation );
  90
  91     my %all_witnesses;
  92     foreach my $app ( @app_entries ) {
  93         my( $line, $num ) = split( /\./, $app->{_id} );
  94         # DEBUG with a short graph
  95         last if $SHORTEND && $line > $SHORTEND;
  96         # DEBUG for problematic entries
  97         my $scrutinize = '';
  98         my $first_line_reading = $base_line_starts[ $line ];
  99         my $too_far = $base_line_starts[ $line+1 ];
 100
 101         my $lemma = $app->{rdg_0};
 102         my $seq = 1;
 103         # Is this the Nth occurrence of this reading in the line?
 104         if( $lemma =~ s/(_)?(\d)$// ) {
 105             $seq = $2;
 106         }
 107         my @lemma_words = split( /\s+/, $lemma );
 108
 109         # Now search for the lemma words within this line.
 110         my $lemma_start = $first_line_reading;
 111         my $lemma_end;
 112         my %seen;
 113         while( $lemma_start ne $too_far ) {
 114             # Loop detection
 115             if( $seen{ $lemma_start->id() } ) {
 116                 warn "Detected loop at " . $lemma_start->id() .
 117                     ", ref $line,$num";
 118                 last;
 119             }
 120             $seen{ $lemma_start->id() } = 1;
 121
 122             # Try to match the lemma.
 123             my $unmatch = 0;
 124             print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
 125                 $lemma_words[0] . "...\n"
 126                 if "$line.$num" eq $scrutinize;
 127             if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
 128                 # Skip it if we need a match that is not the first.
 129                 if( --$seq < 1 ) {
 130                     # Now we have to compare the rest of the words here.
 131                     if( scalar( @lemma_words ) > 1 ) {
 132                         my $next_reading =
 133                             $collation->next_reading( $lemma_start );
 134                         foreach my $w ( @lemma_words[1..$#lemma_words] ) {
 135                             printf STDERR "Now matching %s against %s\n",
 136                                     cmp_str($next_reading), $w
 137                                 if "$line.$num" eq $scrutinize;
 138                             if( $w ne cmp_str($next_reading) ) {
 139                                 $unmatch = 1;
 140                                 last;
 141                             } else {
 142                                 $lemma_end = $next_reading;
 143                                 $next_reading =
 144                                     $collation->next_reading( $lemma_end );
 145                             }
 146                         }
 147                     } else {
 148                         $lemma_end = $lemma_start;
 149                     }
 150                 } else {
 151                     $unmatch = 1;
 152                 }
 153             }
 154             last unless ( $unmatch || !defined( $lemma_end ) );
 155             $lemma_end = undef;
 156             $lemma_start = $collation->next_reading( $lemma_start );
 157         }
 158
 159         unless( $lemma_end ) {
 160             warn "No match found for @lemma_words at $line.$num";
 161             next;
 162         }
 163
 164         # Now we have found the lemma; we will record an 'edit', in
 165         # terms of a splice operation, for each subsequent reading.
 166         # We also note which witnesses take the given edit.
 167
 168         my @lemma_set = $collation->reading_sequence( $lemma_start,
 169                                                       $lemma_end );
 170         my @reading_sets = [ @lemma_set ];
 171
 172         # For each reading that is not rdg_0, we create the variant
 173         # reading nodes, and store the range as an edit operation on
 174         # the base text.
 175         my $variant_objects;
 176         my %pc_seen; # Keep track of mss with explicit post-corr data
 177         foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
 178             my @mss = grep { $app->{$_} eq $k } keys( %$app );
 179
 180             # Keep track of what witnesses we have seen.
 181             @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
 182             # Keep track of which witnesses bear corrected readings here.
 183             foreach my $m ( @mss ) {
 184                 my $base = _is_post_corr( $m );
 185                 next unless $base;
 186                 $pc_seen{$base} = 1;
 187             }
 188             next if $k eq 'rdg_0';
 189
 190             # Parse the variant into reading tokens.
 191             # TODO don't hardcode the reading split operation
 192             my @variant = split( /\s+/, $app->{$k} );
 193             @variant = () if $app->{$k} eq '/'; # This is an omission.
 194
 195             my @variant_readings;
 196             my $ctr = 0;
 197             foreach my $vw ( @variant ) {
 198                 my $vwname = "$k/$line.$num.$ctr"; $ctr++;
 199                 my $vwreading = $collation->add_reading( {
 200                         'id' => $vwname,
 201                         'text' => $vw } );
 202                 push( @variant_readings, $vwreading );
 203             }
 204
 205             $variant_objects->{$k} = { 'mss' => \@mss,
 206                                        'reading' => \@variant_readings,
 207             };
 208             push( @reading_sets, \@variant_readings );
 209         }
 210
 211         # Now collate and collapse the identical readings within the
 212         # collated sets.  Modifies the reading sets that were passed.
 213         collate_variants( $collation, @reading_sets );
 214
 215         # Record any stated relationships between the nodes and the lemma.
 216         set_relationships( $collation, $app, \@lemma_set, $variant_objects );
 217
 218         # Now create the splice-edit objects that will be used
 219         # to reconstruct each witness.
 220
 221         foreach my $rkey ( keys %$variant_objects ) {
 222             # Object is argument list for splice, so:
 223             # offset, length, replacements
 224             my $edit_object = [ $lemma_start->id,
 225                                 scalar( @lemma_set ),
 226                                 $variant_objects->{$rkey}->{reading} ];
 227             foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
 228                 # Is this a p.c. entry?
 229                 my $base = _is_post_corr( $ms );
 230                 if( $base ) { # this is a post-corr witness
 231                     my $pc_key = $base . "_post";
 232                     add_hash_entry( $edits_required, $pc_key, $edit_object );
 233                 } else { # this is an ante-corr witness
 234                     my $pc_key = $ms . "_post";
 235                     add_hash_entry( $edits_required, $ms, $edit_object );
 236                     unless( $pc_seen{$ms} ) {
 237                         # If this witness carries no correction, add this
 238                         # same object to its post-corrected state.
 239                         add_hash_entry( $edits_required, $pc_key,
 240                                          $edit_object );
 241                     }
 242                 }
 243             }
 244         }
 245     } # Finished going through the apparatus entries
 246
 247     # Now make the witness objects, and create their text sequences
 248     foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) {
 249         print STDERR "Creating witness $w\n";
 250         my $witness_obj = $collation->tradition->add_witness( sigil => $w );
 251         my $debug; #  = $w eq 'Vb11';
 252         my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w}, $debug );
 253         my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"}, $debug )
 254             if exists( $edits_required->{$w."_post"} );
 255
 256         my @repeated = check_for_repeated( @ante_corr_seq );
 257         warn "Repeated elements @repeated in $w a.c."
 258             if @repeated;
 259         @repeated = check_for_repeated( @post_corr_seq );
 260         warn "Repeated elements @repeated in $w p.c."
 261             if @repeated;
 262
 263         # Now save these paths in my witness object
 264         if( @post_corr_seq ) {
 265             $witness_obj->path( \@post_corr_seq );
 266             $witness_obj->uncorrected_path( \@ante_corr_seq );
 267         } else {
 268             $witness_obj->path( \@ante_corr_seq );
 269         }
 270     }
 271
 272     # Now remove our 'base text' edges, which is to say, the only
 273     # ones we have created so far.  Also remove any unwitnessed
 274     # lemma nodes (TODO unless we are treating base as witness)
 275     foreach ( $collation->paths() ) {
 276         $collation->del_path( $_, $collation->baselabel );
 277     }
 278
 279     ### HACKY HACKY Do some one-off path corrections here.
 280     if( $opts->{'input'} eq 'KUL' ) {
 281                 require 'data/boodts/s158.HACK';
 282                 KUL::HACK::pre_path_hack( $collation );
 283         }
 284
 285     # Now walk paths and calculate positional rank.
 286     $collation->make_witness_paths();
 287     # Now delete any orphaned readings.
 288         foreach my $r ( $collation->sequence->isolated_vertices ) {
 289                 print STDERR "Deleting unconnected reading $r / " .
 290                         $collation->reading( $r )->text . "\n";
 291                 $collation->del_reading( $r );
 292         }
 293
 294     KUL::HACK::post_path_hack( $collation ) if $opts->{'input'} eq 'KUL';
 295     # Have to check relationship validity at this point, because before that
 296     # we had no paths.
 297 #     foreach my $rel ( $collation->relationships ) {
 298 #         next unless $rel->equal_rank;
 299 #         unless( Text::Tradition::Collation::relationship_valid( $rel->from, $rel->to ) ) {
 300 #             warn sprintf( "Relationship type %s between %s and %s is invalid, deleting",
 301 #                             $rel->type, $rel->from->id, $rel->to->id );
 302 #         }
 303 #     }
 304     # $collation->calculate_ranks();
 305 }
 306
 307 =item B<read_base>
 308
 309 my @line_beginnings = read_base( 'reference.txt', $collation );
 310
 311 Takes a text file and a (presumed empty) collation object, adds the
 312 words as simple linear readings to the collation, and returns a
 313 list of readings that represent the beginning of lines. This collation
 314 is now the starting point for application of apparatus entries in
 315 merge_base, e.g. from a CSV file or a Classical Text Editor file.
 316
 317 =cut
 318
 319 sub read_base {
 320     my( $base_file, $collation ) = @_;
 321
 322     # This array gives the first reading for each line.  We put the
 323     # common starting point in line zero.
 324     my $last_reading = $collation->start;
 325     $base_text_index{$last_reading->id} = 0;
 326     my $lineref_array = [ $last_reading ]; # There is no line zero.
 327
 328     open( BASE, $base_file ) or die "Could not open file $base_file: $!";
 329     my $i = 1;
 330     while(<BASE>) {
 331         # Make the readings, and connect them up for the base, but
 332         # also save the first reading of each line in an array for the
 333         # purpose.
 334         # TODO use configurable reading separator
 335         chomp;
 336         my @words = split;
 337         my $started = 0;
 338         my $wordref = 0;
 339         my $lineref = scalar @$lineref_array;
 340         last if $SHORTEND && $lineref > $SHORTEND;
 341         foreach my $w ( @words ) {
 342             my $readingref = join( ',', $lineref, ++$wordref );
 343             my $reading = $collation->add_reading( { id => $readingref, text => $w } );
 344             unless( $started ) {
 345                 push( @$lineref_array, $reading );
 346                 $started = 1;
 347             }
 348             # Add edge paths in the graph, for easier tracking when
 349             # we start applying corrections.  These paths will be
 350             # removed when we're done.
 351             my $path = $collation->add_path( $last_reading, $reading,
 352                                              $collation->baselabel );
 353             $last_reading = $reading;
 354
 355             # Note an array index for the reading, for later correction splices.
 356             $base_text_index{$readingref} = $i++;
 357         }
 358     }
 359     close BASE;
 360     # Ending point for all texts
 361     $collation->add_path( $last_reading, $collation->end, $collation->baselabel );
 362     push( @$lineref_array, $collation->end );
 363     $base_text_index{$collation->end->id} = $i;
 364
 365     return( @$lineref_array );
 366 }
 367
 368 sub set_relationships {
 369     my( $collation, $app, $lemma, $variants ) = @_;
 370     foreach my $rkey ( keys %$variants ) {
 371         my $var = $variants->{$rkey}->{'reading'};
 372         my $type = $app->{sprintf( "_%s_type", $rkey )};
 373         my $noncorr = $app->{sprintf( "_%s_non_corr", $rkey )};
 374         my $nonindep = $app->{sprintf( "_%s_non_indep", $rkey )};
 375
 376         my %rel_options = ();
 377         $rel_options{'non_correctable'} = $noncorr if $noncorr && $noncorr =~ /^\d$/;
 378         $rel_options{'non_indep'} = $nonindep if $nonindep && $nonindep =~ /^\d$/;
 379
 380         if( $type =~ /^(inv|tr|rep)$/i ) {
 381             # Transposition or repetition: look for nodes with the
 382             # same label but different IDs and mark them.
 383             $type = 'repetition' if $type =~ /^rep/i;
 384             $rel_options{'type'} = $type;
 385             $rel_options{'equal_rank'} = undef;
 386             my %labels;
 387             foreach my $r ( @$lemma ) {
 388                 $labels{cmp_str( $r )} = $r;
 389             }
 390             foreach my $r( @$var ) {
 391                 if( exists $labels{$r->text} &&
 392                     $r->id ne $labels{$r->text}->id ) {
 393                     if( $type eq 'repetition' ) {
 394                         # Repetition
 395                         $collation->add_relationship( $r, $labels{$r->text}, \%rel_options );
 396                     } else {
 397                         # Transposition
 398                         $r->set_identical( $labels{$r->text} );
 399                     }
 400                 }
 401             }
 402         } elsif( $type =~ /^(gr|sp(el)?)$/i ) {
 403
 404             # Grammar/spelling/lexical: this can be a one-to-one or
 405             # one-to-many mapping.  We should think about merging
 406             # readings if it is one-to-many.
 407
 408             $type = 'grammatical' if $type =~ /gr/i;
 409             $type = 'spelling' if $type =~ /sp/i;
 410             $type = 'repetition' if $type =~ /rep/i;
 411             # $type = 'lexical' if $type =~ /lex/i;
 412             $rel_options{'type'} = $type;
 413             $rel_options{'equal_rank'} = 1;
 414             if( @$lemma == @$var ) {
 415                 foreach my $i ( 0 .. $#{$lemma} ) {
 416                     $collation->add_relationship( $var->[$i], $lemma->[$i],
 417                         \%rel_options );
 418                 }
 419             } else {
 420                 # An uneven many-to-many mapping.  Skip for now.
 421                 # We really want to make a segment out of whatever we have.
 422                 # my $lemseg = @$lemma > 1 ? $collation->add_segment( @$lemma ) : $lemma->[0];
 423                 # my $varseg = @$var > 1 ? $collation->add_segment( @$var ) : $var->[0];
 424                 # $collation->add_relationship( $varseg, $lemseg, \%rel_options );
 425                 if( @$lemma == 1 && @$var == 1 ) {
 426                     $collation->add_relationship( $lemma->[0], $var->[0], \%rel_options );
 427                 }
 428             }
 429         } elsif( $type !~ /^(add|om|lex)$/i ) {
 430             warn "Unrecognized type $type";
 431         }
 432     }
 433 }
 434
 435
 436
 437 sub apply_edits {
 438     my( $collation, $edit_sequence, $debug ) = @_;
 439     my @lemma_text = $collation->reading_sequence(
 440         $collation->start, $collation->end );
 441     my $drift = 0;
 442     foreach my $correction ( @$edit_sequence ) {
 443         my( $lemma_start, $length, $items ) = @$correction;
 444         my $offset = $base_text_index{$lemma_start};
 445         my $realoffset = $offset + $drift;
 446         if( $debug ||
 447             $lemma_text[$realoffset]->id ne $lemma_start ) {
 448             my @this_phrase = @lemma_text[$realoffset..$realoffset+$length-1];
 449             my @base_phrase;
 450             my $i = $realoffset;
 451             my $l = $collation->reading( $lemma_start );
 452             while( $i < $realoffset+$length ) {
 453                 push( @base_phrase, $l );
 454                 $l = $collation->next_reading( $l );
 455                 $i++;
 456             }
 457
 458             print STDERR sprintf( "Trying to replace %s (%s) starting at %d " .
 459                                   "with %s (%s) with drift %d\n",
 460                                   join( ' ', map {$_->text} @base_phrase ),
 461                                   join( ' ', map {$_->id} @base_phrase ),
 462                                   $realoffset,
 463                                   join( ' ', map {$_->text} @$items ),
 464                                   join( ' ', map {$_->id} @$items ),
 465                                   $drift,
 466                                   ) if $debug;
 467
 468             if( $lemma_text[$realoffset]->id ne $lemma_start ) {
 469                 warn( sprintf( "Should be replacing %s (%s) with %s (%s) " .
 470                                "but %s (%s) is there instead",
 471                                join( ' ', map {$_->text} @base_phrase ),
 472                                join( ' ', map {$_->id} @base_phrase ),
 473                                join( ' ', map {$_->text} @$items ),
 474                                join( ' ', map {$_->id} @$items ),
 475                                join( ' ', map {$_->text} @this_phrase ),
 476                                join( ' ', map {$_->id} @this_phrase ),
 477                       ) );
 478                 # next;
 479             }
 480         }
 481         splice( @lemma_text, $realoffset, $length, @$items );
 482         $drift += @$items - $length;
 483     }
 484     return @lemma_text;
 485 }
 486
 487
 488 # Helper function. Given a witness sigil, if it is a post-correctione
 489 # sigil,return the base witness.  If not, return a false value.
 490 sub _is_post_corr {
 491     my( $sigil ) = @_;
 492     if( $sigil =~ /^(.*?)(\s*\(?p\.\s*c\.\)?)$/ ) {
 493         return $1;
 494     }
 495     return undef;
 496 }
 497
 498
 499 =back
 500
 501 =head1 LICENSE
 502
 503 This package is free software and is provided "as is" without express
 504 or implied warranty.  You can redistribute it and/or modify it under
 505 the same terms as Perl itself.
 506
 507 =head1 AUTHOR
 508
 509 Tara L Andrews, aurum@cpan.org
 510
 511 =cut
 512
 513 1;