lib/Text/Tradition/Parser/BaseText.pm

   1 package Text::Tradition::Parser::BaseText;
   2
   3 use strict;
   4 use warnings;
   5 use Module::Load;
   6 use TryCatch;
   7 use Text::Tradition::Parser::Util qw( collate_variants cmp_str
   8         check_for_repeated add_hash_entry );
   9
  10 =head1 NAME
  11
  12 Text::Tradition::Parser::BaseText
  13
  14 =head1 SYNOPSIS
  15
  16 use Text::Tradition::Parser::BaseText qw( merge_base );
  17 merge_base( $graph, 'reference.txt', @apparatus_entries )
  18
  19 =head1 DESCRIPTION
  20
  21 For an overview of the package, see the documentation for the
  22 Text::Tradition module.
  23
  24 This module is meant for use with certain of the other Parser classes
  25 - whenever a list of variants is given with reference to a base text,
  26 these must be joined into a single collation.  The parser should
  27 therefore make a list of variants and their locations, and BaseText
  28 will join those listed variants onto the reference text.
  29
  30 =head1 SUBROUTINES
  31
  32 =over
  33
  34 =item B<parse>
  35
  36 parse( $graph, $opts );
  37
  38 Takes an initialized graph and a hashref of options, which must include:
  39 - 'base' - the base text referenced by the variants
  40 - 'format' - the format of the variant list
  41 - 'data' - the variants, in the given format.
  42
  43 =cut
  44
  45 sub parse {
  46     my( $tradition, $opts ) = @_;
  47
  48     my $format_mod = 'Text::Tradition::Parser::' . $opts->{'input'};
  49     load( $format_mod );
  50     # TODO Handle a string someday if we ever have a format other than KUL
  51     my @apparatus_entries = $format_mod->can('read')->( $opts );
  52     merge_base( $tradition->collation, $opts, @apparatus_entries );
  53 }
  54
  55 =item B<merge_base>
  56
  57 merge_base( $graph, 'reference.txt', @apparatus_entries )
  58
  59 Takes three arguments: a newly-initialized Text::Tradition::Graph
  60 object, a text file containing the reference text, and a list of
  61 variants (apparatus entries).  Adds the base text to the graph, and
  62 joins the variants to that.
  63
  64 The list of variants is an array of hash references; each hash takes
  65 the form
  66  { '_id' => line reference,
  67    'rdg_0' => lemma reading,
  68    'rdg_1' => first variant,
  69    ...  # and so on until all distinct readings are listed
  70    'WitnessA' => 'rdg_0',
  71    'WitnessB' => 'rdg_1',
  72    ...  # and so on until all witnesses are listed with their readings
  73  }
  74
  75 Any hash key that is not of the form /^rdg_\d+$/ and that does not
  76 begin with an underscore is assumed to be a witness name.  Any 'meta'
  77 information to be passed must be passed in a key with a leading
  78 underscore in its name.
  79
  80 =cut
  81
  82 my $SHORTEND = ''; # Debug var - set this to limit the number of lines parsed
  83
  84 my %base_text_index;
  85 my $edits_required = {};
  86
  87 # edits_required -> wit -> [ { start_idx, end_idx, items } ]
  88
  89 sub merge_base {
  90     my( $collation, $opts, @app_entries ) = @_;
  91     my @base_line_starts = read_base( $opts->{'base'}, $collation );
  92
  93     my %all_witnesses;
  94     foreach my $app ( @app_entries ) {
  95         my( $line, $num ) = split( /\./, $app->{_id} );
  96         # DEBUG with a short graph
  97         last if $SHORTEND && $line > $SHORTEND;
  98         # DEBUG for problematic entries
  99         my $scrutinize = '';
 100         my $first_line_reading = $base_line_starts[ $line ];
 101         my $too_far = $base_line_starts[ $line+1 ];
 102
 103         my $lemma = $app->{rdg_0};
 104         my $seq = 1;
 105         # Is this the Nth occurrence of this reading in the line?
 106         if( $lemma =~ s/(_)?(\d)$// ) {
 107             $seq = $2;
 108         }
 109         my @lemma_words = split( /\s+/, $lemma );
 110
 111         # Now search for the lemma words within this line.
 112         my $lemma_start = $first_line_reading;
 113         my $lemma_end;
 114         my %seen;
 115         while( $lemma_start ne $too_far ) {
 116             # Loop detection
 117             if( $seen{ $lemma_start->id() } ) {
 118                 warn "Detected loop at " . $lemma_start->id() .
 119                     ", ref $line,$num";
 120                 last;
 121             }
 122             $seen{ $lemma_start->id() } = 1;
 123
 124             # Try to match the lemma.
 125             my $unmatch = 0;
 126             print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
 127                 $lemma_words[0] . "...\n"
 128                 if "$line.$num" eq $scrutinize;
 129             if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
 130                 # Skip it if we need a match that is not the first.
 131                 if( --$seq < 1 ) {
 132                     # Now we have to compare the rest of the words here.
 133                     if( scalar( @lemma_words ) > 1 ) {
 134                         my $next_reading =
 135                             $collation->next_reading( $lemma_start );
 136                         foreach my $w ( @lemma_words[1..$#lemma_words] ) {
 137                             printf STDERR "Now matching %s against %s\n",
 138                                     cmp_str($next_reading), $w
 139                                 if "$line.$num" eq $scrutinize;
 140                             if( $w ne cmp_str($next_reading) ) {
 141                                 $unmatch = 1;
 142                                 last;
 143                             } else {
 144                                 $lemma_end = $next_reading;
 145                                 $next_reading =
 146                                     $collation->next_reading( $lemma_end );
 147                             }
 148                         }
 149                     } else {
 150                         $lemma_end = $lemma_start;
 151                     }
 152                 } else {
 153                     $unmatch = 1;
 154                 }
 155             }
 156             last unless ( $unmatch || !defined( $lemma_end ) );
 157             $lemma_end = undef;
 158             $lemma_start = $collation->next_reading( $lemma_start );
 159         }
 160
 161         unless( $lemma_end ) {
 162             warn "No match found for @lemma_words at $line.$num";
 163             next;
 164         }
 165
 166         # Now we have found the lemma; we will record an 'edit', in
 167         # terms of a splice operation, for each subsequent reading.
 168         # We also note which witnesses take the given edit.
 169
 170         my @lemma_set = $collation->reading_sequence( $lemma_start,
 171                                                       $lemma_end );
 172         my @reading_sets = [ @lemma_set ];
 173
 174         # For each reading that is not rdg_0, we create the variant
 175         # reading nodes, and store the range as an edit operation on
 176         # the base text.
 177         my $variant_objects;
 178         my %pc_seen; # Keep track of mss with explicit post-corr data
 179         foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
 180             my @mss = grep { $app->{$_} eq $k } keys( %$app );
 181
 182             # Keep track of what witnesses we have seen.
 183             @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
 184             # Keep track of which witnesses bear corrected readings here.
 185             foreach my $m ( @mss ) {
 186                 my $base = _is_post_corr( $m );
 187                 next unless $base;
 188                 $pc_seen{$base} = 1;
 189             }
 190             next if $k eq 'rdg_0';
 191
 192             # Parse the variant into reading tokens.
 193             # TODO don't hardcode the reading split operation
 194             my @variant = split( /\s+/, $app->{$k} );
 195             @variant = () if $app->{$k} eq '/'; # This is an omission.
 196
 197             my @variant_readings;
 198             my $ctr = 0;
 199             foreach my $vw ( @variant ) {
 200                 my $vwname = "$k/$line.$num.$ctr"; $ctr++;
 201                 my $vwreading = $collation->add_reading( {
 202                         'id' => $vwname,
 203                         'text' => $vw } );
 204                 push( @variant_readings, $vwreading );
 205             }
 206
 207             $variant_objects->{$k} = { 'mss' => \@mss,
 208                                        'reading' => \@variant_readings,
 209             };
 210             push( @reading_sets, \@variant_readings );
 211         }
 212
 213         # Now collate and collapse the identical readings within the
 214         # collated sets.  Modifies the reading sets that were passed.
 215         collate_variants( $collation, @reading_sets );
 216
 217         # Record any stated relationships between the nodes and the lemma.
 218         set_relationships( $collation, $app, \@lemma_set, $variant_objects );
 219
 220         # Now create the splice-edit objects that will be used
 221         # to reconstruct each witness.
 222
 223         foreach my $rkey ( keys %$variant_objects ) {
 224             # Object is argument list for splice, so:
 225             # offset, length, replacements
 226             my $edit_object = [ $lemma_start->id,
 227                                 scalar( @lemma_set ),
 228                                 $variant_objects->{$rkey}->{reading} ];
 229             foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
 230                 # Is this a p.c. entry?
 231                 my $base = _is_post_corr( $ms );
 232                 if( $base ) { # this is a post-corr witness
 233                     my $pc_key = $base . "_post";
 234                     add_hash_entry( $edits_required, $pc_key, $edit_object );
 235                 } else { # this is an ante-corr witness
 236                     my $pc_key = $ms . "_post";
 237                     add_hash_entry( $edits_required, $ms, $edit_object );
 238                     unless( $pc_seen{$ms} ) {
 239                         # If this witness carries no correction, add this
 240                         # same object to its post-corrected state.
 241                         add_hash_entry( $edits_required, $pc_key,
 242                                          $edit_object );
 243                     }
 244                 }
 245             }
 246         }
 247     } # Finished going through the apparatus entries
 248
 249     # Now make the witness objects, and create their text sequences
 250     foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) {
 251         print STDERR "Creating witness $w\n";
 252         my $witness_obj = $collation->tradition->add_witness( sigil => $w );
 253         my $debug; #  = $w eq 'Vb11';
 254         my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w}, $debug );
 255         my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"}, $debug )
 256             if exists( $edits_required->{$w."_post"} );
 257
 258         my @repeated = check_for_repeated( @ante_corr_seq );
 259         warn "Repeated elements @repeated in $w a.c."
 260             if @repeated;
 261         @repeated = check_for_repeated( @post_corr_seq );
 262         warn "Repeated elements @repeated in $w p.c."
 263             if @repeated;
 264
 265         # Now save these paths in my witness object
 266         if( @post_corr_seq ) {
 267             $witness_obj->path( \@post_corr_seq );
 268             $witness_obj->uncorrected_path( \@ante_corr_seq );
 269         } else {
 270             $witness_obj->path( \@ante_corr_seq );
 271         }
 272     }
 273
 274     # Now remove our 'base text' edges, which is to say, the only
 275     # ones we have created so far.  Also remove any unwitnessed
 276     # lemma nodes (TODO unless we are treating base as witness)
 277     foreach ( $collation->paths() ) {
 278         $collation->del_path( $_, $collation->baselabel );
 279     }
 280
 281     ### HACKY HACKY Do some one-off path corrections here.
 282     if( $opts->{'input'} eq 'KUL' ) {
 283                 require 'data/boodts/s158.HACK';
 284                 KUL::HACK::pre_path_hack( $collation );
 285         }
 286
 287     # Now walk paths and calculate positional rank.
 288     $collation->make_witness_paths();
 289     # Now delete any orphaned readings.
 290         foreach my $r ( $collation->sequence->isolated_vertices ) {
 291                 print STDERR "Deleting unconnected reading $r / " .
 292                         $collation->reading( $r )->text . "\n";
 293                 $collation->del_reading( $r );
 294         }
 295
 296     KUL::HACK::post_path_hack( $collation ) if $opts->{'input'} eq 'KUL';
 297     # Have to check relationship validity at this point, because before that
 298     # we had no paths.
 299 #     foreach my $rel ( $collation->relationships ) {
 300 #         next unless $rel->equal_rank;
 301 #         unless( Text::Tradition::Collation::relationship_valid( $rel->from, $rel->to ) ) {
 302 #             warn sprintf( "Relationship type %s between %s and %s is invalid, deleting",
 303 #                             $rel->type, $rel->from->id, $rel->to->id );
 304 #         }
 305 #     }
 306     $collation->calculate_ranks();
 307 }
 308
 309 =item B<read_base>
 310
 311 my @line_beginnings = read_base( 'reference.txt', $collation );
 312
 313 Takes a text file and a (presumed empty) collation object, adds the
 314 words as simple linear readings to the collation, and returns a
 315 list of readings that represent the beginning of lines. This collation
 316 is now the starting point for application of apparatus entries in
 317 merge_base, e.g. from a CSV file or a Classical Text Editor file.
 318
 319 =cut
 320
 321 sub read_base {
 322     my( $base_file, $collation ) = @_;
 323
 324     # This array gives the first reading for each line.  We put the
 325     # common starting point in line zero.
 326     my $last_reading = $collation->start;
 327     $base_text_index{$last_reading->id} = 0;
 328     my $lineref_array = [ $last_reading ]; # There is no line zero.
 329
 330     open( BASE, $base_file ) or die "Could not open file $base_file: $!";
 331     my $i = 1;
 332     while(<BASE>) {
 333         # Make the readings, and connect them up for the base, but
 334         # also save the first reading of each line in an array for the
 335         # purpose.
 336         # TODO use configurable reading separator
 337         chomp;
 338         my @words = split;
 339         my $started = 0;
 340         my $wordref = 0;
 341         my $lineref = scalar @$lineref_array;
 342         last if $SHORTEND && $lineref > $SHORTEND;
 343         foreach my $w ( @words ) {
 344             my $readingref = join( ',', $lineref, ++$wordref );
 345             my $reading = $collation->add_reading( { id => $readingref, text => $w } );
 346             unless( $started ) {
 347                 push( @$lineref_array, $reading );
 348                 $started = 1;
 349             }
 350             # Add edge paths in the graph, for easier tracking when
 351             # we start applying corrections.  These paths will be
 352             # removed when we're done.
 353             my $path = $collation->add_path( $last_reading, $reading,
 354                                              $collation->baselabel );
 355             $last_reading = $reading;
 356
 357             # Note an array index for the reading, for later correction splices.
 358             $base_text_index{$readingref} = $i++;
 359         }
 360     }
 361     close BASE;
 362     # Ending point for all texts
 363     $collation->add_path( $last_reading, $collation->end, $collation->baselabel );
 364     push( @$lineref_array, $collation->end );
 365     $base_text_index{$collation->end->id} = $i;
 366
 367     return( @$lineref_array );
 368 }
 369
 370 sub set_relationships {
 371     my( $collation, $app, $lemma, $variants ) = @_;
 372     foreach my $rkey ( keys %$variants ) {
 373         my $var = $variants->{$rkey}->{'reading'};
 374         my $type = $app->{sprintf( "_%s_type", $rkey )};
 375         my $noncorr = $app->{sprintf( "_%s_non_corr", $rkey )};
 376         my $nonindep = $app->{sprintf( "_%s_non_indep", $rkey )};
 377
 378         my %rel_options = ();
 379         $rel_options{'non_correctable'} = $noncorr if $noncorr && $noncorr =~ /^\d$/;
 380         $rel_options{'non_indep'} = $nonindep if $nonindep && $nonindep =~ /^\d$/;
 381
 382         if( $type =~ /^(inv|tr|rep)$/i ) {
 383             # Transposition or repetition: look for nodes with the
 384             # same label but different IDs and mark them.
 385             $type = 'repetition' if $type =~ /^rep/i;
 386             $rel_options{'type'} = $type;
 387             $rel_options{'equal_rank'} = undef;
 388             my %labels;
 389             foreach my $r ( @$lemma ) {
 390                 $labels{cmp_str( $r )} = $r;
 391             }
 392             foreach my $r( @$var ) {
 393                 if( exists $labels{$r->text} &&
 394                     $r->id ne $labels{$r->text}->id ) {
 395                     if( $type eq 'repetition' ) {
 396                         # Repetition
 397                         try {
 398                                 $collation->add_relationship( $r, $labels{$r->text}, \%rel_options );
 399                         } catch( Text::Tradition::Error $e ) {
 400                                 warn "Could not set repetition relationship $r -> "
 401                                         . $labels{$r->text} . ": " . $e->message;
 402                         }
 403                     } else {
 404                         # Transposition
 405                         try {
 406                                 $r->set_identical( $labels{$r->text} );
 407                         } catch( Text::Tradition::Error $e ) {
 408                                 warn "Could not set transposition relationship $r -> "
 409                                         . $labels{$r->text} . ": " . $e->message;
 410                         }
 411                     }
 412                 }
 413             }
 414         } elsif( $type =~ /^(gr|sp(el)?)$/i ) {
 415
 416             # Grammar/spelling/lexical: this can be a one-to-one or
 417             # one-to-many mapping.  We should think about merging
 418             # readings if it is one-to-many.
 419
 420             $type = 'grammatical' if $type =~ /gr/i;
 421             $type = 'spelling' if $type =~ /sp/i;
 422             $type = 'repetition' if $type =~ /rep/i;
 423             # $type = 'lexical' if $type =~ /lex/i;
 424             $rel_options{'type'} = $type;
 425             $rel_options{'equal_rank'} = 1;
 426             if( @$lemma == @$var ) {
 427                 foreach my $i ( 0 .. $#{$lemma} ) {
 428                         try {
 429                                                 $collation->add_relationship( $var->[$i], $lemma->[$i],
 430                                                         \%rel_options );
 431                                         } catch( Text::Tradition::Error $e ) {
 432                                                 warn "Could not set $type relationship " . $var->[$i] . " -> "
 433                                                         . $lemma->[$i] . ": " . $e->message;
 434                                         }
 435                 }
 436             } else {
 437                 # An uneven many-to-many mapping.  Skip for now.
 438                 # We really want to make a segment out of whatever we have.
 439                 # my $lemseg = @$lemma > 1 ? $collation->add_segment( @$lemma ) : $lemma->[0];
 440                 # my $varseg = @$var > 1 ? $collation->add_segment( @$var ) : $var->[0];
 441                 # $collation->add_relationship( $varseg, $lemseg, \%rel_options );
 442                 # if( @$lemma == 1 && @$var == 1 ) {
 443                 #     $collation->add_relationship( $lemma->[0], $var->[0], \%rel_options );
 444                 # }
 445             }
 446         } elsif( $type !~ /^(add|om|lex)$/i ) {
 447             warn "Unrecognized type $type";
 448         }
 449     }
 450 }
 451
 452
 453
 454 sub apply_edits {
 455     my( $collation, $edit_sequence, $debug ) = @_;
 456     my @lemma_text = $collation->reading_sequence(
 457         $collation->start, $collation->end );
 458     my $drift = 0;
 459     foreach my $correction ( @$edit_sequence ) {
 460         my( $lemma_start, $length, $items ) = @$correction;
 461         my $offset = $base_text_index{$lemma_start};
 462         my $realoffset = $offset + $drift;
 463         if( $debug ||
 464             $lemma_text[$realoffset]->id ne $lemma_start ) {
 465             my @this_phrase = @lemma_text[$realoffset..$realoffset+$length-1];
 466             my @base_phrase;
 467             my $i = $realoffset;
 468             my $l = $collation->reading( $lemma_start );
 469             while( $i < $realoffset+$length ) {
 470                 push( @base_phrase, $l );
 471                 $l = $collation->next_reading( $l );
 472                 $i++;
 473             }
 474
 475             print STDERR sprintf( "Trying to replace %s (%s) starting at %d " .
 476                                   "with %s (%s) with drift %d\n",
 477                                   join( ' ', map {$_->text} @base_phrase ),
 478                                   join( ' ', map {$_->id} @base_phrase ),
 479                                   $realoffset,
 480                                   join( ' ', map {$_->text} @$items ),
 481                                   join( ' ', map {$_->id} @$items ),
 482                                   $drift,
 483                                   ) if $debug;
 484
 485             if( $lemma_text[$realoffset]->id ne $lemma_start ) {
 486                 warn( sprintf( "Should be replacing %s (%s) with %s (%s) " .
 487                                "but %s (%s) is there instead",
 488                                join( ' ', map {$_->text} @base_phrase ),
 489                                join( ' ', map {$_->id} @base_phrase ),
 490                                join( ' ', map {$_->text} @$items ),
 491                                join( ' ', map {$_->id} @$items ),
 492                                join( ' ', map {$_->text} @this_phrase ),
 493                                join( ' ', map {$_->id} @this_phrase ),
 494                       ) );
 495                 # next;
 496             }
 497         }
 498         splice( @lemma_text, $realoffset, $length, @$items );
 499         $drift += @$items - $length;
 500     }
 501     return @lemma_text;
 502 }
 503
 504
 505 # Helper function. Given a witness sigil, if it is a post-correctione
 506 # sigil,return the base witness.  If not, return a false value.
 507 sub _is_post_corr {
 508     my( $sigil ) = @_;
 509     if( $sigil =~ /^(.*?)(\s*\(?p\.\s*c\.\)?)$/ ) {
 510         return $1;
 511     }
 512     return undef;
 513 }
 514
 515
 516 =back
 517
 518 =head1 LICENSE
 519
 520 This package is free software and is provided "as is" without express
 521 or implied warranty.  You can redistribute it and/or modify it under
 522 the same terms as Perl itself.
 523
 524 =head1 AUTHOR
 525
 526 Tara L Andrews, aurum@cpan.org
 527
 528 =cut
 529
 530 1;