lib/Text/Tradition/Parser/BaseText.pm

   1 package Text::Tradition::Parser::BaseText;
   2
   3 use strict;
   4 use warnings;
   5 use Module::Load;
   6 use Text::Tradition::Parser::Util qw( collate_variants cmp_str check_for_repeated add_hash_entry );
   7
   8 =head1 NAME
   9
  10 Text::Tradition::Parser::BaseText
  11
  12 =head1 SYNOPSIS
  13
  14 use Text::Tradition::Parser::BaseText qw( merge_base );
  15 merge_base( $graph, 'reference.txt', @apparatus_entries )
  16
  17 =head1 DESCRIPTION
  18
  19 For an overview of the package, see the documentation for the
  20 Text::Tradition module.
  21
  22 This module is meant for use with certain of the other Parser classes
  23 - whenever a list of variants is given with reference to a base text,
  24 these must be joined into a single collation.  The parser should
  25 therefore make a list of variants and their locations, and BaseText
  26 will join those listed variants onto the reference text.
  27
  28 =head1 SUBROUTINES
  29
  30 =over
  31
  32 =item B<parse>
  33
  34 parse( $graph, $opts );
  35
  36 Takes an initialized graph and a hashref of options, which must include:
  37 - 'base' - the base text referenced by the variants
  38 - 'format' - the format of the variant list
  39 - 'data' - the variants, in the given format.
  40
  41 =cut
  42
  43 sub parse {
  44     my( $tradition, $opts ) = @_;
  45
  46     my $format_mod = 'Text::Tradition::Parser::' . $opts->{'input'};
  47     load( $format_mod );
  48     # TODO Handle a string someday if we ever have a format other than KUL
  49     my @apparatus_entries = $format_mod->can('read')->( $opts );
  50     merge_base( $tradition->collation, $opts->{'base'}, @apparatus_entries );
  51 }
  52
  53 =item B<merge_base>
  54
  55 merge_base( $graph, 'reference.txt', @apparatus_entries )
  56
  57 Takes three arguments: a newly-initialized Text::Tradition::Graph
  58 object, a text file containing the reference text, and a list of
  59 variants (apparatus entries).  Adds the base text to the graph, and
  60 joins the variants to that.
  61
  62 The list of variants is an array of hash references; each hash takes
  63 the form
  64  { '_id' => line reference,
  65    'rdg_0' => lemma reading,
  66    'rdg_1' => first variant,
  67    ...  # and so on until all distinct readings are listed
  68    'WitnessA' => 'rdg_0',
  69    'WitnessB' => 'rdg_1',
  70    ...  # and so on until all witnesses are listed with their readings
  71  }
  72
  73 Any hash key that is not of the form /^rdg_\d+$/ and that does not
  74 begin with an underscore is assumed to be a witness name.  Any 'meta'
  75 information to be passed must be passed in a key with a leading
  76 underscore in its name.
  77
  78 =cut
  79
  80 my $SHORTEND = ''; # Debug var - set this to limit the number of lines parsed
  81
  82 my %base_text_index;
  83 my $edits_required = {};
  84
  85 # edits_required -> wit -> [ { start_idx, end_idx, items } ]
  86
  87 sub merge_base {
  88     my( $collation, $base_file, @app_entries ) = @_;
  89     my @base_line_starts = read_base( $base_file, $collation );
  90
  91     my %all_witnesses;
  92     my @unwitnessed_lemma_nodes;
  93     foreach my $app ( @app_entries ) {
  94         my( $line, $num ) = split( /\./, $app->{_id} );
  95         # DEBUG with a short graph
  96         last if $SHORTEND && $line > $SHORTEND;
  97         # DEBUG for problematic entries
  98         my $scrutinize = '';
  99         my $first_line_reading = $base_line_starts[ $line ];
 100         my $too_far = $base_line_starts[ $line+1 ];
 101
 102         my $lemma = $app->{rdg_0};
 103         my $seq = 1;
 104         # Is this the Nth occurrence of this reading in the line?
 105         if( $lemma =~ s/(_)?(\d)$// ) {
 106             $seq = $2;
 107         }
 108         my @lemma_words = split( /\s+/, $lemma );
 109
 110         # Now search for the lemma words within this line.
 111         my $lemma_start = $first_line_reading;
 112         my $lemma_end;
 113         my %seen;
 114         while( $lemma_start ne $too_far ) {
 115             # Loop detection
 116             if( $seen{ $lemma_start->name() } ) {
 117                 warn "Detected loop at " . $lemma_start->name() .
 118                     ", ref $line,$num";
 119                 last;
 120             }
 121             $seen{ $lemma_start->name() } = 1;
 122
 123             # Try to match the lemma.
 124             my $unmatch = 0;
 125             print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
 126                 $lemma_words[0] . "...\n"
 127                 if "$line.$num" eq $scrutinize;
 128             if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
 129                 # Skip it if we need a match that is not the first.
 130                 if( --$seq < 1 ) {
 131                     # Now we have to compare the rest of the words here.
 132                     if( scalar( @lemma_words ) > 1 ) {
 133                         my $next_reading =
 134                             $collation->next_reading( $lemma_start );
 135                         foreach my $w ( @lemma_words[1..$#lemma_words] ) {
 136                             printf STDERR "Now matching %s against %s\n",
 137                                     cmp_str($next_reading), $w
 138                                 if "$line.$num" eq $scrutinize;
 139                             if( $w ne cmp_str($next_reading) ) {
 140                                 $unmatch = 1;
 141                                 last;
 142                             } else {
 143                                 $lemma_end = $next_reading;
 144                                 $next_reading =
 145                                     $collation->next_reading( $lemma_end );
 146                             }
 147                         }
 148                     } else {
 149                         $lemma_end = $lemma_start;
 150                     }
 151                 } else {
 152                     $unmatch = 1;
 153                 }
 154             }
 155             last unless ( $unmatch || !defined( $lemma_end ) );
 156             $lemma_end = undef;
 157             $lemma_start = $collation->next_reading( $lemma_start );
 158         }
 159
 160         unless( $lemma_end ) {
 161             warn "No match found for @lemma_words at $line.$num";
 162             next;
 163         }
 164
 165         # Now we have found the lemma; we will record an 'edit', in
 166         # terms of a splice operation, for each subsequent reading.
 167         # We also note which witnesses take the given edit.
 168
 169         my @lemma_set = $collation->reading_sequence( $lemma_start,
 170                                                       $lemma_end );
 171         my @reading_sets = [ @lemma_set ];
 172
 173         # For each reading that is not rdg_0, we create the variant
 174         # reading nodes, and store the range as an edit operation on
 175         # the base text.
 176         my $variant_objects;
 177         my %pc_seen; # Keep track of mss with explicit post-corr data
 178         foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
 179             my @mss = grep { $app->{$_} eq $k } keys( %$app );
 180
 181             # Keep track of lemma nodes that don't actually appear in
 182             # any MSS; we will want to remove them from the collation.
 183             push( @unwitnessed_lemma_nodes, @lemma_set )
 184                 if !@mss && $k eq 'rdg_0';
 185
 186             # Keep track of what witnesses we have seen.
 187             @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
 188             # Keep track of which witnesses bear corrected readings here.
 189             foreach my $m ( @mss ) {
 190                 my $base = _is_post_corr( $m );
 191                 next unless $base;
 192                 $pc_seen{$base} = 1;
 193             }
 194             next if $k eq 'rdg_0';
 195
 196             # Parse the variant into reading tokens.
 197             # TODO don't hardcode the reading split operation
 198             my @variant = split( /\s+/, $app->{$k} );
 199             @variant = () if $app->{$k} eq '/'; # This is an omission.
 200
 201             my @variant_readings;
 202             my $ctr = 0;
 203             foreach my $vw ( @variant ) {
 204                 my $vwname = "$k/$line.$num.$ctr"; $ctr++;
 205                 my $vwreading = $collation->add_reading( $vwname );
 206                 $vwreading->text( $vw );
 207                 push( @variant_readings, $vwreading );
 208             }
 209
 210             $variant_objects->{$k} = { 'mss' => \@mss,
 211                                        'reading' => \@variant_readings,
 212             };
 213             push( @reading_sets, \@variant_readings );
 214         }
 215
 216         # Now collate and collapse the identical readings within the
 217         # collated sets.  Modifies the reading sets that were passed.
 218         collate_variants( $collation, @reading_sets );
 219
 220         # Record any stated relationships between the nodes and the lemma.
 221         set_relationships( $collation, $app, \@lemma_set, $variant_objects );
 222
 223         # Now create the splice-edit objects that will be used
 224         # to reconstruct each witness.
 225
 226         foreach my $rkey ( keys %$variant_objects ) {
 227             # Object is argument list for splice, so:
 228             # offset, length, replacements
 229             my $edit_object = [ $lemma_start->name,
 230                                 scalar( @lemma_set ),
 231                                 $variant_objects->{$rkey}->{reading} ];
 232             foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
 233                 # Is this a p.c. entry?
 234                 my $base = _is_post_corr( $ms );
 235                 if( $base ) { # this is a post-corr witness
 236                     my $pc_key = $base . "_post";
 237                     add_hash_entry( $edits_required, $pc_key, $edit_object );
 238                 } else { # this is an ante-corr witness
 239                     my $pc_key = $ms . "_post";
 240                     add_hash_entry( $edits_required, $ms, $edit_object );
 241                     unless( $pc_seen{$ms} ) {
 242                         # If this witness carries no correction, add this
 243                         # same object to its post-corrected state.
 244                         add_hash_entry( $edits_required, $pc_key,
 245                                          $edit_object );
 246                     }
 247                 }
 248             }
 249         }
 250     } # Finished going through the apparatus entries
 251
 252     # Now make the witness objects, and create their text sequences
 253     foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) {
 254         print STDERR "Creating witness $w\n";
 255         my $witness_obj = $collation->tradition->add_witness( sigil => $w );
 256         my $debug; #  = $w eq 'Vb11';
 257         my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w}, $debug );
 258         my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"}, $debug )
 259             if exists( $edits_required->{$w."_post"} );
 260
 261         my @repeated = check_for_repeated( @ante_corr_seq );
 262         warn "Repeated elements @repeated in $w a.c."
 263             if @repeated;
 264         @repeated = check_for_repeated( @post_corr_seq );
 265         warn "Repeated elements @repeated in $w p.c."
 266             if @repeated;
 267
 268         # Now save these paths in my witness object
 269         if( @post_corr_seq ) {
 270             $witness_obj->path( \@post_corr_seq );
 271             $witness_obj->uncorrected_path( \@ante_corr_seq );
 272         } else {
 273             $witness_obj->path( \@ante_corr_seq );
 274         }
 275     }
 276
 277     # Now remove our 'base text' edges, which is to say, the only
 278     # ones we have created so far.  Also remove any unwitnessed
 279     # lemma nodes (TODO unless we are treating base as witness)
 280     foreach ( $collation->paths() ) {
 281         $collation->del_path( $_ );
 282     }
 283     foreach( @unwitnessed_lemma_nodes ) {
 284         $collation->del_reading( $_ );
 285         # TODO do we need to delete any relationship paths here?
 286     }
 287
 288     ### HACKY HACKY Do some one-off path corrections here.
 289     require( 'data/boodts/s158.HACK' );
 290     KUL::HACK::pre_path_hack( $collation );
 291
 292     # Now walk paths and calculate positional rank.
 293     $collation->make_witness_paths();
 294     KUL::HACK::post_path_hack( $collation );
 295     # Have to check relationship validity at this point, because before that
 296     # we had no paths.
 297 #     foreach my $rel ( $collation->relationships ) {
 298 #         next unless $rel->equal_rank;
 299 #         unless( Text::Tradition::Collation::relationship_valid( $rel->from, $rel->to ) ) {
 300 #             warn sprintf( "Relationship type %s between %s and %s is invalid, deleting",
 301 #                             $rel->type, $rel->from->name, $rel->to->name );
 302 #         }
 303 #     }
 304     $collation->calculate_ranks();
 305 }
 306
 307 =item B<read_base>
 308
 309 my @line_beginnings = read_base( 'reference.txt', $collation );
 310
 311 Takes a text file and a (presumed empty) collation object, adds the
 312 words as simple linear readings to the collation, and returns a
 313 list of readings that represent the beginning of lines. This collation
 314 is now the starting point for application of apparatus entries in
 315 merge_base, e.g. from a CSV file or a Classical Text Editor file.
 316
 317 =cut
 318
 319 sub read_base {
 320     my( $base_file, $collation ) = @_;
 321
 322     # This array gives the first reading for each line.  We put the
 323     # common starting point in line zero.
 324     my $last_reading = $collation->start();
 325     $base_text_index{$last_reading->name} = 0;
 326     my $lineref_array = [ $last_reading ]; # There is no line zero.
 327
 328     open( BASE, $base_file ) or die "Could not open file $base_file: $!";
 329     my $i = 1;
 330     while(<BASE>) {
 331         # Make the readings, and connect them up for the base, but
 332         # also save the first reading of each line in an array for the
 333         # purpose.
 334         # TODO use configurable reading separator
 335         chomp;
 336         my @words = split;
 337         my $started = 0;
 338         my $wordref = 0;
 339         my $lineref = scalar @$lineref_array;
 340         last if $SHORTEND && $lineref > $SHORTEND;
 341         foreach my $w ( @words ) {
 342             my $readingref = join( ',', $lineref, ++$wordref );
 343             my $reading = $collation->add_reading( $readingref );
 344             $reading->text( $w );
 345             unless( $started ) {
 346                 push( @$lineref_array, $reading );
 347                 $started = 1;
 348             }
 349             # Add edge paths in the graph, for easier tracking when
 350             # we start applying corrections.  These paths will be
 351             # removed when we're done.
 352             my $path = $collation->add_path( $last_reading, $reading,
 353                                              $collation->baselabel );
 354             $last_reading = $reading;
 355
 356             # Note an array index for the reading, for later correction splices.
 357             $base_text_index{$readingref} = $i++;
 358         }
 359     }
 360     close BASE;
 361     # Ending point for all texts
 362     $collation->add_path( $last_reading, $collation->end, $collation->baselabel );
 363     push( @$lineref_array, $collation->end );
 364     $base_text_index{$collation->end->name} = $i;
 365
 366     return( @$lineref_array );
 367 }
 368
 369 sub set_relationships {
 370     my( $collation, $app, $lemma, $variants ) = @_;
 371     foreach my $rkey ( keys %$variants ) {
 372         my $var = $variants->{$rkey}->{'reading'};
 373         my $type = $app->{sprintf( "_%s_type", $rkey )};
 374         my $noncorr = $app->{sprintf( "_%s_non_corr", $rkey )};
 375         my $nonindep = $app->{sprintf( "_%s_non_indep", $rkey )};
 376
 377         my %rel_options = ();
 378         $rel_options{'non_correctable'} = $noncorr if $noncorr && $noncorr =~ /^\d$/;
 379         $rel_options{'non_indep'} = $nonindep if $nonindep && $nonindep =~ /^\d$/;
 380
 381         if( $type =~ /^(inv|tr|rep)$/i ) {
 382             # Transposition or repetition: look for nodes with the
 383             # same label but different IDs and mark them.
 384             $type = 'repetition' if $type =~ /^rep/i;
 385             $rel_options{'type'} = $type;
 386             $rel_options{'equal_rank'} = undef;
 387             my %labels;
 388             foreach my $r ( @$lemma ) {
 389                 $labels{cmp_str( $r )} = $r;
 390             }
 391             foreach my $r( @$var ) {
 392                 if( exists $labels{$r->label} &&
 393                     $r->name ne $labels{$r->label}->name ) {
 394                     if( $type eq 'repetition' ) {
 395                         # Repetition
 396                         $collation->add_relationship( $r, $labels{$r->label}, \%rel_options );
 397                     } else {
 398                         # Transposition
 399                         $r->set_identical( $labels{$r->label} );
 400                     }
 401                 }
 402             }
 403         } elsif( $type =~ /^(gr|sp(el)?)$/i ) {
 404
 405             # Grammar/spelling/lexical: this can be a one-to-one or
 406             # one-to-many mapping.  We should think about merging
 407             # readings if it is one-to-many.
 408
 409             $type = 'grammatical' if $type =~ /gr/i;
 410             $type = 'spelling' if $type =~ /sp/i;
 411             $type = 'repetition' if $type =~ /rep/i;
 412             # $type = 'lexical' if $type =~ /lex/i;
 413             $rel_options{'type'} = $type;
 414             $rel_options{'equal_rank'} = 1;
 415             if( @$lemma == @$var ) {
 416                 foreach my $i ( 0 .. $#{$lemma} ) {
 417                     $collation->add_relationship( $var->[$i], $lemma->[$i],
 418                         \%rel_options );
 419                 }
 420             } else {
 421                 # An uneven many-to-many mapping.  Skip for now.
 422                 # We really want to make a segment out of whatever we have.
 423                 # my $lemseg = @$lemma > 1 ? $collation->add_segment( @$lemma ) : $lemma->[0];
 424                 # my $varseg = @$var > 1 ? $collation->add_segment( @$var ) : $var->[0];
 425                 # $collation->add_relationship( $varseg, $lemseg, \%rel_options );
 426                 if( @$lemma == 1 && @$var == 1 ) {
 427                     $collation->add_relationship( $lemma->[0], $var->[0], \%rel_options );
 428                 }
 429             }
 430         } elsif( $type !~ /^(add|om|lex)$/i ) {
 431             warn "Unrecognized type $type";
 432         }
 433     }
 434 }
 435
 436
 437
 438 sub apply_edits {
 439     my( $collation, $edit_sequence, $debug ) = @_;
 440     my @lemma_text = $collation->reading_sequence( $collation->start,
 441                                            $collation->reading( '#END#' ) );
 442     my $drift = 0;
 443     foreach my $correction ( @$edit_sequence ) {
 444         my( $lemma_start, $length, $items ) = @$correction;
 445         my $offset = $base_text_index{$lemma_start};
 446         my $realoffset = $offset + $drift;
 447         if( $debug ||
 448             $lemma_text[$realoffset]->name ne $lemma_start ) {
 449             my @this_phrase = @lemma_text[$realoffset..$realoffset+$length-1];
 450             my @base_phrase;
 451             my $i = $realoffset;
 452             my $l = $collation->reading( $lemma_start );
 453             while( $i < $realoffset+$length ) {
 454                 push( @base_phrase, $l );
 455                 $l = $collation->next_reading( $l );
 456                 $i++;
 457             }
 458
 459             print STDERR sprintf( "Trying to replace %s (%s) starting at %d " .
 460                                   "with %s (%s) with drift %d\n",
 461                                   join( ' ', map {$_->label} @base_phrase ),
 462                                   join( ' ', map {$_->name} @base_phrase ),
 463                                   $realoffset,
 464                                   join( ' ', map {$_->label} @$items ),
 465                                   join( ' ', map {$_->name} @$items ),
 466                                   $drift,
 467                                   ) if $debug;
 468
 469             if( $lemma_text[$realoffset]->name ne $lemma_start ) {
 470                 warn( sprintf( "Should be replacing %s (%s) with %s (%s) " .
 471                                "but %s (%s) is there instead",
 472                                join( ' ', map {$_->label} @base_phrase ),
 473                                join( ' ', map {$_->name} @base_phrase ),
 474                                join( ' ', map {$_->label} @$items ),
 475                                join( ' ', map {$_->name} @$items ),
 476                                join( ' ', map {$_->label} @this_phrase ),
 477                                join( ' ', map {$_->name} @this_phrase ),
 478                       ) );
 479                 # next;
 480             }
 481         }
 482         splice( @lemma_text, $realoffset, $length, @$items );
 483         $drift += @$items - $length;
 484     }
 485     return @lemma_text;
 486 }
 487
 488
 489 # Helper function. Given a witness sigil, if it is a post-correctione
 490 # sigil,return the base witness.  If not, return a false value.
 491 sub _is_post_corr {
 492     my( $sigil ) = @_;
 493     if( $sigil =~ /^(.*?)(\s*\(?p\.\s*c\.\)?)$/ ) {
 494         return $1;
 495     }
 496     return undef;
 497 }
 498
 499
 500 =back
 501
 502 =head1 LICENSE
 503
 504 This package is free software and is provided "as is" without express
 505 or implied warranty.  You can redistribute it and/or modify it under
 506 the same terms as Perl itself.
 507
 508 =head1 AUTHOR
 509
 510 Tara L Andrews, aurum@cpan.org
 511
 512 =cut
 513
 514 1;