lib/Text/Tradition/Parser/BaseText.pm

   1 package Text::Tradition::Parser::BaseText;
   2
   3 use strict;
   4 use warnings;
   5 use Module::Load;
   6 use Text::Tradition::Parser::Util qw( collate_variants cmp_str check_for_repeated add_hash_entry );
   7
   8 =head1 NAME
   9
  10 Text::Tradition::Parser::BaseText
  11
  12 =head1 SYNOPSIS
  13
  14 use Text::Tradition::Parser::BaseText qw( merge_base );
  15 merge_base( $graph, 'reference.txt', @apparatus_entries )
  16
  17 =head1 DESCRIPTION
  18
  19 For an overview of the package, see the documentation for the
  20 Text::Tradition::Graph module.
  21
  22 This module is meant for use with certain of the other Parser classes
  23 - whenever a list of variants is given with reference to a base text,
  24 these must be joined into a single collation.  The parser should
  25 therefore make a list of variants and their locations, and BaseText
  26 will join those listed variants onto the reference text.
  27
  28 =head1 SUBROUTINES
  29
  30 =over
  31
  32 =item B<parse>
  33
  34 parse( $graph, %opts );
  35
  36 Takes an initialized graph and a set of options, which must include:
  37 - 'base' - the base text referenced by the variants
  38 - 'format' - the format of the variant list
  39 - 'data' - the variants, in the given format.
  40
  41 =cut
  42
  43 sub parse {
  44     my( $tradition, %opts ) = @_;
  45
  46     my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'};
  47     load( $format_mod );
  48     my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} );
  49     merge_base( $tradition->collation, $opts{'base'}, @apparatus_entries );
  50 }
  51
  52 =item B<merge_base>
  53
  54 merge_base( $graph, 'reference.txt', @apparatus_entries )
  55
  56 Takes three arguments: a newly-initialized Text::Tradition::Graph
  57 object, a text file containing the reference text, and a list of
  58 variants (apparatus entries).  Adds the base text to the graph, and
  59 joins the variants to that.
  60
  61 The list of variants is an array of hash references; each hash takes
  62 the form
  63  { '_id' => line reference,
  64    'rdg_0' => lemma reading,
  65    'rdg_1' => first variant,
  66    ...  # and so on until all distinct readings are listed
  67    'WitnessA' => 'rdg_0',
  68    'WitnessB' => 'rdg_1',
  69    ...  # and so on until all witnesses are listed with their readings
  70  }
  71
  72 Any hash key that is not of the form /^rdg_\d+$/ and that does not
  73 begin with an underscore is assumed to be a witness name.  Any 'meta'
  74 information to be passed must be passed in a key with a leading
  75 underscore in its name.
  76
  77 =cut
  78
  79 my $SHORTEND = ''; # Debug var - set this to limit the number of lines parsed
  80
  81 my %base_text_index;
  82 my $edits_required = {};
  83
  84 # edits_required -> wit -> [ { start_idx, end_idx, items } ]
  85
  86 sub merge_base {
  87     my( $collation, $base_file, @app_entries ) = @_;
  88     my @base_line_starts = read_base( $base_file, $collation );
  89
  90     my %all_witnesses;
  91     my @unwitnessed_lemma_nodes;
  92     foreach my $app ( @app_entries ) {
  93         my( $line, $num ) = split( /\./, $app->{_id} );
  94         # DEBUG with a short graph
  95         last if $SHORTEND && $line > $SHORTEND;
  96         # DEBUG for problematic entries
  97         my $scrutinize = '';
  98         my $first_line_reading = $base_line_starts[ $line ];
  99         my $too_far = $base_line_starts[ $line+1 ];
 100
 101         my $lemma = $app->{rdg_0};
 102         my $seq = 1;
 103         # Is this the Nth occurrence of this reading in the line?
 104         if( $lemma =~ s/(_)?(\d)$// ) {
 105             $seq = $2;
 106         }
 107         my @lemma_words = split( /\s+/, $lemma );
 108
 109         # Now search for the lemma words within this line.
 110         my $lemma_start = $first_line_reading;
 111         my $lemma_end;
 112         my %seen;
 113         while( $lemma_start ne $too_far ) {
 114             # Loop detection
 115             if( $seen{ $lemma_start->name() } ) {
 116                 warn "Detected loop at " . $lemma_start->name() .
 117                     ", ref $line,$num";
 118                 last;
 119             }
 120             $seen{ $lemma_start->name() } = 1;
 121
 122             # Try to match the lemma.
 123             my $unmatch = 0;
 124             print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
 125                 $lemma_words[0] . "...\n"
 126                 if "$line.$num" eq $scrutinize;
 127             if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
 128                 # Skip it if we need a match that is not the first.
 129                 if( --$seq < 1 ) {
 130                     # Now we have to compare the rest of the words here.
 131                     if( scalar( @lemma_words ) > 1 ) {
 132                         my $next_reading =
 133                             $collation->next_reading( $lemma_start );
 134                         foreach my $w ( @lemma_words[1..$#lemma_words] ) {
 135                             printf STDERR "Now matching %s against %s\n",
 136                                     cmp_str($next_reading), $w
 137                                 if "$line.$num" eq $scrutinize;
 138                             if( $w ne cmp_str($next_reading) ) {
 139                                 $unmatch = 1;
 140                                 last;
 141                             } else {
 142                                 $lemma_end = $next_reading;
 143                                 $next_reading =
 144                                     $collation->next_reading( $lemma_end );
 145                             }
 146                         }
 147                     } else {
 148                         $lemma_end = $lemma_start;
 149                     }
 150                 } else {
 151                     $unmatch = 1;
 152                 }
 153             }
 154             last unless ( $unmatch || !defined( $lemma_end ) );
 155             $lemma_end = undef;
 156             $lemma_start = $collation->next_reading( $lemma_start );
 157         }
 158
 159         unless( $lemma_end ) {
 160             warn "No match found for @lemma_words at $line.$num";
 161             next;
 162         }
 163
 164         # Now we have found the lemma; we will record an 'edit', in
 165         # terms of a splice operation, for each subsequent reading.
 166         # We also note which witnesses take the given edit.
 167
 168         my @lemma_set = $collation->reading_sequence( $lemma_start,
 169                                                       $lemma_end );
 170         my @reading_sets = [ @lemma_set ];
 171
 172         # For each reading that is not rdg_0, we create the variant
 173         # reading nodes, and store the range as an edit operation on
 174         # the base text.
 175         my $variant_objects;
 176         my %pc_seen; # Keep track of mss with explicit post-corr data
 177         foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
 178             my @mss = grep { $app->{$_} eq $k } keys( %$app );
 179
 180             # Keep track of lemma nodes that don't actually appear in
 181             # any MSS; we will want to remove them from the collation.
 182             push( @unwitnessed_lemma_nodes, @lemma_set )
 183                 if !@mss && $k eq 'rdg_0';
 184
 185             # Keep track of what witnesses we have seen.
 186             @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
 187             # Keep track of which witnesses bear corrected readings here.
 188             foreach my $m ( @mss ) {
 189                 my $base = _is_post_corr( $m );
 190                 next unless $base;
 191                 $pc_seen{$base} = 1;
 192             }
 193             next if $k eq 'rdg_0';
 194
 195             # Parse the variant into reading tokens.
 196             # TODO don't hardcode the reading split operation
 197             my @variant = split( /\s+/, $app->{$k} );
 198             @variant = () if $app->{$k} eq '/'; # This is an omission.
 199
 200             my @variant_readings;
 201             my $ctr = 0;
 202             foreach my $vw ( @variant ) {
 203                 my $vwname = "$k/$line.$num.$ctr"; $ctr++;
 204                 my $vwreading = $collation->add_reading( $vwname );
 205                 $vwreading->text( $vw );
 206                 push( @variant_readings, $vwreading );
 207             }
 208
 209             $variant_objects->{$k} = { 'mss' => \@mss,
 210                                        'reading' => \@variant_readings,
 211             };
 212             push( @reading_sets, \@variant_readings );
 213         }
 214
 215         # Now collate and collapse the identical readings within the
 216         # collated sets.  Modifies the reading sets that were passed.
 217         collate_variants( $collation, @reading_sets );
 218
 219         # Record any stated relationships between the nodes and the lemma.
 220         set_relationships( $collation, $app, \@lemma_set, $variant_objects );
 221
 222         # Now create the splice-edit objects that will be used
 223         # to reconstruct each witness.
 224
 225         foreach my $rkey ( keys %$variant_objects ) {
 226             # Object is argument list for splice, so:
 227             # offset, length, replacements
 228             my $edit_object = [ $lemma_start->name,
 229                                 scalar( @lemma_set ),
 230                                 $variant_objects->{$rkey}->{reading} ];
 231             foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
 232                 # Is this a p.c. entry?
 233                 my $base = _is_post_corr( $ms );
 234                 if( $base ) { # this is a post-corr witness
 235                     my $pc_key = $base . "_post";
 236                     add_hash_entry( $edits_required, $pc_key, $edit_object );
 237                 } else { # this is an ante-corr witness
 238                     my $pc_key = $ms . "_post";
 239                     add_hash_entry( $edits_required, $ms, $edit_object );
 240                     unless( $pc_seen{$ms} ) {
 241                         # If this witness carries no correction, add this
 242                         # same object to its post-corrected state.
 243                         add_hash_entry( $edits_required, $pc_key,
 244                                          $edit_object );
 245                     }
 246                 }
 247             }
 248         }
 249     } # Finished going through the apparatus entries
 250
 251     # Now make the witness objects, and create their text sequences
 252     foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) {
 253         print STDERR "Creating witness $w\n";
 254         my $witness_obj = $collation->tradition->add_witness( sigil => $w );
 255         my $debug; #  = $w eq 'Vb11';
 256         my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w}, $debug );
 257         my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"}, $debug )
 258             if exists( $edits_required->{$w."_post"} );
 259
 260         my @repeated = check_for_repeated( @ante_corr_seq );
 261         warn "Repeated elements @repeated in $w a.c."
 262             if @repeated;
 263         @repeated = check_for_repeated( @post_corr_seq );
 264         warn "Repeated elements @repeated in $w p.c."
 265             if @repeated;
 266
 267         # Now save these paths in my witness object
 268         if( @post_corr_seq ) {
 269             $witness_obj->path( \@post_corr_seq );
 270             $witness_obj->uncorrected_path( \@ante_corr_seq );
 271         } else {
 272             $witness_obj->path( \@ante_corr_seq );
 273         }
 274     }
 275
 276     # Now remove our 'base text' edges, which is to say, the only
 277     # ones we have created so far.  Also remove any unwitnessed
 278     # lemma nodes (TODO unless we are treating base as witness)
 279     foreach ( $collation->paths() ) {
 280         $collation->del_path( $_ );
 281     }
 282     foreach( @unwitnessed_lemma_nodes ) {
 283         $collation->del_reading( $_ );
 284         # TODO do we need to delete any relationship paths here?
 285     }
 286
 287     ### HACKY HACKY Do some one-off path corrections here.
 288     if( $collation->linear ) {
 289        my $c = $collation;
 290        my $end = $SHORTEND ? $SHORTEND : 155;
 291        # Vb11
 292        my $path;
 293        if( $end > 16 ) {
 294            $c->merge_readings( $c->reading('rdg_1/16.3.0'), $c->reading('rdg_1/16.2.1') );
 295            $path = $c->tradition->witness('Vb11')->path;
 296            splice( @$path, 209, 2, $c->reading( 'rdg_1/16.3.0' ), $c->reading( 'rdg_1/16.2.2' ) );
 297            $path = $c->tradition->witness('Vb11')->uncorrected_path;
 298            splice( @$path, 209, 2, $c->reading( 'rdg_1/16.3.0' ), $c->reading( 'rdg_1/16.2.2' ) );
 299        }
 300        # What else?
 301        # Vb26:
 302        $path = $c->tradition->witness('Vb26')->path;
 303        splice( @$path, 618, 0, $c->reading('rdg_1/46.1.1') ) if $end > 46;
 304        # Vb13:
 305        $path = $c->tradition->witness('Vb13')->path;
 306        splice( @$path, 782, 0, $c->reading( '58,5' ) ) if $end > 58;
 307        $path = $c->tradition->witness('Vb13')->uncorrected_path;
 308        splice( @$path, 758, 0, $c->reading( '58,5' ) ) if $end > 58;
 309        # Vb20 a.c.:
 310        $path = $c->tradition->witness('Vb20')->uncorrected_path;
 311        splice( @$path, 1251, 1, $c->reading( '94,4' ) ) if $end > 94;
 312        # Vb5:
 313        $path = $c->tradition->witness('Vb5')->path;
 314        splice( @$path, 1436, 0, $c->reading('rdg_1/106.5.1') ) if $end > 106;
 315        # extraneous:
 316        $c->del_reading( 'rdg_2/147.6.13' );
 317        $c->del_reading( 'rdg_2/147.6.14' );
 318        $c->del_reading( 'rdg_2/147.6.15' );
 319
 320     } else {
 321        my $c = $collation;
 322        my $end = $SHORTEND ? $SHORTEND : 155;
 323        # Vb5:
 324        my $path = $c->tradition->witness('Vb5')->path;
 325        splice( @$path, 1436, 0, $c->reading('106,14') ) if $end > 106;
 326        # Vb11:
 327        $path = $c->tradition->witness('Vb11')->path;
 328        if( $end > 16 ) {
 329            $c->merge_readings( $c->reading('rdg_1/16.3.0'), $c->reading('rdg_1/16.2.1') );
 330            splice( @$path, 209, 2, $c->reading( 'rdg_1/16.3.0' ), $c->reading( '16,1' ) );
 331        }
 332        # Vb13:
 333        $path = $c->tradition->witness('Vb13')->path;
 334        splice( @$path, 782, 0, $c->reading( '58,5' ) ) if $end > 58;
 335        $path = $c->tradition->witness('Vb13')->uncorrected_path;
 336        splice( @$path, 758, 0, $c->reading( '58,5' ) ) if $end > 58;
 337        # Vb20 a.c.:
 338        $path = $c->tradition->witness('Vb20')->uncorrected_path;
 339        splice( @$path, 1251, 1, $c->reading( '94,4' ) ) if $end > 94;
 340        # Vb26:
 341        $path = $c->tradition->witness('Vb26')->path;
 342        splice( @$path, 618, 0, $c->reading('46,2') ) if $end > 46;
 343     }
 344
 345     # Now walk paths and calculate positional rank.
 346     $collation->make_witness_paths();
 347     # Have to check relationship validity at this point, because before that
 348     # we had no paths.
 349 #     foreach my $rel ( $collation->relationships ) {
 350 #         next unless $rel->equal_rank;
 351 #         unless( Text::Tradition::Collation::relationship_valid( $rel->from, $rel->to ) ) {
 352 #             warn sprintf( "Relationship type %s between %s and %s is invalid, deleting",
 353 #                             $rel->type, $rel->from->name, $rel->to->name );
 354 #         }
 355 #     }
 356     $collation->calculate_ranks();
 357 }
 358
 359 =item B<read_base>
 360
 361 my @line_beginnings = read_base( 'reference.txt', $collation );
 362
 363 Takes a text file and a (presumed empty) collation object, adds the
 364 words as simple linear readings to the collation, and returns a
 365 list of readings that represent the beginning of lines. This collation
 366 is now the starting point for application of apparatus entries in
 367 merge_base, e.g. from a CSV file or a Classical Text Editor file.
 368
 369 =cut
 370
 371 sub read_base {
 372     my( $base_file, $collation ) = @_;
 373
 374     # This array gives the first reading for each line.  We put the
 375     # common starting point in line zero.
 376     my $last_reading = $collation->start();
 377     $base_text_index{$last_reading->name} = 0;
 378     my $lineref_array = [ $last_reading ]; # There is no line zero.
 379
 380     open( BASE, $base_file ) or die "Could not open file $base_file: $!";
 381     my $i = 1;
 382     while(<BASE>) {
 383         # Make the readings, and connect them up for the base, but
 384         # also save the first reading of each line in an array for the
 385         # purpose.
 386         # TODO use configurable reading separator
 387         chomp;
 388         my @words = split;
 389         my $started = 0;
 390         my $wordref = 0;
 391         my $lineref = scalar @$lineref_array;
 392         last if $SHORTEND && $lineref > $SHORTEND;
 393         foreach my $w ( @words ) {
 394             my $readingref = join( ',', $lineref, ++$wordref );
 395             my $reading = $collation->add_reading( $readingref );
 396             $reading->text( $w );
 397             unless( $started ) {
 398                 push( @$lineref_array, $reading );
 399                 $started = 1;
 400             }
 401             # Add edge paths in the graph, for easier tracking when
 402             # we start applying corrections.  These paths will be
 403             # removed when we're done.
 404             my $path = $collation->add_path( $last_reading, $reading,
 405                                              $collation->baselabel );
 406             $last_reading = $reading;
 407
 408             # Note an array index for the reading, for later correction splices.
 409             $base_text_index{$readingref} = $i++;
 410         }
 411     }
 412     close BASE;
 413     # Ending point for all texts
 414     $collation->add_path( $last_reading, $collation->end, $collation->baselabel );
 415     push( @$lineref_array, $collation->end );
 416     $base_text_index{$collation->end->name} = $i;
 417
 418     return( @$lineref_array );
 419 }
 420
 421 sub set_relationships {
 422     my( $collation, $app, $lemma, $variants ) = @_;
 423     foreach my $rkey ( keys %$variants ) {
 424         my $var = $variants->{$rkey}->{'reading'};
 425         my $type = $app->{sprintf( "_%s_type", $rkey )};
 426         my $noncorr = $app->{sprintf( "_%s_non_corr", $rkey )};
 427         my $nonindep = $app->{sprintf( "_%s_non_indep", $rkey )};
 428
 429         my %rel_options = ();
 430         $rel_options{'non_correctable'} = $noncorr if $noncorr && $noncorr =~ /^\d$/;
 431         $rel_options{'non_indep'} = $nonindep if $nonindep && $nonindep =~ /^\d$/;
 432
 433         if( $type =~ /^(inv|tr|rep)$/i ) {
 434             # Transposition or repetition: look for nodes with the
 435             # same label but different IDs and mark them.
 436             $type = 'repetition' if $type =~ /^rep/i;
 437             $rel_options{'type'} = $type;
 438             $rel_options{'equal_rank'} = undef;
 439             my %labels;
 440             foreach my $r ( @$lemma ) {
 441                 $labels{cmp_str( $r )} = $r;
 442             }
 443             foreach my $r( @$var ) {
 444                 if( exists $labels{$r->label} &&
 445                     $r->name ne $labels{$r->label}->name ) {
 446                     if( $type eq 'repetition' ) {
 447                         # Repetition
 448                         $collation->add_relationship( $r, $labels{$r->label}, \%rel_options );
 449                     } else {
 450                         # Transposition
 451                         $r->set_identical( $labels{$r->label} );
 452                     }
 453                 }
 454             }
 455         } elsif( $type =~ /^(gr|sp(el)?)$/i ) {
 456
 457             # Grammar/spelling/lexical: this can be a one-to-one or
 458             # one-to-many mapping.  We should think about merging
 459             # readings if it is one-to-many.
 460
 461             $type = 'grammatical' if $type =~ /gr/i;
 462             $type = 'spelling' if $type =~ /sp/i;
 463             $type = 'repetition' if $type =~ /rep/i;
 464             # $type = 'lexical' if $type =~ /lex/i;
 465             $rel_options{'type'} = $type;
 466             $rel_options{'equal_rank'} = 1;
 467             if( @$lemma == @$var ) {
 468                 foreach my $i ( 0 .. $#{$lemma} ) {
 469                     $collation->add_relationship( $var->[$i], $lemma->[$i],
 470                         \%rel_options );
 471                 }
 472             } else {
 473                 # An uneven many-to-many mapping.  Skip for now.
 474                 # We really want to make a segment out of whatever we have.
 475                 # my $lemseg = @$lemma > 1 ? $collation->add_segment( @$lemma ) : $lemma->[0];
 476                 # my $varseg = @$var > 1 ? $collation->add_segment( @$var ) : $var->[0];
 477                 # $collation->add_relationship( $varseg, $lemseg, \%rel_options );
 478                 if( @$lemma == 1 && @$var == 1 ) {
 479                     $collation->add_relationship( $lemma->[0], $var->[0], \%rel_options );
 480                 }
 481             }
 482         } elsif( $type !~ /^(add|om|lex)$/i ) {
 483             warn "Unrecognized type $type";
 484         }
 485     }
 486 }
 487
 488
 489
 490 sub apply_edits {
 491     my( $collation, $edit_sequence, $debug ) = @_;
 492     my @lemma_text = $collation->reading_sequence( $collation->start,
 493                                            $collation->reading( '#END#' ) );
 494     my $drift = 0;
 495     foreach my $correction ( @$edit_sequence ) {
 496         my( $lemma_start, $length, $items ) = @$correction;
 497         my $offset = $base_text_index{$lemma_start};
 498         my $realoffset = $offset + $drift;
 499         if( $debug ||
 500             $lemma_text[$realoffset]->name ne $lemma_start ) {
 501             my @this_phrase = @lemma_text[$realoffset..$realoffset+$length-1];
 502             my @base_phrase;
 503             my $i = $realoffset;
 504             my $l = $collation->reading( $lemma_start );
 505             while( $i < $realoffset+$length ) {
 506                 push( @base_phrase, $l );
 507                 $l = $collation->next_reading( $l );
 508                 $i++;
 509             }
 510
 511             print STDERR sprintf( "Trying to replace %s (%s) starting at %d " .
 512                                   "with %s (%s) with drift %d\n",
 513                                   join( ' ', map {$_->label} @base_phrase ),
 514                                   join( ' ', map {$_->name} @base_phrase ),
 515                                   $realoffset,
 516                                   join( ' ', map {$_->label} @$items ),
 517                                   join( ' ', map {$_->name} @$items ),
 518                                   $drift,
 519                                   ) if $debug;
 520
 521             if( $lemma_text[$realoffset]->name ne $lemma_start ) {
 522                 warn( sprintf( "Should be replacing %s (%s) with %s (%s) " .
 523                                "but %s (%s) is there instead",
 524                                join( ' ', map {$_->label} @base_phrase ),
 525                                join( ' ', map {$_->name} @base_phrase ),
 526                                join( ' ', map {$_->label} @$items ),
 527                                join( ' ', map {$_->name} @$items ),
 528                                join( ' ', map {$_->label} @this_phrase ),
 529                                join( ' ', map {$_->name} @this_phrase ),
 530                       ) );
 531                 # next;
 532             }
 533         }
 534         splice( @lemma_text, $realoffset, $length, @$items );
 535         $drift += @$items - $length;
 536     }
 537     return @lemma_text;
 538 }
 539
 540
 541 # Helper function. Given a witness sigil, if it is a post-correctione
 542 # sigil,return the base witness.  If not, return a false value.
 543 sub _is_post_corr {
 544     my( $sigil ) = @_;
 545     if( $sigil =~ /^(.*?)(\s*\(?p\.\s*c\.\)?)$/ ) {
 546         return $1;
 547     }
 548     return undef;
 549 }
 550
 551
 552 =back
 553
 554 =head1 LICENSE
 555
 556 This package is free software and is provided "as is" without express
 557 or implied warranty.  You can redistribute it and/or modify it under
 558 the same terms as Perl itself.
 559
 560 =head1 AUTHOR
 561
 562 Tara L Andrews, aurum@cpan.org
 563
 564 =cut
 565
 566 1;