lib/Text/Tradition/Parser/BaseText.pm

   1 package Text::Tradition::Parser::BaseText;
   2
   3 use strict;
   4 use warnings;
   5 use Module::Load;
   6 use Algorithm::Diff;
   7
   8 =head1 NAME
   9
  10 Text::Tradition::Parser::BaseText
  11
  12 =head1 SYNOPSIS
  13
  14 use Text::Tradition::Parser::BaseText qw( merge_base );
  15 merge_base( $graph, 'reference.txt', @apparatus_entries )
  16
  17 =head1 DESCRIPTION
  18
  19 For an overview of the package, see the documentation for the
  20 Text::Tradition::Graph module.
  21
  22 This module is meant for use with certain of the other Parser classes
  23 - whenever a list of variants is given with reference to a base text,
  24 these must be joined into a single collation.  The parser should
  25 therefore make a list of variants and their locations, and BaseText
  26 will join those listed variants onto the reference text.
  27
  28 =head1 SUBROUTINES
  29
  30 =over
  31
  32 =item B<parse>
  33
  34 parse( $graph, %opts );
  35
  36 Takes an initialized graph and a set of options, which must include:
  37 - 'base' - the base text referenced by the variants
  38 - 'format' - the format of the variant list
  39 - 'data' - the variants, in the given format.
  40
  41 =cut
  42
  43 my $DETRANSPOSE = 0;
  44 sub parse {
  45     my( $tradition, %opts ) = @_;
  46
  47     my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'};
  48     load( $format_mod );
  49     my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} );
  50     $DETRANSPOSE = 1 if $opts{'linear'};
  51     merge_base( $tradition->collation, $opts{'base'}, @apparatus_entries );
  52 }
  53
  54 =item B<merge_base>
  55
  56 merge_base( $graph, 'reference.txt', @apparatus_entries )
  57
  58 Takes three arguments: a newly-initialized Text::Tradition::Graph
  59 object, a text file containing the reference text, and a list of
  60 variants (apparatus entries).  Adds the base text to the graph, and
  61 joins the variants to that.
  62
  63 The list of variants is an array of hash references; each hash takes
  64 the form
  65  { '_id' => line reference,
  66    'rdg_0' => lemma reading,
  67    'rdg_1' => first variant,
  68    ...  # and so on until all distinct readings are listed
  69    'WitnessA' => 'rdg_0',
  70    'WitnessB' => 'rdg_1',
  71    ...  # and so on until all witnesses are listed with their readings
  72  }
  73
  74 Any hash key that is not of the form /^rdg_\d+$/ and that does not
  75 begin with an underscore is assumed to be a witness name.  Any 'meta'
  76 information to be passed must be passed in a key with a leading
  77 underscore in its name.
  78
  79 =cut
  80
  81 my $SHORT = 25;  # Debug var - set this to limit the number of lines parsed
  82
  83 my %base_text_index;
  84 my $edits_required = {};
  85
  86 # edits_required -> wit -> [ { start_idx, end_idx, items } ]
  87
  88 sub merge_base {
  89     my( $collation, $base_file, @app_entries ) = @_;
  90     my @base_line_starts = read_base( $base_file, $collation );
  91
  92     my %all_witnesses;
  93     my @unwitnessed_lemma_nodes;
  94     foreach my $app ( @app_entries ) {
  95         my( $line, $num ) = split( /\./, $app->{_id} );
  96         # DEBUG with a short graph
  97         last if $SHORT && $line > $SHORT;
  98         # DEBUG for problematic entries
  99         my $scrutinize = '';
 100         my $first_line_reading = $base_line_starts[ $line ];
 101         my $too_far = $base_line_starts[ $line+1 ];
 102
 103         my $lemma = $app->{rdg_0};
 104         my $seq = 1;
 105         # Is this the Nth occurrence of this reading in the line?
 106         if( $lemma =~ s/(_)?(\d)$// ) {
 107             $seq = $2;
 108         }
 109         my @lemma_words = split( /\s+/, $lemma );
 110
 111         # Now search for the lemma words within this line.
 112         my $lemma_start = $first_line_reading;
 113         my $lemma_end;
 114         my %seen;
 115         while( $lemma_start ne $too_far ) {
 116             # Loop detection
 117             if( $seen{ $lemma_start->name() } ) {
 118                 warn "Detected loop at " . $lemma_start->name() .
 119                     ", ref $line,$num";
 120                 last;
 121             }
 122             $seen{ $lemma_start->name() } = 1;
 123
 124             # Try to match the lemma.
 125             my $unmatch = 0;
 126             print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
 127                 $lemma_words[0] . "...\n"
 128                 if "$line.$num" eq $scrutinize;
 129             if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
 130                 # Skip it if we need a match that is not the first.
 131                 if( --$seq < 1 ) {
 132                     # Now we have to compare the rest of the words here.
 133                     if( scalar( @lemma_words ) > 1 ) {
 134                         my $next_reading =
 135                             $collation->next_reading( $lemma_start );
 136                         foreach my $w ( @lemma_words[1..$#lemma_words] ) {
 137                             printf STDERR "Now matching %s against %s\n",
 138                                     cmp_str($next_reading), $w
 139                                 if "$line.$num" eq $scrutinize;
 140                             if( $w ne cmp_str($next_reading) ) {
 141                                 $unmatch = 1;
 142                                 last;
 143                             } else {
 144                                 $lemma_end = $next_reading;
 145                                 $next_reading =
 146                                     $collation->next_reading( $lemma_end );
 147                             }
 148                         }
 149                     } else {
 150                         $lemma_end = $lemma_start;
 151                     }
 152                 } else {
 153                     $unmatch = 1;
 154                 }
 155             }
 156             last unless ( $unmatch || !defined( $lemma_end ) );
 157             $lemma_end = undef;
 158             $lemma_start = $collation->next_reading( $lemma_start );
 159         }
 160
 161         unless( $lemma_end ) {
 162             warn "No match found for @lemma_words at $line.$num";
 163             next;
 164         }
 165
 166         # Now we have found the lemma; we will record an 'edit', in
 167         # terms of a splice operation, for each subsequent reading.
 168         # We also note which witnesses take the given edit.
 169
 170         my @lemma_set = $collation->reading_sequence( $lemma_start,
 171                                                       $lemma_end );
 172         my @reading_sets = [ @lemma_set ];
 173
 174         # For each reading that is not rdg_0, we create the variant
 175         # reading nodes, and store the range as an edit operation on
 176         # the base text.
 177         my $variant_objects;
 178         my %pc_seen; # Keep track of mss with explicit post-corr data
 179         foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
 180             my @mss = grep { $app->{$_} eq $k } keys( %$app );
 181             push( @unwitnessed_lemma_nodes, @lemma_set )
 182                 if !@mss && $k eq 'rdg_0';
 183
 184             # Keep track of what witnesses we have seen.
 185             @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
 186             # Keep track of which witnesses bear corrected readings here.
 187             foreach my $m ( @mss ) {
 188                 my $base = _is_post_corr( $m );
 189                 next unless $base;
 190                 $pc_seen{$base} = 1;
 191             }
 192             next if $k eq 'rdg_0';
 193
 194             # TODO don't hardcode the reading split operation
 195             my @variant = split( /\s+/, $app->{$k} );
 196             @variant = () if $app->{$k} eq '/'; # This is an omission.
 197
 198             # Make the variant into a set of readings.
 199             my @variant_readings;
 200             my $ctr = 0;
 201             foreach my $vw ( @variant ) {
 202                 my $vwname = "$k/$line.$num.$ctr"; $ctr++;
 203                 my $vwreading = $collation->add_reading( $vwname );
 204                 $vwreading->text( $vw );
 205                 push( @variant_readings, $vwreading );
 206             }
 207
 208             $variant_objects->{$k} = { 'mss' => \@mss,
 209                                        'reading' => \@variant_readings,
 210             };
 211             push( @reading_sets, \@variant_readings );
 212         }
 213
 214         # Now collate and collapse the identical readings within the
 215         # collated sets.  Modifies the reading sets that were passed.
 216         $DB::single = 1 if "$line.$num" eq '16.2';
 217         collate_variants( $collation, @reading_sets );
 218
 219         # Now create the splice-edit objects that will be used
 220         # to reconstruct each witness.
 221
 222         foreach my $rkey ( keys %$variant_objects ) {
 223             # Object is argument list for splice, so:
 224             # offset, length, replacements
 225             my $edit_object = [ $base_text_index{$lemma_start->name},
 226                                 scalar( @lemma_set ),
 227                                 $variant_objects->{$rkey}->{reading} ];
 228             foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
 229                 # Is this a p.c. entry?
 230                 my $base = _is_post_corr( $ms );
 231                 if( $base ) { # this is a post-corr witness
 232                     my $pc_key = $base . "_post";
 233                     _add_hash_entry( $edits_required, $pc_key, $edit_object );
 234                 } else { # this is an ante-corr witness
 235                     my $pc_key = $ms . "_post";
 236                     _add_hash_entry( $edits_required, $ms, $edit_object );
 237                     unless( $pc_seen{$ms} ) {
 238                         # If this witness carries no correction, add this
 239                         # same object to its post-corrected state.
 240                         _add_hash_entry( $edits_required, $pc_key,
 241                                          $edit_object );
 242                     }
 243                 }
 244             }
 245         }
 246     } # Finished going through the apparatus entries
 247
 248     # Now make the witness objects, and create their text sequences
 249     foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) {
 250         my $witness_obj = $collation->tradition->add_witness( sigil => $w );
 251         my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w} );
 252         my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"} )
 253             if exists( $edits_required->{$w."_post"} );
 254
 255         # Now save these paths in my witness object
 256         if( @post_corr_seq ) {
 257             $witness_obj->path( \@post_corr_seq );
 258             my @ante_corr = make_witness_uncorrections( \@post_corr_seq,
 259                                                         \@ante_corr_seq );
 260             $witness_obj->ante_corr( \@ante_corr );
 261         } else {
 262             $witness_obj->path( \@ante_corr_seq );
 263         }
 264     }
 265
 266     # Now remove our 'base text' edges, which is to say, the only
 267     # ones we have created so far.  Also remove any nodes that didn't
 268     # appear in any witnesses.
 269     foreach ( $collation->paths() ) {
 270         $collation->del_path( $_ );
 271     }
 272     foreach( @unwitnessed_lemma_nodes ) {
 273         $collation->del_reading( $_ );
 274     }
 275
 276     # Now walk paths and calculate positions.
 277     my @common_readings =
 278         $collation->make_witness_paths();
 279     $collation->calculate_positions( @common_readings );
 280 }
 281
 282 =item B<read_base>
 283
 284 my @line_beginnings = read_base( 'reference.txt', $collation );
 285
 286 Takes a text file and a (presumed empty) collation object, adds the
 287 words as simple linear readings to the collation, and returns a
 288 list of readings that represent the beginning of lines. This collation
 289 is now the starting point for application of apparatus entries in
 290 merge_base, e.g. from a CSV file or a Classical Text Editor file.
 291
 292 =cut
 293
 294 sub read_base {
 295     my( $base_file, $collation ) = @_;
 296
 297     # This array gives the first reading for each line.  We put the
 298     # common starting point in line zero.
 299     my $last_reading = $collation->start();
 300     $base_text_index{$last_reading->name} = 0;
 301     my $lineref_array = [ $last_reading ]; # There is no line zero.
 302
 303     open( BASE, $base_file ) or die "Could not open file $base_file: $!";
 304     my $i = 1;
 305     while(<BASE>) {
 306         # Make the readings, and connect them up for the base, but
 307         # also save the first reading of each line in an array for the
 308         # purpose.
 309         # TODO use configurable reading separator
 310         chomp;
 311         my @words = split;
 312         my $started = 0;
 313         my $wordref = 0;
 314         my $lineref = scalar @$lineref_array;
 315         last if $SHORT && $lineref > $SHORT;
 316         foreach my $w ( @words ) {
 317             my $readingref = join( ',', $lineref, ++$wordref );
 318             my $reading = $collation->add_reading( $readingref );
 319             $reading->text( $w );
 320             unless( $started ) {
 321                 push( @$lineref_array, $reading );
 322                 $started = 1;
 323             }
 324             # Add edge paths in the graph, for easier tracking when
 325             # we start applying corrections.  These paths will be
 326             # removed when we're done.
 327             my $path = $collation->add_path( $last_reading, $reading,
 328                                              $collation->baselabel );
 329             $last_reading = $reading;
 330
 331             # Note an array index for the reading, for later correction splices.
 332             $base_text_index{$readingref} = $i++;
 333         }
 334     }
 335     close BASE;
 336     # Ending point for all texts
 337     my $endpoint = $collation->add_reading( '#END#' );
 338     $collation->add_path( $last_reading, $endpoint, $collation->baselabel );
 339     push( @$lineref_array, $endpoint );
 340     $base_text_index{$endpoint->name} = $i;
 341
 342     return( @$lineref_array );
 343 }
 344
 345 =item B<collate_variants>
 346
 347 collate_variants( $collation, @reading_ranges )
 348
 349 Given a set of readings in the form
 350 ( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
 351 walks through each to identify those readings that are identical.  The
 352 collation is a Text::Tradition::Collation object; the elements of
 353 @readings are Text::Tradition::Collation::Reading objects that appear
 354 on the collation graph.
 355
 356 TODO: Handle collapsed and non-collapsed transpositions.
 357
 358 =cut
 359
 360 sub collate_variants {
 361     my( $collation, @reading_sets ) = @_;
 362
 363     # Merge the nodes across the sets so that there is only one node
 364     # for any given reading.  Use diff to identify the 'same' nodes.
 365
 366     my $lemma_set = shift @reading_sets;
 367
 368     my @unique;
 369     push( @unique, @$lemma_set );
 370
 371     while( @reading_sets ) {
 372         my $variant_set = shift @reading_sets;
 373         if( $DETRANSPOSE ) {
 374             # Use diff to do this job
 375             my $diff = Algorithm::Diff->new( \@unique, $variant_set,
 376                                              {'keyGen' => \&_collation_hash} );
 377             my @new_unique;
 378             while( $diff->Next ) {
 379                 if( $diff->Same ) {
 380                     # merge the nodes
 381                     my @l = $diff->Items( 1 );
 382                     my @v = $diff->Items( 2 );
 383                     foreach my $i ( 0 .. $#l ) {
 384                         $collation->merge_readings( $l[$i], $v[$i] );
 385                     }
 386                     # splice the lemma nodes into the variant set
 387                     my( $offset ) = $diff->Get( 'min2' );
 388                     splice( @$variant_set, $offset, scalar( @l ), @l );
 389                     push( @new_unique, @l );
 390                 } else {
 391                     # Keep the old unique readings
 392                     push( @new_unique, $diff->Items( 1 ) ) if $diff->Items( 1 );
 393                     # Add the new readings to the 'unique' list
 394                     push( @new_unique, $diff->Items( 2 ) ) if $diff->Items( 2 );
 395                 }
 396             }
 397             @unique = @new_unique;
 398         } else {
 399             # It becomes a much simpler job
 400             $DB::single = 1;
 401             my @distinct;
 402             foreach my $idx ( 0 .. $#{$variant_set} ) {
 403                 my $vw = $variant_set->[$idx];
 404                 my @same = grep { cmp_str( $_ ) eq $vw->label } @unique;
 405                 if( @same ) {
 406                     $collation->merge_readings( $same[0], $vw );
 407                     $variant_set->[$idx] = $same[0];
 408                 } else {
 409                     push( @distinct, $vw );
 410                 }
 411             }
 412             push( @unique, @distinct );
 413         }
 414     }
 415
 416     return;
 417 }
 418
 419
 420 sub _collation_hash {
 421     my $node = shift;
 422     return cmp_str( $node );
 423 }
 424
 425 sub apply_edits {
 426     my( $collation, $edit_sequence ) = @_;
 427     my @lemma_names = sort { $base_text_index{$a} <=> $base_text_index{$b} }
 428         keys %base_text_index;
 429     my @lemma_text = map { $collation->reading( $_ ) } @lemma_names;
 430
 431     my $drift = 0;
 432     foreach my $correction ( @$edit_sequence ) {
 433         my( $offset, $length, $items ) = @$correction;
 434         my $realoffset = $offset + $drift;
 435         splice( @lemma_text, $realoffset, $length, @$items );
 436         $drift += @$items - $length;
 437     }
 438     return @lemma_text;
 439 }
 440
 441 sub make_witness_uncorrections {
 442     my( $path, $uncorr_path ) = @_;
 443     my $diff = Algorithm::Diff->new( $path, $uncorr_path,
 444                                      { 'keyGen' => \&_collation_hash } );
 445     # We basically just want to make a bunch of splice arguments that
 446     # will reconstruct the ante-corr text from the post-corr.
 447     my @diff_list;
 448     while( $diff->Next ) {
 449         next if $diff->Same;
 450         my( $offset ) = $diff->Get( 'min1' );
 451         my $length = scalar( $diff->Items( 1 ) );
 452         my $items = []; push( @$items, $diff->Items( 2 ) );
 453         push( @diff_list, [ $offset, $length, $items ] );
 454     }
 455     return @diff_list;
 456 }
 457
 458
 459 # Helper function. Given a witness sigil, if it is a post-correctione
 460 # sigil,return the base witness.  If not, return a false value.
 461 sub _is_post_corr {
 462     my( $sigil ) = @_;
 463     if( $sigil =~ /^(.*?)(\s*\(?p\.\s*c\.\)?)$/ ) {
 464         return $1;
 465     }
 466     return undef;
 467 }
 468
 469 sub _add_hash_entry {
 470     my( $hash, $key, $entry ) = @_;
 471     if( exists $hash->{$key} ) {
 472         push( @{$hash->{$key}}, $entry );
 473     } else {
 474         $hash->{$key} = [ $entry ];
 475     }
 476 }
 477
 478
 479 =item B<cmp_str>
 480
 481 Pretend you never saw this method.  Really it needs to not be hardcoded.
 482
 483 =cut
 484
 485 sub cmp_str {
 486     my( $reading ) = @_;
 487     my $word = $reading->label();
 488     $word = lc( $word );
 489     $word =~ s/\W//g;
 490     $word =~ s/v/u/g;
 491     $word =~ s/j/i/g;
 492     $word =~ s/cha/ca/g;
 493     $word =~ s/quatuor/quattuor/g;
 494     $word =~ s/ioannes/iohannes/g;
 495     return $word;
 496 }
 497
 498 =back
 499
 500 =head1 LICENSE
 501
 502 This package is free software and is provided "as is" without express
 503 or implied warranty.  You can redistribute it and/or modify it under
 504 the same terms as Perl itself.
 505
 506 =head1 AUTHOR
 507
 508 Tara L Andrews, aurum@cpan.org
 509
 510 =cut
 511
 512 1;