lib/Text/Tradition/Parser/BaseText.pm

   1 package Text::Tradition::Parser::BaseText;
   2
   3 use strict;
   4 use warnings;
   5 use Module::Load;
   6 use Algorithm::Diff;
   7
   8 =head1 NAME
   9
  10 Text::Tradition::Parser::BaseText
  11
  12 =head1 SYNOPSIS
  13
  14 use Text::Tradition::Parser::BaseText qw( merge_base );
  15 merge_base( $graph, 'reference.txt', @apparatus_entries )
  16
  17 =head1 DESCRIPTION
  18
  19 For an overview of the package, see the documentation for the
  20 Text::Tradition::Graph module.
  21
  22 This module is meant for use with certain of the other Parser classes
  23 - whenever a list of variants is given with reference to a base text,
  24 these must be joined into a single collation.  The parser should
  25 therefore make a list of variants and their locations, and BaseText
  26 will join those listed variants onto the reference text.
  27
  28 =head1 SUBROUTINES
  29
  30 =over
  31
  32 =item B<parse>
  33
  34 parse( $graph, %opts );
  35
  36 Takes an initialized graph and a set of options, which must include:
  37 - 'base' - the base text referenced by the variants
  38 - 'format' - the format of the variant list
  39 - 'data' - the variants, in the given format.
  40
  41 =cut
  42
  43 sub parse {
  44     my( $tradition, %opts ) = @_;
  45
  46     my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'};
  47     load( $format_mod );
  48     my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} );
  49     merge_base( $tradition->collation, $opts{'base'}, @apparatus_entries );
  50 }
  51
  52 =item B<merge_base>
  53
  54 merge_base( $graph, 'reference.txt', @apparatus_entries )
  55
  56 Takes three arguments: a newly-initialized Text::Tradition::Graph
  57 object, a text file containing the reference text, and a list of
  58 variants (apparatus entries).  Adds the base text to the graph, and
  59 joins the variants to that.
  60
  61 The list of variants is an array of hash references; each hash takes
  62 the form
  63  { '_id' => line reference,
  64    'rdg_0' => lemma reading,
  65    'rdg_1' => first variant,
  66    ...  # and so on until all distinct readings are listed
  67    'WitnessA' => 'rdg_0',
  68    'WitnessB' => 'rdg_1',
  69    ...  # and so on until all witnesses are listed with their readings
  70  }
  71
  72 Any hash key that is not of the form /^rdg_\d+$/ and that does not
  73 begin with an underscore is assumed to be a witness name.  Any 'meta'
  74 information to be passed must be passed in a key with a leading
  75 underscore in its name.
  76
  77 =cut
  78
  79 my $SHORT = undef;  # Debug var - set this to limit the number of lines parsed
  80
  81 my %base_text_index;
  82 my $edits_required;
  83
  84 # edits_required -> wit -> [ { start_idx, end_idx, items } ]
  85
  86 sub merge_base {
  87     my( $collation, $base_file, @app_entries ) = @_;
  88     my @base_line_starts = read_base( $base_file, $collation );
  89
  90     my %all_witnesses;
  91     foreach my $app ( @app_entries ) {
  92         my( $line, $num ) = split( /\./, $app->{_id} );
  93         # DEBUG with a short graph
  94         last if $SHORT && $line > $SHORT;
  95         # DEBUG for problematic entries
  96         my $scrutinize = '';
  97         my $first_line_reading = $base_line_starts[ $line ];
  98         my $too_far = $base_line_starts[ $line+1 ];
  99
 100         my $lemma = $app->{rdg_0};
 101         my $seq = 1;
 102         # Is this the Nth occurrence of this reading in the line?
 103         if( $lemma =~ s/(_)?(\d)$// ) {
 104             $seq = $2;
 105         }
 106         my @lemma_words = split( /\s+/, $lemma );
 107
 108         # Now search for the lemma words within this line.
 109         my $lemma_start = $first_line_reading;
 110         my $lemma_end;
 111         my %seen;
 112         while( $lemma_start ne $too_far ) {
 113             # Loop detection
 114             if( $seen{ $lemma_start->name() } ) {
 115                 warn "Detected loop at " . $lemma_start->name() .
 116                     ", ref $line,$num";
 117                 last;
 118             }
 119             $seen{ $lemma_start->name() } = 1;
 120
 121             # Try to match the lemma.
 122             my $unmatch = 0;
 123             print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
 124                 $lemma_words[0] . "...\n"
 125                 if "$line.$num" eq $scrutinize;
 126             if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
 127                 # Skip it if we need a match that is not the first.
 128                 if( --$seq < 1 ) {
 129                     # Now we have to compare the rest of the words here.
 130                     if( scalar( @lemma_words ) > 1 ) {
 131                         my $next_reading =
 132                             $collation->next_reading( $lemma_start );
 133                         foreach my $w ( @lemma_words[1..$#lemma_words] ) {
 134                             printf STDERR "Now matching %s against %s\n",
 135                                     cmp_str($next_reading), $w
 136                                 if "$line.$num" eq $scrutinize;
 137                             if( $w ne cmp_str($next_reading) ) {
 138                                 $unmatch = 1;
 139                                 last;
 140                             } else {
 141                                 $lemma_end = $next_reading;
 142                                 $next_reading =
 143                                     $collation->next_reading( $lemma_end );
 144                             }
 145                         }
 146                     } else {
 147                         $lemma_end = $lemma_start;
 148                     }
 149                 } else {
 150                     $unmatch = 1;
 151                 }
 152             }
 153             last unless ( $unmatch || !defined( $lemma_end ) );
 154             $lemma_end = undef;
 155             $lemma_start = $collation->next_reading( $lemma_start );
 156         }
 157
 158         unless( $lemma_end ) {
 159             warn "No match found for @lemma_words at $line.$num";
 160             next;
 161         }
 162
 163         # Now we have found the lemma; we will record an 'edit', in
 164         # terms of a splice operation, for each subsequent reading.
 165         # We also note which witnesses take the given edit.
 166
 167         my @lemma_set = $collation->reading_sequence( $lemma_start, $lemma_end );
 168         my @reading_sets = [ @lemma_set ];
 169
 170         # For each reading that is not rdg_0, we create the variant
 171         # reading nodes, and store the range as an edit operation on
 172         # the base text.
 173         my $variant_objects;
 174         my %pc_lemma; # Keep track of mss that have been corrected back to lemma
 175         my %pc_variant; # Keep track of mss with other corrections
 176         foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
 177             my @mss = grep { $app->{$_} eq $k } keys( %$app );
 178             # Keep track of what witnesses we have seen.
 179             @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
 180             my $pc_hash = $k eq 'rdg_0' ? \%pc_lemma : \%pc_variant;
 181
 182             # Keep track of which witnesses bear corrected readings here.
 183             foreach my $m ( @mss ) {
 184                 my $base = _is_post_corr( $m );
 185                 next unless $base;
 186                 $pc_hash->{$base} = 1;
 187             }
 188             next if $k eq 'rdg_0';
 189
 190             # TODO don't hardcode the reading split operation
 191             my @variant = split( /\s+/, $app->{$k} );
 192             @variant = () if $app->{$k} eq '/'; # This is an omission.
 193
 194             # Make the variant into a set of readings.
 195             my @variant_readings;
 196             my $ctr = 0;
 197             foreach my $vw ( @variant ) {
 198                 my $vwname = "$k/$line.$num.$ctr"; $ctr++;
 199                 my $vwreading = $collation->add_reading( $vwname );
 200                 $vwreading->text( $vw );
 201                 push( @variant_readings, $vwreading );
 202             }
 203
 204             $variant_objects->{$k} = { 'mss' => \@mss,
 205                                        'reading' => \@variant_readings,
 206             };
 207             push( @reading_sets, \@variant_readings );
 208         }
 209
 210         # Now collate and collapse the identical readings within the
 211         # collated sets.  Modifies the reading sets that were passed.
 212         collate_variants( $collation, @reading_sets );
 213
 214         # Now create the splice-edit objects that will be used
 215         # to reconstruct each witness.
 216
 217         foreach my $rkey ( keys %$variant_objects ) {
 218             # Object is argument list for splice, so:
 219             # offset, length, replacements
 220             my $edit_object = [ $base_text_index{$lemma_start->name},
 221                                 scalar( @lemma_set ),
 222                                 $variant_objects->{$rkey}->{reading} ];
 223             foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
 224                 # Is this a p.c. entry?
 225                 my $base = _is_post_corr( $ms );
 226                 if( $base ) { # this is a post-corr witness
 227                     my $pc_key = $base . "_post";
 228                     _add_hash_entry( $edits_required, $pc_key, $edit_object );
 229                 } else { # this is an ante-corr witness
 230                     my $pc_key = $ms . "_post";
 231                     _add_hash_entry( $edits_required, $_, $edit_object );
 232                     unless( !$pc_lemma{$ms} && !$pc_variant{$ms} ) {
 233                         # If this witness carries no correction, add this same object
 234                         # to its post-corrected state.
 235                         # TODO combine these hashes?
 236                         _add_hash_entry( $edits_required, $pc_key, $edit_object );
 237                     }
 238                 }
 239             }
 240         }
 241     } # Finished going through the apparatus entries
 242
 243     # Now make the witness objects, and create their text sequences
 244     foreach my $w ( grep { $_ !~ /_base$/ } keys %$edits_required ) {
 245         my $witness_obj = $collation->tradition->add_witness( sigil => $w );
 246         my @ante_corr_seq = apply_edits( $edits_required->{$w} );
 247         my @post_corr_seq = apply_edits( $edits_required->{$w."_post"} )
 248             if exists( $edits_required->{$w."_post"} );
 249
 250         # Now how to save these paths in my witness object?
 251         if( @post_corr_seq ) {
 252             $witness_obj->add_path( @post_corr_seq );
 253             $witness_obj->add_uncorrected_path( @ante_corr_seq );
 254         } else {
 255             $witness_obj->add_path( @ante_corr_seq );
 256         }
 257     }
 258
 259     # TODO Now remove all the 'base text' links.
 260
 261     # Now walk paths and calculate positions.
 262     my @common_readings =
 263         $collation->walk_and_expand_base( $collation->reading( '#END#' ) );
 264     $collation->calculate_positions( @common_readings );
 265 }
 266
 267 =item B<read_base>
 268
 269 my @line_beginnings = read_base( 'reference.txt', $collation );
 270
 271 Takes a text file and a (presumed empty) collation object, adds the
 272 words as simple linear readings to the collation, and returns a
 273 list of readings that represent the beginning of lines. This collation
 274 is now the starting point for application of apparatus entries in
 275 merge_base, e.g. from a CSV file or a Classical Text Editor file.
 276
 277 =cut
 278
 279 sub read_base {
 280     my( $base_file, $collation ) = @_;
 281
 282     # This array gives the first reading for each line.  We put the
 283     # common starting point in line zero.
 284     my $last_reading = $collation->start();
 285     my $lineref_array = [ $last_reading ]; # There is no line zero.
 286
 287     open( BASE, $base_file ) or die "Could not open file $base_file: $!";
 288     my $i = 0;
 289     while(<BASE>) {
 290         # Make the readings, and connect them up for the base, but
 291         # also save the first reading of each line in an array for the
 292         # purpose.
 293         # TODO use configurable reading separator
 294         chomp;
 295         my @words = split;
 296         my $started = 0;
 297         my $wordref = 0;
 298         my $lineref = scalar @$lineref_array;
 299         last if $SHORT && $lineref > $SHORT;
 300         foreach my $w ( @words ) {
 301             my $readingref = join( ',', $lineref, ++$wordref );
 302             my $reading = $collation->add_reading( $readingref );
 303             $reading->text( $w );
 304             unless( $started ) {
 305                 push( @$lineref_array, $reading );
 306                 $started = 1;
 307             }
 308             # Add edge paths in the graph, for easier tracking when
 309             # we start applying corrections.  These paths will be
 310             # removed when we're done.
 311             my $path = $collation->add_path( $last_reading, $reading,
 312                                              $collation->baselabel );
 313             $last_reading = $reading;
 314
 315             # Note an array index for the reading, for later correction splices.
 316             $base_text_index{$readingref} = $i++;
 317         }
 318     }
 319     close BASE;
 320     # Ending point for all texts
 321     my $endpoint = $collation->add_reading( '#END#' );
 322     $collation->add_path( $last_reading, $endpoint, $collation->baselabel );
 323     push( @$lineref_array, $endpoint );
 324
 325     return( @$lineref_array );
 326 }
 327
 328 =item B<collate_variants>
 329
 330 collate_variants( $collation, @reading_ranges )
 331
 332 Given a set of readings in the form
 333 ( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
 334 walks through each to identify those readings that are identical.  The
 335 collation is a Text::Tradition::Collation object; the elements of
 336 @readings are Text::Tradition::Collation::Reading objects that appear
 337 on the collation graph.
 338
 339 TODO: Handle collapsed and non-collapsed transpositions.
 340
 341 =cut
 342
 343 sub collate_variants {
 344     my( $collation, @reading_sets ) = @_;
 345     # my $detranspose = 1;  # TODO handle merging transposed nodes
 346
 347     # Merge the nodes across the sets so that there is only one node
 348     # for any given reading.  Use diff to identify the 'same' nodes.
 349
 350     my $lemma_set = shift @reading_sets;
 351
 352     my @unique;
 353     push( @unique, @$lemma_set );
 354
 355     while( @reading_sets ) {
 356         my $variant_set = shift @reading_sets;
 357         my $diff = Algorithm::Diff->new( \@unique, $variant_set, \&_collation_hash );
 358         my @new_unique;
 359         push( @new_unique, @unique );
 360         while( $diff->Next ) {
 361             if( $diff->Same ) {
 362                 # merge the nodes
 363                 my @l = $diff->Items( 1 );
 364                 my @v = $diff->Items( 2 );
 365                 foreach my $i ( 0 .. $#l ) {
 366                     $collation->merge_readings( $l[$i], $v[$i] );
 367                 }
 368                 # splice the lemma nodes into the variant set
 369                 splice( @$variant_set, $diff->Get( 'min2' ), scalar( @l ), @l );
 370                 push( @new_unique, @l );
 371             } else {
 372                 # Keep the old unique readings
 373                 push( @new_unique, $diff->Items( 1 ) ) if $diff->Items( 1 );
 374                 # Add the new readings to the 'unique' list
 375                 push( @new_unique, $diff->Items( 2 ) ) if $diff->Items( 2 );
 376             }
 377         }
 378         @unique = @new_unique;
 379     }
 380
 381     return;
 382 }
 383
 384
 385 sub _collation_hash {
 386     my $node = shift;
 387     return _cmp_str( $node->label );
 388 }
 389
 390 sub apply_edits {
 391     my $edit_sequence = shift;
 392     my @lemma_text = map { $base_text_index{$_} } sort( keys %base_text_index );
 393
 394     my $drift = 0;
 395     foreach my $correction ( @$edit_sequence ) {
 396         my( $offset, $length, $items ) = @$correction;
 397         my $realoffset = $offset + $drift;
 398         splice( @lemma_text, $realoffset, $length, @$items );
 399         $drift += @$items - $length;
 400     }
 401     return \@lemma_text;
 402 }
 403
 404
 405 # Helper function. Given a witness sigil, if it is a post-correctione
 406 # sigil,return the base witness.  If not, return a false value.
 407 sub _is_post_corr {
 408     my( $sigil ) = @_;
 409     if( $sigil =~ /^(.*?)(\s*\(?p\.\s*c\.\)?)$/ ) {
 410         return $1;
 411     }
 412     return undef;
 413 }
 414
 415 =item B<cmp_str>
 416
 417 Pretend you never saw this method.  Really it needs to not be hardcoded.
 418
 419 =cut
 420
 421 sub cmp_str {
 422     my( $reading ) = @_;
 423     my $word = $reading->label();
 424     $word = lc( $word );
 425     $word =~ s/\W//g;
 426     $word =~ s/v/u/g;
 427     $word =~ s/j/i/g;
 428     $word =~ s/cha/ca/g;
 429     $word =~ s/quatuor/quattuor/g;
 430     $word =~ s/ioannes/iohannes/g;
 431     return $word;
 432 }
 433
 434 =back
 435
 436 =head1 LICENSE
 437
 438 This package is free software and is provided "as is" without express
 439 or implied warranty.  You can redistribute it and/or modify it under
 440 the same terms as Perl itself.
 441
 442 =head1 AUTHOR
 443
 444 Tara L Andrews, aurum@cpan.org
 445
 446 =cut
 447
 448 1;