1 package Text::Tradition::Parser::BaseText;
10 Text::Tradition::Parser::BaseText
14 use Text::Tradition::Parser::BaseText qw( merge_base );
15 merge_base( $graph, 'reference.txt', @apparatus_entries )
19 For an overview of the package, see the documentation for the
20 Text::Tradition::Graph module.
22 This module is meant for use with certain of the other Parser classes
23 - whenever a list of variants is given with reference to a base text,
24 these must be joined into a single collation. The parser should
25 therefore make a list of variants and their locations, and BaseText
26 will join those listed variants onto the reference text.
34 parse( $graph, %opts );
36 Takes an initialized graph and a set of options, which must include:
37 - 'base' - the base text referenced by the variants
38 - 'format' - the format of the variant list
39 - 'data' - the variants, in the given format.
44 my( $tradition, %opts ) = @_;
46 my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'};
48 my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} );
49 merge_base( $tradition->collation, $opts{'base'}, @apparatus_entries );
54 merge_base( $graph, 'reference.txt', @apparatus_entries )
56 Takes three arguments: a newly-initialized Text::Tradition::Graph
57 object, a text file containing the reference text, and a list of
58 variants (apparatus entries). Adds the base text to the graph, and
59 joins the variants to that.
61 The list of variants is an array of hash references; each hash takes
63 { '_id' => line reference,
64 'rdg_0' => lemma reading,
65 'rdg_1' => first variant,
66 ... # and so on until all distinct readings are listed
67 'WitnessA' => 'rdg_0',
68 'WitnessB' => 'rdg_1',
69 ... # and so on until all witnesses are listed with their readings
72 Any hash key that is not of the form /^rdg_\d+$/ and that does not
73 begin with an underscore is assumed to be a witness name. Any 'meta'
74 information to be passed must be passed in a key with a leading
75 underscore in its name.
79 my $SHORTEND = ''; # Debug var - set this to limit the number of lines parsed
82 my $edits_required = {};
84 # edits_required -> wit -> [ { start_idx, end_idx, items } ]
87 my( $collation, $base_file, @app_entries ) = @_;
88 my @base_line_starts = read_base( $base_file, $collation );
91 my @unwitnessed_lemma_nodes;
92 foreach my $app ( @app_entries ) {
93 my( $line, $num ) = split( /\./, $app->{_id} );
94 # DEBUG with a short graph
95 last if $SHORTEND && $line > $SHORTEND;
96 # DEBUG for problematic entries
98 my $first_line_reading = $base_line_starts[ $line ];
99 my $too_far = $base_line_starts[ $line+1 ];
101 my $lemma = $app->{rdg_0};
103 # Is this the Nth occurrence of this reading in the line?
104 if( $lemma =~ s/(_)?(\d)$// ) {
107 my @lemma_words = split( /\s+/, $lemma );
109 # Now search for the lemma words within this line.
110 my $lemma_start = $first_line_reading;
113 while( $lemma_start ne $too_far ) {
115 if( $seen{ $lemma_start->name() } ) {
116 warn "Detected loop at " . $lemma_start->name() .
120 $seen{ $lemma_start->name() } = 1;
122 # Try to match the lemma.
124 print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
125 $lemma_words[0] . "...\n"
126 if "$line.$num" eq $scrutinize;
127 if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
128 # Skip it if we need a match that is not the first.
130 # Now we have to compare the rest of the words here.
131 if( scalar( @lemma_words ) > 1 ) {
133 $collation->next_reading( $lemma_start );
134 foreach my $w ( @lemma_words[1..$#lemma_words] ) {
135 printf STDERR "Now matching %s against %s\n",
136 cmp_str($next_reading), $w
137 if "$line.$num" eq $scrutinize;
138 if( $w ne cmp_str($next_reading) ) {
142 $lemma_end = $next_reading;
144 $collation->next_reading( $lemma_end );
148 $lemma_end = $lemma_start;
154 last unless ( $unmatch || !defined( $lemma_end ) );
156 $lemma_start = $collation->next_reading( $lemma_start );
159 unless( $lemma_end ) {
160 warn "No match found for @lemma_words at $line.$num";
164 # Now we have found the lemma; we will record an 'edit', in
165 # terms of a splice operation, for each subsequent reading.
166 # We also note which witnesses take the given edit.
168 my @lemma_set = $collation->reading_sequence( $lemma_start,
170 my @reading_sets = [ @lemma_set ];
172 # For each reading that is not rdg_0, we create the variant
173 # reading nodes, and store the range as an edit operation on
176 my %pc_seen; # Keep track of mss with explicit post-corr data
177 foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
178 my @mss = grep { $app->{$_} eq $k } keys( %$app );
180 # Keep track of lemma nodes that don't actually appear in
181 # any MSS; we will want to remove them from the collation.
182 push( @unwitnessed_lemma_nodes, @lemma_set )
183 if !@mss && $k eq 'rdg_0';
185 # Keep track of what witnesses we have seen.
186 @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
187 # Keep track of which witnesses bear corrected readings here.
188 foreach my $m ( @mss ) {
189 my $base = _is_post_corr( $m );
193 next if $k eq 'rdg_0';
195 # Parse the variant into reading tokens.
196 # TODO don't hardcode the reading split operation
197 my @variant = split( /\s+/, $app->{$k} );
198 @variant = () if $app->{$k} eq '/'; # This is an omission.
200 my @variant_readings;
202 foreach my $vw ( @variant ) {
203 my $vwname = "$k/$line.$num.$ctr"; $ctr++;
204 my $vwreading = $collation->add_reading( $vwname );
205 $vwreading->text( $vw );
206 push( @variant_readings, $vwreading );
209 $variant_objects->{$k} = { 'mss' => \@mss,
210 'reading' => \@variant_readings,
212 push( @reading_sets, \@variant_readings );
215 # Now collate and collapse the identical readings within the
216 # collated sets. Modifies the reading sets that were passed.
217 collate_variants( $collation, @reading_sets );
219 # Record any stated relationships between the nodes and the lemma.
220 set_relationships( $collation, $app, \@lemma_set, $variant_objects );
222 # Now create the splice-edit objects that will be used
223 # to reconstruct each witness.
225 foreach my $rkey ( keys %$variant_objects ) {
226 # Object is argument list for splice, so:
227 # offset, length, replacements
228 my $edit_object = [ $lemma_start->name,
229 scalar( @lemma_set ),
230 $variant_objects->{$rkey}->{reading} ];
231 foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
232 # Is this a p.c. entry?
233 my $base = _is_post_corr( $ms );
234 if( $base ) { # this is a post-corr witness
235 my $pc_key = $base . "_post";
236 _add_hash_entry( $edits_required, $pc_key, $edit_object );
237 } else { # this is an ante-corr witness
238 my $pc_key = $ms . "_post";
239 _add_hash_entry( $edits_required, $ms, $edit_object );
240 unless( $pc_seen{$ms} ) {
241 # If this witness carries no correction, add this
242 # same object to its post-corrected state.
243 _add_hash_entry( $edits_required, $pc_key,
249 } # Finished going through the apparatus entries
251 # Now make the witness objects, and create their text sequences
252 foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) {
253 print STDERR "Creating witness $w\n";
254 my $witness_obj = $collation->tradition->add_witness( sigil => $w );
255 my $debug; # = $w eq 'Vb11';
256 my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w}, $debug );
257 my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"}, $debug )
258 if exists( $edits_required->{$w."_post"} );
260 my @repeated = _check_for_repeated( @ante_corr_seq );
261 warn "Repeated elements @repeated in $w a.c."
263 @repeated = _check_for_repeated( @post_corr_seq );
264 warn "Repeated elements @repeated in $w p.c."
267 # Now save these paths in my witness object
268 if( @post_corr_seq ) {
269 $witness_obj->path( \@post_corr_seq );
270 $witness_obj->uncorrected_path( \@ante_corr_seq );
272 $witness_obj->path( \@ante_corr_seq );
276 # Now remove our 'base text' edges, which is to say, the only
277 # ones we have created so far. Also remove any unwitnessed
278 # lemma nodes (TODO unless we are treating base as witness)
279 foreach ( $collation->paths() ) {
280 $collation->del_path( $_ );
282 foreach( @unwitnessed_lemma_nodes ) {
283 $collation->del_reading( $_ );
286 ### HACKY HACKY Do some one-off path corrections here.
287 if( $collation->linear ) {
289 my $end = $SHORTEND ? $SHORTEND : 155;
290 my $path = $c->tradition->witness('Vb11')->path;
292 $c->merge_readings( $c->reading('rdg_1/16.3.0'), $c->reading('rdg_1/16.2.1') );
293 splice( @$path, 209, 2, $c->reading( 'rdg_1/16.3.0' ), $c->reading( 'rdg_1/16.2.2' ) );
298 my $end = $SHORTEND ? $SHORTEND : 155;
300 my $path = $c->tradition->witness('Vb5')->path;
301 splice( @$path, 1436, 0, $c->reading('106,14') ) if $end > 106;
303 $path = $c->tradition->witness('Vb11')->path;
305 $c->merge_readings( $c->reading('rdg_1/16.3.0'), $c->reading('rdg_1/16.2.1') );
306 splice( @$path, 209, 2, $c->reading( 'rdg_1/16.3.0' ), $c->reading( '16,1' ) );
309 $path = $c->tradition->witness('Vb12')->uncorrected_path;
310 splice( @$path, 1828, 1, $c->reading('rdg_2/137.5.0') ) if $end > 137;
312 $path = $c->tradition->witness('Vb13')->path;
313 splice( @$path, 782, 0, $c->reading( '58,5' ) ) if $end > 58;
315 $path = $c->tradition->witness('Vb20')->uncorrected_path;
316 splice( @$path, 1251, 1, $c->reading( '94,6' ) ) if $end > 94;
318 $path = $c->tradition->witness('Vb26')->path;
319 splice( @$path, 618, 0, $c->reading('46,2') ) if $end > 46;
322 # Now walk paths and calculate positions.
323 my @common_readings =
324 $collation->make_witness_paths();
325 $collation->calculate_positions( @common_readings );
328 sub _check_for_repeated {
333 if( exists $unique{$_->name} ) {
334 push( @repeated, $_->name );
336 $unique{$_->name} = 1;
344 my @line_beginnings = read_base( 'reference.txt', $collation );
346 Takes a text file and a (presumed empty) collation object, adds the
347 words as simple linear readings to the collation, and returns a
348 list of readings that represent the beginning of lines. This collation
349 is now the starting point for application of apparatus entries in
350 merge_base, e.g. from a CSV file or a Classical Text Editor file.
355 my( $base_file, $collation ) = @_;
357 # This array gives the first reading for each line. We put the
358 # common starting point in line zero.
359 my $last_reading = $collation->start();
360 $base_text_index{$last_reading->name} = 0;
361 my $lineref_array = [ $last_reading ]; # There is no line zero.
363 open( BASE, $base_file ) or die "Could not open file $base_file: $!";
366 # Make the readings, and connect them up for the base, but
367 # also save the first reading of each line in an array for the
369 # TODO use configurable reading separator
374 my $lineref = scalar @$lineref_array;
375 last if $SHORTEND && $lineref > $SHORTEND;
376 foreach my $w ( @words ) {
377 my $readingref = join( ',', $lineref, ++$wordref );
378 my $reading = $collation->add_reading( $readingref );
379 $reading->text( $w );
381 push( @$lineref_array, $reading );
384 # Add edge paths in the graph, for easier tracking when
385 # we start applying corrections. These paths will be
386 # removed when we're done.
387 my $path = $collation->add_path( $last_reading, $reading,
388 $collation->baselabel );
389 $last_reading = $reading;
391 # Note an array index for the reading, for later correction splices.
392 $base_text_index{$readingref} = $i++;
396 # Ending point for all texts
397 my $endpoint = $collation->add_reading( '#END#' );
398 $collation->add_path( $last_reading, $endpoint, $collation->baselabel );
399 push( @$lineref_array, $endpoint );
400 $base_text_index{$endpoint->name} = $i;
402 return( @$lineref_array );
405 =item B<collate_variants>
407 collate_variants( $collation, @reading_ranges )
409 Given a set of readings in the form
410 ( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
411 walks through each to identify those readings that are identical. The
412 collation is a Text::Tradition::Collation object; the elements of
413 @readings are Text::Tradition::Collation::Reading objects that appear
414 on the collation graph.
416 TODO: Handle collapsed and non-collapsed transpositions.
420 sub collate_variants {
421 my( $collation, @reading_sets ) = @_;
423 # Two different ways to do this, depending on whether we want
424 # transposed reading nodes to be merged into one (producing a
425 # nonlinear, bidirectional graph) or not (producing a relatively
426 # linear, unidirectional graph.)
427 return $collation->linear ? collate_linearly( @_ )
428 : collate_nonlinearly( @_ );
431 sub collate_linearly {
432 my( $collation, $lemma_set, @variant_sets ) = @_;
435 push( @unique, @$lemma_set );
436 while( @variant_sets ) {
437 my $variant_set = shift @variant_sets;
438 # Use diff to do this job
439 my $diff = Algorithm::Diff->new( \@unique, $variant_set,
440 {'keyGen' => \&_collation_hash} );
443 while( $diff->Next ) {
446 my @l = $diff->Items( 1 );
447 my @v = $diff->Items( 2 );
448 foreach my $i ( 0 .. $#l ) {
449 if( !$merged{$l[$i]->name} ) {
450 $collation->merge_readings( $l[$i], $v[$i] );
451 $merged{$l[$i]->name} = 1;
453 print STDERR "Would have double merged " . $l[$i]->name . "\n";
456 # splice the lemma nodes into the variant set
457 my( $offset ) = $diff->Get( 'min2' );
458 splice( @$variant_set, $offset, scalar( @l ), @l );
459 push( @new_unique, @l );
461 # Keep the old unique readings
462 push( @new_unique, $diff->Items( 1 ) ) if $diff->Items( 1 );
463 # Add the new readings to the 'unique' list
464 push( @new_unique, $diff->Items( 2 ) ) if $diff->Items( 2 );
467 @unique = @new_unique;
471 sub collate_nonlinearly {
472 my( $collation, $lemma_set, @variant_sets ) = @_;
475 push( @unique, @$lemma_set );
476 while( @variant_sets ) {
477 my $variant_set = shift @variant_sets;
478 # Simply match the first reading that carries the same word, so
479 # long as that reading has not yet been used to match another
480 # word in this variant. That way lies loopy madness.
483 foreach my $idx ( 0 .. $#{$variant_set} ) {
484 my $vw = $variant_set->[$idx];
485 my @same = grep { cmp_str( $_ ) eq $vw->label } @unique;
488 foreach my $i ( 0 .. $#same ) {
489 unless( $merged{$same[$i]->name} ) {
490 #print STDERR sprintf( "Merging %s into %s\n",
493 $collation->merge_readings( $same[$i], $vw );
494 $merged{$same[$i]->name} = 1;
496 $variant_set->[$idx] = $same[$i];
500 unless( @same && defined($matched) ) {
501 push( @distinct, $vw );
504 push( @unique, @distinct );
510 sub _collation_hash {
512 return cmp_str( $node );
515 sub set_relationships {
516 my( $collation, $app, $lemma, $variants ) = @_;
517 foreach my $rkey ( keys %$variants ) {
518 my $var = $variants->{$rkey}->{'reading'};
519 my $type = $app->{sprintf( "_%s_type", $rkey )};
520 my $noncorr = $app->{sprintf( "_%s_non_corr", $rkey )};
521 my $nonindep = $app->{sprintf( "_%s_non_indep", $rkey )};
523 my %rel_options = ();
524 $rel_options{'non_correctable'} = $noncorr if $noncorr && $noncorr =~ /^\d$/;
525 $rel_options{'non_indep'} = $nonindep if $nonindep && $nonindep =~ /^\d$/;
527 if( $type =~ /^(inv|tr|rep)$/i ) {
528 # Transposition or repetition: look for nodes with the
529 # same label but different IDs and mark them.
530 $type = 'repetition' if $type =~ /^rep/i;
531 $rel_options{'type'} = $type;
533 foreach my $r ( @$lemma ) {
534 $labels{cmp_str( $r )} = $r;
536 foreach my $r( @$var ) {
537 if( exists $labels{$r->label} &&
538 $r->name ne $labels{$r->label}->name ) {
539 if( $type eq 'repetition' ) {
541 $collation->add_relationship( $r, $labels{$r->label}, \%rel_options );
544 $r->set_identical( $labels{$r->label} );
548 } elsif( $type =~ /^(gr|lex|sp(el)?)$/i ) {
550 # Grammar/spelling/lexical: this can be a one-to-one or
551 # one-to-many mapping. We should think about merging
552 # readings if it is one-to-many.
554 $type = 'grammatical' if $type =~ /gr/i;
555 $type = 'spelling' if $type =~ /sp/i;
556 $type = 'repetition' if $type =~ /rep/i;
557 $type = 'lexical' if $type =~ /lex/i;
558 $rel_options{'type'} = $type;
559 if( @$lemma == @$var ) {
560 foreach my $i ( 0 .. $#{$lemma} ) {
561 $collation->add_relationship( $var->[$i], $lemma->[$i],
565 # An uneven many-to-many mapping. Make a segment out of
567 my $lemseg = @$lemma > 1 ? $collation->add_segment( @$lemma ) : $lemma->[0];
568 my $varseg = @$var > 1 ? $collation->add_segment( @$var ) : $var->[0];
569 $collation->add_relationship( $varseg, $lemseg, \%rel_options );
571 } elsif( $type !~ /^(add|om)$/i ) {
572 warn "Unrecognized type $type";
580 my( $collation, $edit_sequence, $debug ) = @_;
581 my @lemma_text = $collation->reading_sequence( $collation->start,
582 $collation->reading( '#END#' ) );
584 foreach my $correction ( @$edit_sequence ) {
585 my( $lemma_start, $length, $items ) = @$correction;
586 my $offset = $base_text_index{$lemma_start};
587 my $realoffset = $offset + $drift;
589 $lemma_text[$realoffset]->name ne $lemma_start ) {
590 my @this_phrase = @lemma_text[$realoffset..$realoffset+$length-1];
593 my $l = $collation->reading( $lemma_start );
594 while( $i < $realoffset+$length ) {
595 push( @base_phrase, $l );
596 $l = $collation->next_reading( $l );
600 print STDERR sprintf( "Trying to replace %s (%s) starting at %d " .
601 "with %s (%s) with drift %d\n",
602 join( ' ', map {$_->label} @base_phrase ),
603 join( ' ', map {$_->name} @base_phrase ),
605 join( ' ', map {$_->label} @$items ),
606 join( ' ', map {$_->name} @$items ),
610 if( $lemma_text[$realoffset]->name ne $lemma_start ) {
611 warn( sprintf( "Should be replacing %s (%s) with %s (%s) " .
612 "but %s (%s) is there instead",
613 join( ' ', map {$_->label} @base_phrase ),
614 join( ' ', map {$_->name} @base_phrase ),
615 join( ' ', map {$_->label} @$items ),
616 join( ' ', map {$_->name} @$items ),
617 join( ' ', map {$_->label} @this_phrase ),
618 join( ' ', map {$_->name} @this_phrase ),
623 splice( @lemma_text, $realoffset, $length, @$items );
624 $drift += @$items - $length;
630 # Helper function. Given a witness sigil, if it is a post-correctione
631 # sigil,return the base witness. If not, return a false value.
634 if( $sigil =~ /^(.*?)(\s*\(?p\.\s*c\.\)?)$/ ) {
640 sub _add_hash_entry {
641 my( $hash, $key, $entry ) = @_;
642 if( exists $hash->{$key} ) {
643 push( @{$hash->{$key}}, $entry );
645 $hash->{$key} = [ $entry ];
652 Pretend you never saw this method. Really it needs to not be hardcoded.
658 my $word = $reading->label();
664 $word =~ s/quatuor/quattuor/g;
665 $word =~ s/ioannes/iohannes/g;
673 This package is free software and is provided "as is" without express
674 or implied warranty. You can redistribute it and/or modify it under
675 the same terms as Perl itself.
679 Tara L Andrews, aurum@cpan.org