1 package Text::Tradition::Parser::BaseText;
7 use Text::Tradition::Parser::Util qw( collate_variants cmp_str
8 check_for_repeated add_hash_entry );
12 Text::Tradition::Parser::BaseText
16 use Text::Tradition::Parser::BaseText qw( merge_base );
17 merge_base( $graph, 'reference.txt', @apparatus_entries )
21 For an overview of the package, see the documentation for the
22 Text::Tradition module.
24 This module is meant for use with certain of the other Parser classes
25 - whenever a list of variants is given with reference to a base text,
26 these must be joined into a single collation. The parser should
27 therefore make a list of variants and their locations, and BaseText
28 will join those listed variants onto the reference text.
36 parse( $graph, $opts );
38 Takes an initialized graph and a hashref of options, which must include:
39 - 'base' - the base text referenced by the variants
40 - 'format' - the format of the variant list
41 - 'data' - the variants, in the given format.
46 my( $tradition, $opts ) = @_;
48 my $format_mod = 'Text::Tradition::Parser::' . $opts->{'input'};
50 # TODO Handle a string someday if we ever have a format other than KUL
51 my @apparatus_entries = $format_mod->can('read')->( $opts );
52 merge_base( $tradition->collation, $opts, @apparatus_entries );
57 merge_base( $graph, 'reference.txt', @apparatus_entries )
59 Takes three arguments: a newly-initialized Text::Tradition::Graph
60 object, a text file containing the reference text, and a list of
61 variants (apparatus entries). Adds the base text to the graph, and
62 joins the variants to that.
64 The list of variants is an array of hash references; each hash takes
66 { '_id' => line reference,
67 'rdg_0' => lemma reading,
68 'rdg_1' => first variant,
69 ... # and so on until all distinct readings are listed
70 'WitnessA' => 'rdg_0',
71 'WitnessB' => 'rdg_1',
72 ... # and so on until all witnesses are listed with their readings
75 Any hash key that is not of the form /^rdg_\d+$/ and that does not
76 begin with an underscore is assumed to be a witness name. Any 'meta'
77 information to be passed must be passed in a key with a leading
78 underscore in its name.
82 my $SHORTEND = ''; # Debug var - set this to limit the number of lines parsed
85 my $edits_required = {};
87 # edits_required -> wit -> [ { start_idx, end_idx, items } ]
90 my( $collation, $opts, @app_entries ) = @_;
91 my @base_line_starts = read_base( $opts->{'base'}, $collation );
94 foreach my $app ( @app_entries ) {
95 my( $line, $num ) = split( /\./, $app->{_id} );
96 # DEBUG with a short graph
97 last if $SHORTEND && $line > $SHORTEND;
98 # DEBUG for problematic entries
100 my $first_line_reading = $base_line_starts[ $line ];
101 my $too_far = $base_line_starts[ $line+1 ];
103 my $lemma = $app->{rdg_0};
105 # Is this the Nth occurrence of this reading in the line?
106 if( $lemma =~ s/(_)?(\d)$// ) {
109 my @lemma_words = split( /\s+/, $lemma );
111 # Now search for the lemma words within this line.
112 my $lemma_start = $first_line_reading;
115 while( $lemma_start ne $too_far ) {
117 if( $seen{ $lemma_start->id() } ) {
118 warn "Detected loop at " . $lemma_start->id() .
122 $seen{ $lemma_start->id() } = 1;
124 # Try to match the lemma.
126 print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
127 $lemma_words[0] . "...\n"
128 if "$line.$num" eq $scrutinize;
129 if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
130 # Skip it if we need a match that is not the first.
132 # Now we have to compare the rest of the words here.
133 if( scalar( @lemma_words ) > 1 ) {
135 $collation->next_reading( $lemma_start );
136 foreach my $w ( @lemma_words[1..$#lemma_words] ) {
137 printf STDERR "Now matching %s against %s\n",
138 cmp_str($next_reading), $w
139 if "$line.$num" eq $scrutinize;
140 if( $w ne cmp_str($next_reading) ) {
144 $lemma_end = $next_reading;
146 $collation->next_reading( $lemma_end );
150 $lemma_end = $lemma_start;
156 last unless ( $unmatch || !defined( $lemma_end ) );
158 $lemma_start = $collation->next_reading( $lemma_start );
161 unless( $lemma_end ) {
162 warn "No match found for @lemma_words at $line.$num";
166 # Now we have found the lemma; we will record an 'edit', in
167 # terms of a splice operation, for each subsequent reading.
168 # We also note which witnesses take the given edit.
170 my @lemma_set = $collation->reading_sequence( $lemma_start,
172 my @reading_sets = [ @lemma_set ];
174 # For each reading that is not rdg_0, we create the variant
175 # reading nodes, and store the range as an edit operation on
178 my %pc_seen; # Keep track of mss with explicit post-corr data
179 foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
180 my @mss = grep { $app->{$_} eq $k } keys( %$app );
182 # Keep track of what witnesses we have seen.
183 @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
184 # Keep track of which witnesses bear corrected readings here.
185 foreach my $m ( @mss ) {
186 my $base = _is_post_corr( $m );
190 next if $k eq 'rdg_0';
192 # Parse the variant into reading tokens.
193 # TODO don't hardcode the reading split operation
194 my @variant = split( /\s+/, $app->{$k} );
195 @variant = () if $app->{$k} eq '/'; # This is an omission.
197 my @variant_readings;
199 foreach my $vw ( @variant ) {
200 my $vwname = "$k/$line.$num.$ctr"; $ctr++;
201 my $vwreading = $collation->add_reading( {
204 push( @variant_readings, $vwreading );
207 $variant_objects->{$k} = { 'mss' => \@mss,
208 'reading' => \@variant_readings,
210 push( @reading_sets, \@variant_readings );
213 # Now collate and collapse the identical readings within the
214 # collated sets. Modifies the reading sets that were passed.
215 collate_variants( $collation, @reading_sets );
217 # Record any stated relationships between the nodes and the lemma.
218 set_relationships( $collation, $app, \@lemma_set, $variant_objects );
220 # Now create the splice-edit objects that will be used
221 # to reconstruct each witness.
223 foreach my $rkey ( keys %$variant_objects ) {
224 # Object is argument list for splice, so:
225 # offset, length, replacements
226 my $edit_object = [ $lemma_start->id,
227 scalar( @lemma_set ),
228 $variant_objects->{$rkey}->{reading} ];
229 foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
230 # Is this a p.c. entry?
231 my $base = _is_post_corr( $ms );
232 if( $base ) { # this is a post-corr witness
233 my $pc_key = $base . "_post";
234 add_hash_entry( $edits_required, $pc_key, $edit_object );
235 } else { # this is an ante-corr witness
236 my $pc_key = $ms . "_post";
237 add_hash_entry( $edits_required, $ms, $edit_object );
238 unless( $pc_seen{$ms} ) {
239 # If this witness carries no correction, add this
240 # same object to its post-corrected state.
241 add_hash_entry( $edits_required, $pc_key,
247 } # Finished going through the apparatus entries
249 # Now make the witness objects, and create their text sequences
250 foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) {
251 print STDERR "Creating witness $w\n";
252 my $witness_obj = $collation->tradition->add_witness(
253 sigil => $w, sourcetype => 'collation' );
254 my $debug; # = $w eq 'Vb11';
255 my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w}, $debug );
256 my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"}, $debug )
257 if exists( $edits_required->{$w."_post"} );
259 my @repeated = check_for_repeated( @ante_corr_seq );
260 warn "Repeated elements @repeated in $w a.c."
262 @repeated = check_for_repeated( @post_corr_seq );
263 warn "Repeated elements @repeated in $w p.c."
266 # Now save these paths in my witness object
267 if( @post_corr_seq ) {
268 $witness_obj->path( \@post_corr_seq );
269 $witness_obj->uncorrected_path( \@ante_corr_seq );
271 $witness_obj->path( \@ante_corr_seq );
275 # Now remove our 'base text' edges, which is to say, the only
276 # ones we have created so far. Also remove any unwitnessed
277 # lemma nodes (TODO unless we are treating base as witness)
278 foreach ( $collation->paths() ) {
279 $collation->del_path( $_, $collation->baselabel );
282 ### HACKY HACKY Do some one-off path corrections here.
283 if( $opts->{'input'} eq 'KUL' ) {
284 require 'data/boodts/s158.HACK';
285 KUL::HACK::pre_path_hack( $collation );
288 # Now walk paths and calculate positional rank.
289 $collation->make_witness_paths();
290 # Now delete any orphaned readings.
291 foreach my $r ( $collation->sequence->isolated_vertices ) {
292 print STDERR "Deleting unconnected reading $r / " .
293 $collation->reading( $r )->text . "\n";
294 $collation->del_reading( $r );
297 KUL::HACK::post_path_hack( $collation ) if $opts->{'input'} eq 'KUL';
298 # Have to check relationship validity at this point, because before that
300 # foreach my $rel ( $collation->relationships ) {
301 # next unless $rel->equal_rank;
302 # unless( Text::Tradition::Collation::relationship_valid( $rel->from, $rel->to ) ) {
303 # warn sprintf( "Relationship type %s between %s and %s is invalid, deleting",
304 # $rel->type, $rel->from->id, $rel->to->id );
307 unless( $opts->{'nocalc'} ) {
308 $collation->calculate_common_readings(); # will implicitly rank
314 my @line_beginnings = read_base( 'reference.txt', $collation );
316 Takes a text file and a (presumed empty) collation object, adds the
317 words as simple linear readings to the collation, and returns a
318 list of readings that represent the beginning of lines. This collation
319 is now the starting point for application of apparatus entries in
320 merge_base, e.g. from a CSV file or a Classical Text Editor file.
325 my( $base_file, $collation ) = @_;
327 # This array gives the first reading for each line. We put the
328 # common starting point in line zero.
329 my $last_reading = $collation->start;
330 $base_text_index{$last_reading->id} = 0;
331 my $lineref_array = [ $last_reading ]; # There is no line zero.
333 open( BASE, $base_file ) or die "Could not open file $base_file: $!";
336 # Make the readings, and connect them up for the base, but
337 # also save the first reading of each line in an array for the
339 # TODO use configurable reading separator
344 my $lineref = scalar @$lineref_array;
345 last if $SHORTEND && $lineref > $SHORTEND;
346 foreach my $w ( @words ) {
347 my $readingref = join( ',', $lineref, ++$wordref );
348 my $reading = $collation->add_reading( { id => $readingref, text => $w } );
350 push( @$lineref_array, $reading );
353 # Add edge paths in the graph, for easier tracking when
354 # we start applying corrections. These paths will be
355 # removed when we're done.
356 my $path = $collation->add_path( $last_reading, $reading,
357 $collation->baselabel );
358 $last_reading = $reading;
360 # Note an array index for the reading, for later correction splices.
361 $base_text_index{$readingref} = $i++;
365 # Ending point for all texts
366 $collation->add_path( $last_reading, $collation->end, $collation->baselabel );
367 push( @$lineref_array, $collation->end );
368 $base_text_index{$collation->end->id} = $i;
370 return( @$lineref_array );
373 sub set_relationships {
374 my( $collation, $app, $lemma, $variants ) = @_;
375 foreach my $rkey ( keys %$variants ) {
376 my $var = $variants->{$rkey}->{'reading'};
377 my $type = $app->{sprintf( "_%s_type", $rkey )};
378 my $noncorr = $app->{sprintf( "_%s_non_corr", $rkey )};
379 my $nonindep = $app->{sprintf( "_%s_non_indep", $rkey )};
381 my %rel_options = ();
382 $rel_options{'non_correctable'} = $noncorr if $noncorr && $noncorr =~ /^\d$/;
383 $rel_options{'non_indep'} = $nonindep if $nonindep && $nonindep =~ /^\d$/;
385 if( $type =~ /^(inv|tr|rep)$/i ) {
386 # Transposition or repetition: look for nodes with the
387 # same label but different IDs and mark them.
388 $type = 'repetition' if $type =~ /^rep/i;
389 $rel_options{'type'} = $type;
390 $rel_options{'equal_rank'} = undef;
392 foreach my $r ( @$lemma ) {
393 $labels{cmp_str( $r )} = $r;
395 foreach my $r( @$var ) {
396 if( exists $labels{$r->text} &&
397 $r->id ne $labels{$r->text}->id ) {
398 if( $type eq 'repetition' ) {
401 $collation->add_relationship( $r, $labels{$r->text}, \%rel_options );
402 } catch( Text::Tradition::Error $e ) {
403 warn "Could not set repetition relationship $r -> "
404 . $labels{$r->text} . ": " . $e->message;
409 $r->set_identical( $labels{$r->text} );
410 } catch( Text::Tradition::Error $e ) {
411 warn "Could not set transposition relationship $r -> "
412 . $labels{$r->text} . ": " . $e->message;
417 } elsif( $type =~ /^(gr|sp(el)?)$/i ) {
419 # Grammar/spelling/lexical: this can be a one-to-one or
420 # one-to-many mapping. We should think about merging
421 # readings if it is one-to-many.
423 $type = 'grammatical' if $type =~ /gr/i;
424 $type = 'spelling' if $type =~ /sp/i;
425 $type = 'repetition' if $type =~ /rep/i;
426 # $type = 'lexical' if $type =~ /lex/i;
427 $rel_options{'type'} = $type;
428 $rel_options{'equal_rank'} = 1;
429 if( @$lemma == @$var ) {
430 foreach my $i ( 0 .. $#{$lemma} ) {
432 $collation->add_relationship( $var->[$i], $lemma->[$i],
434 } catch( Text::Tradition::Error $e ) {
435 warn "Could not set $type relationship " . $var->[$i] . " -> "
436 . $lemma->[$i] . ": " . $e->message;
440 # An uneven many-to-many mapping. Skip for now.
441 # We really want to make a segment out of whatever we have.
442 # my $lemseg = @$lemma > 1 ? $collation->add_segment( @$lemma ) : $lemma->[0];
443 # my $varseg = @$var > 1 ? $collation->add_segment( @$var ) : $var->[0];
444 # $collation->add_relationship( $varseg, $lemseg, \%rel_options );
445 # if( @$lemma == 1 && @$var == 1 ) {
446 # $collation->add_relationship( $lemma->[0], $var->[0], \%rel_options );
449 } elsif( $type !~ /^(add|om|lex)$/i ) {
450 warn "Unrecognized type $type";
458 my( $collation, $edit_sequence, $debug ) = @_;
459 my @lemma_text = $collation->reading_sequence(
460 $collation->start, $collation->end );
462 foreach my $correction ( @$edit_sequence ) {
463 my( $lemma_start, $length, $items ) = @$correction;
464 my $offset = $base_text_index{$lemma_start};
465 my $realoffset = $offset + $drift;
467 $lemma_text[$realoffset]->id ne $lemma_start ) {
468 my @this_phrase = @lemma_text[$realoffset..$realoffset+$length-1];
471 my $l = $collation->reading( $lemma_start );
472 while( $i < $realoffset+$length ) {
473 push( @base_phrase, $l );
474 $l = $collation->next_reading( $l );
478 print STDERR sprintf( "Trying to replace %s (%s) starting at %d " .
479 "with %s (%s) with drift %d\n",
480 join( ' ', map {$_->text} @base_phrase ),
481 join( ' ', map {$_->id} @base_phrase ),
483 join( ' ', map {$_->text} @$items ),
484 join( ' ', map {$_->id} @$items ),
488 if( $lemma_text[$realoffset]->id ne $lemma_start ) {
489 warn( sprintf( "Should be replacing %s (%s) with %s (%s) " .
490 "but %s (%s) is there instead",
491 join( ' ', map {$_->text} @base_phrase ),
492 join( ' ', map {$_->id} @base_phrase ),
493 join( ' ', map {$_->text} @$items ),
494 join( ' ', map {$_->id} @$items ),
495 join( ' ', map {$_->text} @this_phrase ),
496 join( ' ', map {$_->id} @this_phrase ),
501 splice( @lemma_text, $realoffset, $length, @$items );
502 $drift += @$items - $length;
508 # Helper function. Given a witness sigil, if it is a post-correctione
509 # sigil,return the base witness. If not, return a false value.
512 if( $sigil =~ /^(.*?)(\s*\(?p\.\s*c\.\)?)$/ ) {
523 This package is free software and is provided "as is" without express
524 or implied warranty. You can redistribute it and/or modify it under
525 the same terms as Perl itself.
529 Tara L Andrews, aurum@cpan.org