1 package Text::Tradition::Parser::BaseText;
6 use Text::Tradition::Parser::Util qw( collate_variants cmp_str check_for_repeated add_hash_entry );
10 Text::Tradition::Parser::BaseText
14 use Text::Tradition::Parser::BaseText qw( merge_base );
15 merge_base( $graph, 'reference.txt', @apparatus_entries )
19 For an overview of the package, see the documentation for the
20 Text::Tradition module.
22 This module is meant for use with certain of the other Parser classes
23 - whenever a list of variants is given with reference to a base text,
24 these must be joined into a single collation. The parser should
25 therefore make a list of variants and their locations, and BaseText
26 will join those listed variants onto the reference text.
34 parse( $graph, $opts );
36 Takes an initialized graph and a hashref of options, which must include:
37 - 'base' - the base text referenced by the variants
38 - 'format' - the format of the variant list
39 - 'data' - the variants, in the given format.
44 my( $tradition, $opts ) = @_;
46 my $format_mod = 'Text::Tradition::Parser::' . $opts->{'input'};
48 # TODO Handle a string someday if we ever have a format other than KUL
49 my @apparatus_entries = $format_mod->can('read')->( $opts );
50 merge_base( $tradition->collation, $opts->{'base'}, @apparatus_entries );
55 merge_base( $graph, 'reference.txt', @apparatus_entries )
57 Takes three arguments: a newly-initialized Text::Tradition::Graph
58 object, a text file containing the reference text, and a list of
59 variants (apparatus entries). Adds the base text to the graph, and
60 joins the variants to that.
62 The list of variants is an array of hash references; each hash takes
64 { '_id' => line reference,
65 'rdg_0' => lemma reading,
66 'rdg_1' => first variant,
67 ... # and so on until all distinct readings are listed
68 'WitnessA' => 'rdg_0',
69 'WitnessB' => 'rdg_1',
70 ... # and so on until all witnesses are listed with their readings
73 Any hash key that is not of the form /^rdg_\d+$/ and that does not
74 begin with an underscore is assumed to be a witness name. Any 'meta'
75 information to be passed must be passed in a key with a leading
76 underscore in its name.
80 my $SHORTEND = ''; # Debug var - set this to limit the number of lines parsed
83 my $edits_required = {};
85 # edits_required -> wit -> [ { start_idx, end_idx, items } ]
88 my( $collation, $base_file, @app_entries ) = @_;
89 my @base_line_starts = read_base( $base_file, $collation );
92 foreach my $app ( @app_entries ) {
93 my( $line, $num ) = split( /\./, $app->{_id} );
94 # DEBUG with a short graph
95 last if $SHORTEND && $line > $SHORTEND;
96 # DEBUG for problematic entries
98 my $first_line_reading = $base_line_starts[ $line ];
99 my $too_far = $base_line_starts[ $line+1 ];
101 my $lemma = $app->{rdg_0};
103 # Is this the Nth occurrence of this reading in the line?
104 if( $lemma =~ s/(_)?(\d)$// ) {
107 my @lemma_words = split( /\s+/, $lemma );
109 # Now search for the lemma words within this line.
110 my $lemma_start = $first_line_reading;
113 while( $lemma_start ne $too_far ) {
115 if( $seen{ $lemma_start->id() } ) {
116 warn "Detected loop at " . $lemma_start->id() .
120 $seen{ $lemma_start->id() } = 1;
122 # Try to match the lemma.
124 print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
125 $lemma_words[0] . "...\n"
126 if "$line.$num" eq $scrutinize;
127 if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
128 # Skip it if we need a match that is not the first.
130 # Now we have to compare the rest of the words here.
131 if( scalar( @lemma_words ) > 1 ) {
133 $collation->next_reading( $lemma_start );
134 foreach my $w ( @lemma_words[1..$#lemma_words] ) {
135 printf STDERR "Now matching %s against %s\n",
136 cmp_str($next_reading), $w
137 if "$line.$num" eq $scrutinize;
138 if( $w ne cmp_str($next_reading) ) {
142 $lemma_end = $next_reading;
144 $collation->next_reading( $lemma_end );
148 $lemma_end = $lemma_start;
154 last unless ( $unmatch || !defined( $lemma_end ) );
156 $lemma_start = $collation->next_reading( $lemma_start );
159 unless( $lemma_end ) {
160 warn "No match found for @lemma_words at $line.$num";
164 # Now we have found the lemma; we will record an 'edit', in
165 # terms of a splice operation, for each subsequent reading.
166 # We also note which witnesses take the given edit.
168 my @lemma_set = $collation->reading_sequence( $lemma_start,
170 my @reading_sets = [ @lemma_set ];
172 # For each reading that is not rdg_0, we create the variant
173 # reading nodes, and store the range as an edit operation on
176 my %pc_seen; # Keep track of mss with explicit post-corr data
177 foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
178 my @mss = grep { $app->{$_} eq $k } keys( %$app );
180 # Keep track of what witnesses we have seen.
181 @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
182 # Keep track of which witnesses bear corrected readings here.
183 foreach my $m ( @mss ) {
184 my $base = _is_post_corr( $m );
188 next if $k eq 'rdg_0';
190 # Parse the variant into reading tokens.
191 # TODO don't hardcode the reading split operation
192 my @variant = split( /\s+/, $app->{$k} );
193 @variant = () if $app->{$k} eq '/'; # This is an omission.
195 my @variant_readings;
197 foreach my $vw ( @variant ) {
198 my $vwname = "$k/$line.$num.$ctr"; $ctr++;
199 my $vwreading = $collation->add_reading( {
202 push( @variant_readings, $vwreading );
205 $variant_objects->{$k} = { 'mss' => \@mss,
206 'reading' => \@variant_readings,
208 push( @reading_sets, \@variant_readings );
211 # Now collate and collapse the identical readings within the
212 # collated sets. Modifies the reading sets that were passed.
213 collate_variants( $collation, @reading_sets );
215 # Record any stated relationships between the nodes and the lemma.
216 set_relationships( $collation, $app, \@lemma_set, $variant_objects );
218 # Now create the splice-edit objects that will be used
219 # to reconstruct each witness.
221 foreach my $rkey ( keys %$variant_objects ) {
222 # Object is argument list for splice, so:
223 # offset, length, replacements
224 my $edit_object = [ $lemma_start->id,
225 scalar( @lemma_set ),
226 $variant_objects->{$rkey}->{reading} ];
227 foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
228 # Is this a p.c. entry?
229 my $base = _is_post_corr( $ms );
230 if( $base ) { # this is a post-corr witness
231 my $pc_key = $base . "_post";
232 add_hash_entry( $edits_required, $pc_key, $edit_object );
233 } else { # this is an ante-corr witness
234 my $pc_key = $ms . "_post";
235 add_hash_entry( $edits_required, $ms, $edit_object );
236 unless( $pc_seen{$ms} ) {
237 # If this witness carries no correction, add this
238 # same object to its post-corrected state.
239 add_hash_entry( $edits_required, $pc_key,
245 } # Finished going through the apparatus entries
247 # Now make the witness objects, and create their text sequences
248 foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) {
249 print STDERR "Creating witness $w\n";
250 my $witness_obj = $collation->tradition->add_witness( sigil => $w );
251 my $debug; # = $w eq 'Vb11';
252 my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w}, $debug );
253 my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"}, $debug )
254 if exists( $edits_required->{$w."_post"} );
256 my @repeated = check_for_repeated( @ante_corr_seq );
257 warn "Repeated elements @repeated in $w a.c."
259 @repeated = check_for_repeated( @post_corr_seq );
260 warn "Repeated elements @repeated in $w p.c."
263 # Now save these paths in my witness object
264 if( @post_corr_seq ) {
265 $witness_obj->path( \@post_corr_seq );
266 $witness_obj->uncorrected_path( \@ante_corr_seq );
268 $witness_obj->path( \@ante_corr_seq );
272 # Now remove our 'base text' edges, which is to say, the only
273 # ones we have created so far. Also remove any unwitnessed
274 # lemma nodes (TODO unless we are treating base as witness)
275 foreach ( $collation->paths() ) {
276 $collation->del_path( $_, $collation->baselabel );
279 ### HACKY HACKY Do some one-off path corrections here.
280 require( 'data/boodts/s158.HACK' );
281 KUL::HACK::pre_path_hack( $collation );
283 # Now walk paths and calculate positional rank.
284 $collation->make_witness_paths();
285 # Now delete any orphaned readings.
286 foreach my $r ( $collation->sequence->isolated_vertices ) {
287 print STDERR "Deleting unconnected reading $r / " .
288 $collation->reading( $r )->text . "\n";
289 $collation->del_reading( $r );
292 KUL::HACK::post_path_hack( $collation );
293 # Have to check relationship validity at this point, because before that
295 # foreach my $rel ( $collation->relationships ) {
296 # next unless $rel->equal_rank;
297 # unless( Text::Tradition::Collation::relationship_valid( $rel->from, $rel->to ) ) {
298 # warn sprintf( "Relationship type %s between %s and %s is invalid, deleting",
299 # $rel->type, $rel->from->id, $rel->to->id );
302 $collation->calculate_ranks();
307 my @line_beginnings = read_base( 'reference.txt', $collation );
309 Takes a text file and a (presumed empty) collation object, adds the
310 words as simple linear readings to the collation, and returns a
311 list of readings that represent the beginning of lines. This collation
312 is now the starting point for application of apparatus entries in
313 merge_base, e.g. from a CSV file or a Classical Text Editor file.
318 my( $base_file, $collation ) = @_;
320 # This array gives the first reading for each line. We put the
321 # common starting point in line zero.
322 my $last_reading = $collation->start;
323 $base_text_index{$last_reading->id} = 0;
324 my $lineref_array = [ $last_reading ]; # There is no line zero.
326 open( BASE, $base_file ) or die "Could not open file $base_file: $!";
329 # Make the readings, and connect them up for the base, but
330 # also save the first reading of each line in an array for the
332 # TODO use configurable reading separator
337 my $lineref = scalar @$lineref_array;
338 last if $SHORTEND && $lineref > $SHORTEND;
339 foreach my $w ( @words ) {
340 my $readingref = join( ',', $lineref, ++$wordref );
341 my $reading = $collation->add_reading( { id => $readingref, text => $w } );
343 push( @$lineref_array, $reading );
346 # Add edge paths in the graph, for easier tracking when
347 # we start applying corrections. These paths will be
348 # removed when we're done.
349 my $path = $collation->add_path( $last_reading, $reading,
350 $collation->baselabel );
351 $last_reading = $reading;
353 # Note an array index for the reading, for later correction splices.
354 $base_text_index{$readingref} = $i++;
358 # Ending point for all texts
359 $collation->add_path( $last_reading, $collation->end, $collation->baselabel );
360 push( @$lineref_array, $collation->end );
361 $base_text_index{$collation->end->id} = $i;
363 return( @$lineref_array );
366 sub set_relationships {
367 my( $collation, $app, $lemma, $variants ) = @_;
368 foreach my $rkey ( keys %$variants ) {
369 my $var = $variants->{$rkey}->{'reading'};
370 my $type = $app->{sprintf( "_%s_type", $rkey )};
371 my $noncorr = $app->{sprintf( "_%s_non_corr", $rkey )};
372 my $nonindep = $app->{sprintf( "_%s_non_indep", $rkey )};
374 my %rel_options = ();
375 $rel_options{'non_correctable'} = $noncorr if $noncorr && $noncorr =~ /^\d$/;
376 $rel_options{'non_indep'} = $nonindep if $nonindep && $nonindep =~ /^\d$/;
378 if( $type =~ /^(inv|tr|rep)$/i ) {
379 # Transposition or repetition: look for nodes with the
380 # same label but different IDs and mark them.
381 $type = 'repetition' if $type =~ /^rep/i;
382 $rel_options{'type'} = $type;
383 $rel_options{'equal_rank'} = undef;
385 foreach my $r ( @$lemma ) {
386 $labels{cmp_str( $r )} = $r;
388 foreach my $r( @$var ) {
389 if( exists $labels{$r->text} &&
390 $r->id ne $labels{$r->text}->id ) {
391 if( $type eq 'repetition' ) {
393 $collation->add_relationship( $r, $labels{$r->text}, \%rel_options );
396 $r->set_identical( $labels{$r->text} );
400 } elsif( $type =~ /^(gr|sp(el)?)$/i ) {
402 # Grammar/spelling/lexical: this can be a one-to-one or
403 # one-to-many mapping. We should think about merging
404 # readings if it is one-to-many.
406 $type = 'grammatical' if $type =~ /gr/i;
407 $type = 'spelling' if $type =~ /sp/i;
408 $type = 'repetition' if $type =~ /rep/i;
409 # $type = 'lexical' if $type =~ /lex/i;
410 $rel_options{'type'} = $type;
411 $rel_options{'equal_rank'} = 1;
412 if( @$lemma == @$var ) {
413 foreach my $i ( 0 .. $#{$lemma} ) {
414 $collation->add_relationship( $var->[$i], $lemma->[$i],
418 # An uneven many-to-many mapping. Skip for now.
419 # We really want to make a segment out of whatever we have.
420 # my $lemseg = @$lemma > 1 ? $collation->add_segment( @$lemma ) : $lemma->[0];
421 # my $varseg = @$var > 1 ? $collation->add_segment( @$var ) : $var->[0];
422 # $collation->add_relationship( $varseg, $lemseg, \%rel_options );
423 if( @$lemma == 1 && @$var == 1 ) {
424 $collation->add_relationship( $lemma->[0], $var->[0], \%rel_options );
427 } elsif( $type !~ /^(add|om|lex)$/i ) {
428 warn "Unrecognized type $type";
436 my( $collation, $edit_sequence, $debug ) = @_;
437 my @lemma_text = $collation->reading_sequence(
438 $collation->start, $collation->end );
440 foreach my $correction ( @$edit_sequence ) {
441 my( $lemma_start, $length, $items ) = @$correction;
442 my $offset = $base_text_index{$lemma_start};
443 my $realoffset = $offset + $drift;
445 $lemma_text[$realoffset]->id ne $lemma_start ) {
446 my @this_phrase = @lemma_text[$realoffset..$realoffset+$length-1];
449 my $l = $collation->reading( $lemma_start );
450 while( $i < $realoffset+$length ) {
451 push( @base_phrase, $l );
452 $l = $collation->next_reading( $l );
456 print STDERR sprintf( "Trying to replace %s (%s) starting at %d " .
457 "with %s (%s) with drift %d\n",
458 join( ' ', map {$_->text} @base_phrase ),
459 join( ' ', map {$_->id} @base_phrase ),
461 join( ' ', map {$_->text} @$items ),
462 join( ' ', map {$_->id} @$items ),
466 if( $lemma_text[$realoffset]->id ne $lemma_start ) {
467 warn( sprintf( "Should be replacing %s (%s) with %s (%s) " .
468 "but %s (%s) is there instead",
469 join( ' ', map {$_->text} @base_phrase ),
470 join( ' ', map {$_->id} @base_phrase ),
471 join( ' ', map {$_->text} @$items ),
472 join( ' ', map {$_->id} @$items ),
473 join( ' ', map {$_->text} @this_phrase ),
474 join( ' ', map {$_->id} @this_phrase ),
479 splice( @lemma_text, $realoffset, $length, @$items );
480 $drift += @$items - $length;
486 # Helper function. Given a witness sigil, if it is a post-correctione
487 # sigil,return the base witness. If not, return a false value.
490 if( $sigil =~ /^(.*?)(\s*\(?p\.\s*c\.\)?)$/ ) {
501 This package is free software and is provided "as is" without express
502 or implied warranty. You can redistribute it and/or modify it under
503 the same terms as Perl itself.
507 Tara L Andrews, aurum@cpan.org