1 package Text::Tradition::Parser::BaseText;
6 use Text::Tradition::Parser::Util qw( collate_variants cmp_str check_for_repeated add_hash_entry );
10 Text::Tradition::Parser::BaseText
14 use Text::Tradition::Parser::BaseText qw( merge_base );
15 merge_base( $graph, 'reference.txt', @apparatus_entries )
19 For an overview of the package, see the documentation for the
20 Text::Tradition module.
22 This module is meant for use with certain of the other Parser classes
23 - whenever a list of variants is given with reference to a base text,
24 these must be joined into a single collation. The parser should
25 therefore make a list of variants and their locations, and BaseText
26 will join those listed variants onto the reference text.
34 parse( $graph, $opts );
36 Takes an initialized graph and a hashref of options, which must include:
37 - 'base' - the base text referenced by the variants
38 - 'format' - the format of the variant list
39 - 'data' - the variants, in the given format.
44 my( $tradition, $opts ) = @_;
46 my $format_mod = 'Text::Tradition::Parser::' . $opts->{'input'};
48 # TODO Handle a string someday if we ever have a format other than KUL
49 my @apparatus_entries = $format_mod->can('read')->( $opts );
50 merge_base( $tradition->collation, $opts->{'base'}, @apparatus_entries );
55 merge_base( $graph, 'reference.txt', @apparatus_entries )
57 Takes three arguments: a newly-initialized Text::Tradition::Graph
58 object, a text file containing the reference text, and a list of
59 variants (apparatus entries). Adds the base text to the graph, and
60 joins the variants to that.
62 The list of variants is an array of hash references; each hash takes
64 { '_id' => line reference,
65 'rdg_0' => lemma reading,
66 'rdg_1' => first variant,
67 ... # and so on until all distinct readings are listed
68 'WitnessA' => 'rdg_0',
69 'WitnessB' => 'rdg_1',
70 ... # and so on until all witnesses are listed with their readings
73 Any hash key that is not of the form /^rdg_\d+$/ and that does not
74 begin with an underscore is assumed to be a witness name. Any 'meta'
75 information to be passed must be passed in a key with a leading
76 underscore in its name.
80 my $SHORTEND = ''; # Debug var - set this to limit the number of lines parsed
83 my $edits_required = {};
85 # edits_required -> wit -> [ { start_idx, end_idx, items } ]
88 my( $collation, $base_file, @app_entries ) = @_;
89 my @base_line_starts = read_base( $base_file, $collation );
92 my @unwitnessed_lemma_nodes;
93 foreach my $app ( @app_entries ) {
94 my( $line, $num ) = split( /\./, $app->{_id} );
95 # DEBUG with a short graph
96 last if $SHORTEND && $line > $SHORTEND;
97 # DEBUG for problematic entries
99 my $first_line_reading = $base_line_starts[ $line ];
100 my $too_far = $base_line_starts[ $line+1 ];
102 my $lemma = $app->{rdg_0};
104 # Is this the Nth occurrence of this reading in the line?
105 if( $lemma =~ s/(_)?(\d)$// ) {
108 my @lemma_words = split( /\s+/, $lemma );
110 # Now search for the lemma words within this line.
111 my $lemma_start = $first_line_reading;
114 while( $lemma_start ne $too_far ) {
116 if( $seen{ $lemma_start->name() } ) {
117 warn "Detected loop at " . $lemma_start->name() .
121 $seen{ $lemma_start->name() } = 1;
123 # Try to match the lemma.
125 print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
126 $lemma_words[0] . "...\n"
127 if "$line.$num" eq $scrutinize;
128 if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
129 # Skip it if we need a match that is not the first.
131 # Now we have to compare the rest of the words here.
132 if( scalar( @lemma_words ) > 1 ) {
134 $collation->next_reading( $lemma_start );
135 foreach my $w ( @lemma_words[1..$#lemma_words] ) {
136 printf STDERR "Now matching %s against %s\n",
137 cmp_str($next_reading), $w
138 if "$line.$num" eq $scrutinize;
139 if( $w ne cmp_str($next_reading) ) {
143 $lemma_end = $next_reading;
145 $collation->next_reading( $lemma_end );
149 $lemma_end = $lemma_start;
155 last unless ( $unmatch || !defined( $lemma_end ) );
157 $lemma_start = $collation->next_reading( $lemma_start );
160 unless( $lemma_end ) {
161 warn "No match found for @lemma_words at $line.$num";
165 # Now we have found the lemma; we will record an 'edit', in
166 # terms of a splice operation, for each subsequent reading.
167 # We also note which witnesses take the given edit.
169 my @lemma_set = $collation->reading_sequence( $lemma_start,
171 my @reading_sets = [ @lemma_set ];
173 # For each reading that is not rdg_0, we create the variant
174 # reading nodes, and store the range as an edit operation on
177 my %pc_seen; # Keep track of mss with explicit post-corr data
178 foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
179 my @mss = grep { $app->{$_} eq $k } keys( %$app );
181 # Keep track of lemma nodes that don't actually appear in
182 # any MSS; we will want to remove them from the collation.
183 push( @unwitnessed_lemma_nodes, @lemma_set )
184 if !@mss && $k eq 'rdg_0';
186 # Keep track of what witnesses we have seen.
187 @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
188 # Keep track of which witnesses bear corrected readings here.
189 foreach my $m ( @mss ) {
190 my $base = _is_post_corr( $m );
194 next if $k eq 'rdg_0';
196 # Parse the variant into reading tokens.
197 # TODO don't hardcode the reading split operation
198 my @variant = split( /\s+/, $app->{$k} );
199 @variant = () if $app->{$k} eq '/'; # This is an omission.
201 my @variant_readings;
203 foreach my $vw ( @variant ) {
204 my $vwname = "$k/$line.$num.$ctr"; $ctr++;
205 my $vwreading = $collation->add_reading( $vwname );
206 $vwreading->text( $vw );
207 push( @variant_readings, $vwreading );
210 $variant_objects->{$k} = { 'mss' => \@mss,
211 'reading' => \@variant_readings,
213 push( @reading_sets, \@variant_readings );
216 # Now collate and collapse the identical readings within the
217 # collated sets. Modifies the reading sets that were passed.
218 collate_variants( $collation, @reading_sets );
220 # Record any stated relationships between the nodes and the lemma.
221 set_relationships( $collation, $app, \@lemma_set, $variant_objects );
223 # Now create the splice-edit objects that will be used
224 # to reconstruct each witness.
226 foreach my $rkey ( keys %$variant_objects ) {
227 # Object is argument list for splice, so:
228 # offset, length, replacements
229 my $edit_object = [ $lemma_start->name,
230 scalar( @lemma_set ),
231 $variant_objects->{$rkey}->{reading} ];
232 foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
233 # Is this a p.c. entry?
234 my $base = _is_post_corr( $ms );
235 if( $base ) { # this is a post-corr witness
236 my $pc_key = $base . "_post";
237 add_hash_entry( $edits_required, $pc_key, $edit_object );
238 } else { # this is an ante-corr witness
239 my $pc_key = $ms . "_post";
240 add_hash_entry( $edits_required, $ms, $edit_object );
241 unless( $pc_seen{$ms} ) {
242 # If this witness carries no correction, add this
243 # same object to its post-corrected state.
244 add_hash_entry( $edits_required, $pc_key,
250 } # Finished going through the apparatus entries
252 # Now make the witness objects, and create their text sequences
253 foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) {
254 print STDERR "Creating witness $w\n";
255 my $witness_obj = $collation->tradition->add_witness( sigil => $w );
256 my $debug; # = $w eq 'Vb11';
257 my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w}, $debug );
258 my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"}, $debug )
259 if exists( $edits_required->{$w."_post"} );
261 my @repeated = check_for_repeated( @ante_corr_seq );
262 warn "Repeated elements @repeated in $w a.c."
264 @repeated = check_for_repeated( @post_corr_seq );
265 warn "Repeated elements @repeated in $w p.c."
268 # Now save these paths in my witness object
269 if( @post_corr_seq ) {
270 $witness_obj->path( \@post_corr_seq );
271 $witness_obj->uncorrected_path( \@ante_corr_seq );
273 $witness_obj->path( \@ante_corr_seq );
277 # Now remove our 'base text' edges, which is to say, the only
278 # ones we have created so far. Also remove any unwitnessed
279 # lemma nodes (TODO unless we are treating base as witness)
280 foreach ( $collation->paths() ) {
281 $collation->del_path( $_ );
283 foreach( @unwitnessed_lemma_nodes ) {
284 $collation->del_reading( $_ );
285 # TODO do we need to delete any relationship paths here?
288 ### HACKY HACKY Do some one-off path corrections here.
289 require( 'data/boodts/s158.HACK' );
290 KUL::HACK::pre_path_hack( $collation );
292 # Now walk paths and calculate positional rank.
293 $collation->make_witness_paths();
294 KUL::HACK::post_path_hack( $collation );
295 # Have to check relationship validity at this point, because before that
297 # foreach my $rel ( $collation->relationships ) {
298 # next unless $rel->equal_rank;
299 # unless( Text::Tradition::Collation::relationship_valid( $rel->from, $rel->to ) ) {
300 # warn sprintf( "Relationship type %s between %s and %s is invalid, deleting",
301 # $rel->type, $rel->from->name, $rel->to->name );
304 $collation->calculate_ranks();
309 my @line_beginnings = read_base( 'reference.txt', $collation );
311 Takes a text file and a (presumed empty) collation object, adds the
312 words as simple linear readings to the collation, and returns a
313 list of readings that represent the beginning of lines. This collation
314 is now the starting point for application of apparatus entries in
315 merge_base, e.g. from a CSV file or a Classical Text Editor file.
320 my( $base_file, $collation ) = @_;
322 # This array gives the first reading for each line. We put the
323 # common starting point in line zero.
324 my $last_reading = $collation->start();
325 $base_text_index{$last_reading->name} = 0;
326 my $lineref_array = [ $last_reading ]; # There is no line zero.
328 open( BASE, $base_file ) or die "Could not open file $base_file: $!";
331 # Make the readings, and connect them up for the base, but
332 # also save the first reading of each line in an array for the
334 # TODO use configurable reading separator
339 my $lineref = scalar @$lineref_array;
340 last if $SHORTEND && $lineref > $SHORTEND;
341 foreach my $w ( @words ) {
342 my $readingref = join( ',', $lineref, ++$wordref );
343 my $reading = $collation->add_reading( $readingref );
344 $reading->text( $w );
346 push( @$lineref_array, $reading );
349 # Add edge paths in the graph, for easier tracking when
350 # we start applying corrections. These paths will be
351 # removed when we're done.
352 my $path = $collation->add_path( $last_reading, $reading,
353 $collation->baselabel );
354 $last_reading = $reading;
356 # Note an array index for the reading, for later correction splices.
357 $base_text_index{$readingref} = $i++;
361 # Ending point for all texts
362 $collation->add_path( $last_reading, $collation->end, $collation->baselabel );
363 push( @$lineref_array, $collation->end );
364 $base_text_index{$collation->end->name} = $i;
366 return( @$lineref_array );
369 sub set_relationships {
370 my( $collation, $app, $lemma, $variants ) = @_;
371 foreach my $rkey ( keys %$variants ) {
372 my $var = $variants->{$rkey}->{'reading'};
373 my $type = $app->{sprintf( "_%s_type", $rkey )};
374 my $noncorr = $app->{sprintf( "_%s_non_corr", $rkey )};
375 my $nonindep = $app->{sprintf( "_%s_non_indep", $rkey )};
377 my %rel_options = ();
378 $rel_options{'non_correctable'} = $noncorr if $noncorr && $noncorr =~ /^\d$/;
379 $rel_options{'non_indep'} = $nonindep if $nonindep && $nonindep =~ /^\d$/;
381 if( $type =~ /^(inv|tr|rep)$/i ) {
382 # Transposition or repetition: look for nodes with the
383 # same label but different IDs and mark them.
384 $type = 'repetition' if $type =~ /^rep/i;
385 $rel_options{'type'} = $type;
386 $rel_options{'equal_rank'} = undef;
388 foreach my $r ( @$lemma ) {
389 $labels{cmp_str( $r )} = $r;
391 foreach my $r( @$var ) {
392 if( exists $labels{$r->label} &&
393 $r->name ne $labels{$r->label}->name ) {
394 if( $type eq 'repetition' ) {
396 $collation->add_relationship( $r, $labels{$r->label}, \%rel_options );
399 $r->set_identical( $labels{$r->label} );
403 } elsif( $type =~ /^(gr|sp(el)?)$/i ) {
405 # Grammar/spelling/lexical: this can be a one-to-one or
406 # one-to-many mapping. We should think about merging
407 # readings if it is one-to-many.
409 $type = 'grammatical' if $type =~ /gr/i;
410 $type = 'spelling' if $type =~ /sp/i;
411 $type = 'repetition' if $type =~ /rep/i;
412 # $type = 'lexical' if $type =~ /lex/i;
413 $rel_options{'type'} = $type;
414 $rel_options{'equal_rank'} = 1;
415 if( @$lemma == @$var ) {
416 foreach my $i ( 0 .. $#{$lemma} ) {
417 $collation->add_relationship( $var->[$i], $lemma->[$i],
421 # An uneven many-to-many mapping. Skip for now.
422 # We really want to make a segment out of whatever we have.
423 # my $lemseg = @$lemma > 1 ? $collation->add_segment( @$lemma ) : $lemma->[0];
424 # my $varseg = @$var > 1 ? $collation->add_segment( @$var ) : $var->[0];
425 # $collation->add_relationship( $varseg, $lemseg, \%rel_options );
426 if( @$lemma == 1 && @$var == 1 ) {
427 $collation->add_relationship( $lemma->[0], $var->[0], \%rel_options );
430 } elsif( $type !~ /^(add|om|lex)$/i ) {
431 warn "Unrecognized type $type";
439 my( $collation, $edit_sequence, $debug ) = @_;
440 my @lemma_text = $collation->reading_sequence( $collation->start,
441 $collation->reading( '#END#' ) );
443 foreach my $correction ( @$edit_sequence ) {
444 my( $lemma_start, $length, $items ) = @$correction;
445 my $offset = $base_text_index{$lemma_start};
446 my $realoffset = $offset + $drift;
448 $lemma_text[$realoffset]->name ne $lemma_start ) {
449 my @this_phrase = @lemma_text[$realoffset..$realoffset+$length-1];
452 my $l = $collation->reading( $lemma_start );
453 while( $i < $realoffset+$length ) {
454 push( @base_phrase, $l );
455 $l = $collation->next_reading( $l );
459 print STDERR sprintf( "Trying to replace %s (%s) starting at %d " .
460 "with %s (%s) with drift %d\n",
461 join( ' ', map {$_->label} @base_phrase ),
462 join( ' ', map {$_->name} @base_phrase ),
464 join( ' ', map {$_->label} @$items ),
465 join( ' ', map {$_->name} @$items ),
469 if( $lemma_text[$realoffset]->name ne $lemma_start ) {
470 warn( sprintf( "Should be replacing %s (%s) with %s (%s) " .
471 "but %s (%s) is there instead",
472 join( ' ', map {$_->label} @base_phrase ),
473 join( ' ', map {$_->name} @base_phrase ),
474 join( ' ', map {$_->label} @$items ),
475 join( ' ', map {$_->name} @$items ),
476 join( ' ', map {$_->label} @this_phrase ),
477 join( ' ', map {$_->name} @this_phrase ),
482 splice( @lemma_text, $realoffset, $length, @$items );
483 $drift += @$items - $length;
489 # Helper function. Given a witness sigil, if it is a post-correctione
490 # sigil,return the base witness. If not, return a false value.
493 if( $sigil =~ /^(.*?)(\s*\(?p\.\s*c\.\)?)$/ ) {
504 This package is free software and is provided "as is" without express
505 or implied warranty. You can redistribute it and/or modify it under
506 the same terms as Perl itself.
510 Tara L Andrews, aurum@cpan.org