1 package Text::Tradition::Parser::BaseText;
10 Text::Tradition::Parser::BaseText
14 use Text::Tradition::Parser::BaseText qw( merge_base );
15 merge_base( $graph, 'reference.txt', @apparatus_entries )
19 For an overview of the package, see the documentation for the
20 Text::Tradition::Graph module.
22 This module is meant for use with certain of the other Parser classes
23 - whenever a list of variants is given with reference to a base text,
24 these must be joined into a single collation. The parser should
25 therefore make a list of variants and their locations, and BaseText
26 will join those listed variants onto the reference text.
34 parse( $graph, %opts );
36 Takes an initialized graph and a set of options, which must include:
37 - 'base' - the base text referenced by the variants
38 - 'format' - the format of the variant list
39 - 'data' - the variants, in the given format.
45 my( $tradition, %opts ) = @_;
47 my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'};
49 my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} );
50 $DETRANSPOSE = 1 if $opts{'linear'};
51 merge_base( $tradition->collation, $opts{'base'}, @apparatus_entries );
56 merge_base( $graph, 'reference.txt', @apparatus_entries )
58 Takes three arguments: a newly-initialized Text::Tradition::Graph
59 object, a text file containing the reference text, and a list of
60 variants (apparatus entries). Adds the base text to the graph, and
61 joins the variants to that.
63 The list of variants is an array of hash references; each hash takes
65 { '_id' => line reference,
66 'rdg_0' => lemma reading,
67 'rdg_1' => first variant,
68 ... # and so on until all distinct readings are listed
69 'WitnessA' => 'rdg_0',
70 'WitnessB' => 'rdg_1',
71 ... # and so on until all witnesses are listed with their readings
74 Any hash key that is not of the form /^rdg_\d+$/ and that does not
75 begin with an underscore is assumed to be a witness name. Any 'meta'
76 information to be passed must be passed in a key with a leading
77 underscore in its name.
81 my $SHORT = 25; # Debug var - set this to limit the number of lines parsed
84 my $edits_required = {};
86 # edits_required -> wit -> [ { start_idx, end_idx, items } ]
89 my( $collation, $base_file, @app_entries ) = @_;
90 my @base_line_starts = read_base( $base_file, $collation );
93 my @unwitnessed_lemma_nodes;
94 foreach my $app ( @app_entries ) {
95 my( $line, $num ) = split( /\./, $app->{_id} );
96 # DEBUG with a short graph
97 last if $SHORT && $line > $SHORT;
98 # DEBUG for problematic entries
100 my $first_line_reading = $base_line_starts[ $line ];
101 my $too_far = $base_line_starts[ $line+1 ];
103 my $lemma = $app->{rdg_0};
105 # Is this the Nth occurrence of this reading in the line?
106 if( $lemma =~ s/(_)?(\d)$// ) {
109 my @lemma_words = split( /\s+/, $lemma );
111 # Now search for the lemma words within this line.
112 my $lemma_start = $first_line_reading;
115 while( $lemma_start ne $too_far ) {
117 if( $seen{ $lemma_start->name() } ) {
118 warn "Detected loop at " . $lemma_start->name() .
122 $seen{ $lemma_start->name() } = 1;
124 # Try to match the lemma.
126 print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
127 $lemma_words[0] . "...\n"
128 if "$line.$num" eq $scrutinize;
129 if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
130 # Skip it if we need a match that is not the first.
132 # Now we have to compare the rest of the words here.
133 if( scalar( @lemma_words ) > 1 ) {
135 $collation->next_reading( $lemma_start );
136 foreach my $w ( @lemma_words[1..$#lemma_words] ) {
137 printf STDERR "Now matching %s against %s\n",
138 cmp_str($next_reading), $w
139 if "$line.$num" eq $scrutinize;
140 if( $w ne cmp_str($next_reading) ) {
144 $lemma_end = $next_reading;
146 $collation->next_reading( $lemma_end );
150 $lemma_end = $lemma_start;
156 last unless ( $unmatch || !defined( $lemma_end ) );
158 $lemma_start = $collation->next_reading( $lemma_start );
161 unless( $lemma_end ) {
162 warn "No match found for @lemma_words at $line.$num";
166 # Now we have found the lemma; we will record an 'edit', in
167 # terms of a splice operation, for each subsequent reading.
168 # We also note which witnesses take the given edit.
170 my @lemma_set = $collation->reading_sequence( $lemma_start,
172 my @reading_sets = [ @lemma_set ];
174 # For each reading that is not rdg_0, we create the variant
175 # reading nodes, and store the range as an edit operation on
178 my %pc_seen; # Keep track of mss with explicit post-corr data
179 foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
180 my @mss = grep { $app->{$_} eq $k } keys( %$app );
181 push( @unwitnessed_lemma_nodes, @lemma_set )
182 if !@mss && $k eq 'rdg_0';
184 # Keep track of what witnesses we have seen.
185 @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
186 # Keep track of which witnesses bear corrected readings here.
187 foreach my $m ( @mss ) {
188 my $base = _is_post_corr( $m );
192 next if $k eq 'rdg_0';
194 # TODO don't hardcode the reading split operation
195 my @variant = split( /\s+/, $app->{$k} );
196 @variant = () if $app->{$k} eq '/'; # This is an omission.
198 # Make the variant into a set of readings.
199 my @variant_readings;
201 foreach my $vw ( @variant ) {
202 my $vwname = "$k/$line.$num.$ctr"; $ctr++;
203 my $vwreading = $collation->add_reading( $vwname );
204 $vwreading->text( $vw );
205 push( @variant_readings, $vwreading );
208 $variant_objects->{$k} = { 'mss' => \@mss,
209 'reading' => \@variant_readings,
211 push( @reading_sets, \@variant_readings );
214 # Now collate and collapse the identical readings within the
215 # collated sets. Modifies the reading sets that were passed.
216 $DB::single = 1 if "$line.$num" eq '16.2';
217 collate_variants( $collation, @reading_sets );
219 # Now create the splice-edit objects that will be used
220 # to reconstruct each witness.
222 foreach my $rkey ( keys %$variant_objects ) {
223 # Object is argument list for splice, so:
224 # offset, length, replacements
225 my $edit_object = [ $base_text_index{$lemma_start->name},
226 scalar( @lemma_set ),
227 $variant_objects->{$rkey}->{reading} ];
228 foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
229 # Is this a p.c. entry?
230 my $base = _is_post_corr( $ms );
231 if( $base ) { # this is a post-corr witness
232 my $pc_key = $base . "_post";
233 _add_hash_entry( $edits_required, $pc_key, $edit_object );
234 } else { # this is an ante-corr witness
235 my $pc_key = $ms . "_post";
236 _add_hash_entry( $edits_required, $ms, $edit_object );
237 unless( $pc_seen{$ms} ) {
238 # If this witness carries no correction, add this
239 # same object to its post-corrected state.
240 _add_hash_entry( $edits_required, $pc_key,
246 } # Finished going through the apparatus entries
248 # Now make the witness objects, and create their text sequences
249 foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) {
250 my $witness_obj = $collation->tradition->add_witness( sigil => $w );
251 my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w} );
252 my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"} )
253 if exists( $edits_required->{$w."_post"} );
255 # Now save these paths in my witness object
256 if( @post_corr_seq ) {
257 $witness_obj->path( \@post_corr_seq );
258 my @ante_corr = make_witness_uncorrections( \@post_corr_seq,
260 $witness_obj->ante_corr( \@ante_corr );
262 $witness_obj->path( \@ante_corr_seq );
266 # Now remove our 'base text' edges, which is to say, the only
267 # ones we have created so far. Also remove any nodes that didn't
268 # appear in any witnesses.
269 foreach ( $collation->paths() ) {
270 $collation->del_path( $_ );
272 foreach( @unwitnessed_lemma_nodes ) {
273 $collation->del_reading( $_ );
276 # Now walk paths and calculate positions.
277 my @common_readings =
278 $collation->make_witness_paths();
279 $collation->calculate_positions( @common_readings );
284 my @line_beginnings = read_base( 'reference.txt', $collation );
286 Takes a text file and a (presumed empty) collation object, adds the
287 words as simple linear readings to the collation, and returns a
288 list of readings that represent the beginning of lines. This collation
289 is now the starting point for application of apparatus entries in
290 merge_base, e.g. from a CSV file or a Classical Text Editor file.
295 my( $base_file, $collation ) = @_;
297 # This array gives the first reading for each line. We put the
298 # common starting point in line zero.
299 my $last_reading = $collation->start();
300 $base_text_index{$last_reading->name} = 0;
301 my $lineref_array = [ $last_reading ]; # There is no line zero.
303 open( BASE, $base_file ) or die "Could not open file $base_file: $!";
306 # Make the readings, and connect them up for the base, but
307 # also save the first reading of each line in an array for the
309 # TODO use configurable reading separator
314 my $lineref = scalar @$lineref_array;
315 last if $SHORT && $lineref > $SHORT;
316 foreach my $w ( @words ) {
317 my $readingref = join( ',', $lineref, ++$wordref );
318 my $reading = $collation->add_reading( $readingref );
319 $reading->text( $w );
321 push( @$lineref_array, $reading );
324 # Add edge paths in the graph, for easier tracking when
325 # we start applying corrections. These paths will be
326 # removed when we're done.
327 my $path = $collation->add_path( $last_reading, $reading,
328 $collation->baselabel );
329 $last_reading = $reading;
331 # Note an array index for the reading, for later correction splices.
332 $base_text_index{$readingref} = $i++;
336 # Ending point for all texts
337 my $endpoint = $collation->add_reading( '#END#' );
338 $collation->add_path( $last_reading, $endpoint, $collation->baselabel );
339 push( @$lineref_array, $endpoint );
340 $base_text_index{$endpoint->name} = $i;
342 return( @$lineref_array );
345 =item B<collate_variants>
347 collate_variants( $collation, @reading_ranges )
349 Given a set of readings in the form
350 ( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
351 walks through each to identify those readings that are identical. The
352 collation is a Text::Tradition::Collation object; the elements of
353 @readings are Text::Tradition::Collation::Reading objects that appear
354 on the collation graph.
356 TODO: Handle collapsed and non-collapsed transpositions.
360 sub collate_variants {
361 my( $collation, @reading_sets ) = @_;
363 # Merge the nodes across the sets so that there is only one node
364 # for any given reading. Use diff to identify the 'same' nodes.
366 my $lemma_set = shift @reading_sets;
369 push( @unique, @$lemma_set );
371 while( @reading_sets ) {
372 my $variant_set = shift @reading_sets;
374 # Use diff to do this job
375 my $diff = Algorithm::Diff->new( \@unique, $variant_set,
376 {'keyGen' => \&_collation_hash} );
378 while( $diff->Next ) {
381 my @l = $diff->Items( 1 );
382 my @v = $diff->Items( 2 );
383 foreach my $i ( 0 .. $#l ) {
384 $collation->merge_readings( $l[$i], $v[$i] );
386 # splice the lemma nodes into the variant set
387 my( $offset ) = $diff->Get( 'min2' );
388 splice( @$variant_set, $offset, scalar( @l ), @l );
389 push( @new_unique, @l );
391 # Keep the old unique readings
392 push( @new_unique, $diff->Items( 1 ) ) if $diff->Items( 1 );
393 # Add the new readings to the 'unique' list
394 push( @new_unique, $diff->Items( 2 ) ) if $diff->Items( 2 );
397 @unique = @new_unique;
399 # It becomes a much simpler job
402 foreach my $idx ( 0 .. $#{$variant_set} ) {
403 my $vw = $variant_set->[$idx];
404 my @same = grep { cmp_str( $_ ) eq $vw->label } @unique;
406 $collation->merge_readings( $same[0], $vw );
407 $variant_set->[$idx] = $same[0];
409 push( @distinct, $vw );
412 push( @unique, @distinct );
420 sub _collation_hash {
422 return cmp_str( $node );
426 my( $collation, $edit_sequence ) = @_;
427 my @lemma_names = sort { $base_text_index{$a} <=> $base_text_index{$b} }
428 keys %base_text_index;
429 my @lemma_text = map { $collation->reading( $_ ) } @lemma_names;
432 foreach my $correction ( @$edit_sequence ) {
433 my( $offset, $length, $items ) = @$correction;
434 my $realoffset = $offset + $drift;
435 splice( @lemma_text, $realoffset, $length, @$items );
436 $drift += @$items - $length;
441 sub make_witness_uncorrections {
442 my( $path, $uncorr_path ) = @_;
443 my $diff = Algorithm::Diff->new( $path, $uncorr_path,
444 { 'keyGen' => \&_collation_hash } );
445 # We basically just want to make a bunch of splice arguments that
446 # will reconstruct the ante-corr text from the post-corr.
448 while( $diff->Next ) {
450 my( $offset ) = $diff->Get( 'min1' );
451 my $length = scalar( $diff->Items( 1 ) );
452 my $items = []; push( @$items, $diff->Items( 2 ) );
453 push( @diff_list, [ $offset, $length, $items ] );
459 # Helper function. Given a witness sigil, if it is a post-correctione
460 # sigil,return the base witness. If not, return a false value.
463 if( $sigil =~ /^(.*?)(\s*\(?p\.\s*c\.\)?)$/ ) {
469 sub _add_hash_entry {
470 my( $hash, $key, $entry ) = @_;
471 if( exists $hash->{$key} ) {
472 push( @{$hash->{$key}}, $entry );
474 $hash->{$key} = [ $entry ];
481 Pretend you never saw this method. Really it needs to not be hardcoded.
487 my $word = $reading->label();
493 $word =~ s/quatuor/quattuor/g;
494 $word =~ s/ioannes/iohannes/g;
502 This package is free software and is provided "as is" without express
503 or implied warranty. You can redistribute it and/or modify it under
504 the same terms as Perl itself.
508 Tara L Andrews, aurum@cpan.org