1 package Text::Tradition::Parser::BaseText;
10 Text::Tradition::Parser::BaseText
14 use Text::Tradition::Parser::BaseText qw( merge_base );
15 merge_base( $graph, 'reference.txt', @apparatus_entries )
19 For an overview of the package, see the documentation for the
20 Text::Tradition::Graph module.
22 This module is meant for use with certain of the other Parser classes
23 - whenever a list of variants is given with reference to a base text,
24 these must be joined into a single collation. The parser should
25 therefore make a list of variants and their locations, and BaseText
26 will join those listed variants onto the reference text.
34 parse( $graph, %opts );
36 Takes an initialized graph and a set of options, which must include:
37 - 'base' - the base text referenced by the variants
38 - 'format' - the format of the variant list
39 - 'data' - the variants, in the given format.
44 my( $tradition, %opts ) = @_;
46 my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'};
48 my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} );
49 merge_base( $tradition->collation, $opts{'base'}, @apparatus_entries );
54 merge_base( $graph, 'reference.txt', @apparatus_entries )
56 Takes three arguments: a newly-initialized Text::Tradition::Graph
57 object, a text file containing the reference text, and a list of
58 variants (apparatus entries). Adds the base text to the graph, and
59 joins the variants to that.
61 The list of variants is an array of hash references; each hash takes
63 { '_id' => line reference,
64 'rdg_0' => lemma reading,
65 'rdg_1' => first variant,
66 ... # and so on until all distinct readings are listed
67 'WitnessA' => 'rdg_0',
68 'WitnessB' => 'rdg_1',
69 ... # and so on until all witnesses are listed with their readings
72 Any hash key that is not of the form /^rdg_\d+$/ and that does not
73 begin with an underscore is assumed to be a witness name. Any 'meta'
74 information to be passed must be passed in a key with a leading
75 underscore in its name.
79 my $SHORTEND; # Debug var - set this to limit the number of lines parsed
82 my $edits_required = {};
84 # edits_required -> wit -> [ { start_idx, end_idx, items } ]
87 my( $collation, $base_file, @app_entries ) = @_;
88 my @base_line_starts = read_base( $base_file, $collation );
91 my @unwitnessed_lemma_nodes;
92 foreach my $app ( @app_entries ) {
93 my( $line, $num ) = split( /\./, $app->{_id} );
94 # DEBUG with a short graph
95 last if $SHORTEND && $line > $SHORTEND;
96 # DEBUG for problematic entries
98 my $first_line_reading = $base_line_starts[ $line ];
99 my $too_far = $base_line_starts[ $line+1 ];
101 my $lemma = $app->{rdg_0};
103 # Is this the Nth occurrence of this reading in the line?
104 if( $lemma =~ s/(_)?(\d)$// ) {
107 my @lemma_words = split( /\s+/, $lemma );
109 # Now search for the lemma words within this line.
110 my $lemma_start = $first_line_reading;
113 while( $lemma_start ne $too_far ) {
115 if( $seen{ $lemma_start->name() } ) {
116 warn "Detected loop at " . $lemma_start->name() .
120 $seen{ $lemma_start->name() } = 1;
122 # Try to match the lemma.
124 print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
125 $lemma_words[0] . "...\n"
126 if "$line.$num" eq $scrutinize;
127 if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
128 # Skip it if we need a match that is not the first.
130 # Now we have to compare the rest of the words here.
131 if( scalar( @lemma_words ) > 1 ) {
133 $collation->next_reading( $lemma_start );
134 foreach my $w ( @lemma_words[1..$#lemma_words] ) {
135 printf STDERR "Now matching %s against %s\n",
136 cmp_str($next_reading), $w
137 if "$line.$num" eq $scrutinize;
138 if( $w ne cmp_str($next_reading) ) {
142 $lemma_end = $next_reading;
144 $collation->next_reading( $lemma_end );
148 $lemma_end = $lemma_start;
154 last unless ( $unmatch || !defined( $lemma_end ) );
156 $lemma_start = $collation->next_reading( $lemma_start );
159 unless( $lemma_end ) {
160 warn "No match found for @lemma_words at $line.$num";
164 # Now we have found the lemma; we will record an 'edit', in
165 # terms of a splice operation, for each subsequent reading.
166 # We also note which witnesses take the given edit.
168 my @lemma_set = $collation->reading_sequence( $lemma_start,
170 my @reading_sets = [ @lemma_set ];
172 # For each reading that is not rdg_0, we create the variant
173 # reading nodes, and store the range as an edit operation on
176 my %pc_seen; # Keep track of mss with explicit post-corr data
177 foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
178 my @mss = grep { $app->{$_} eq $k } keys( %$app );
179 push( @unwitnessed_lemma_nodes, @lemma_set )
180 if !@mss && $k eq 'rdg_0';
182 # Keep track of what witnesses we have seen.
183 @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
184 # Keep track of which witnesses bear corrected readings here.
185 foreach my $m ( @mss ) {
186 my $base = _is_post_corr( $m );
190 next if $k eq 'rdg_0';
192 # TODO don't hardcode the reading split operation
193 my @variant = split( /\s+/, $app->{$k} );
194 @variant = () if $app->{$k} eq '/'; # This is an omission.
196 # Make the variant into a set of readings.
197 my @variant_readings;
199 foreach my $vw ( @variant ) {
200 my $vwname = "$k/$line.$num.$ctr"; $ctr++;
201 my $vwreading = $collation->add_reading( $vwname );
202 $vwreading->text( $vw );
203 push( @variant_readings, $vwreading );
206 $variant_objects->{$k} = { 'mss' => \@mss,
207 'reading' => \@variant_readings,
209 push( @reading_sets, \@variant_readings );
212 # Now collate and collapse the identical readings within the
213 # collated sets. Modifies the reading sets that were passed.
214 collate_variants( $collation, @reading_sets );
216 # TODO Here would be a very good place to set up relationships
217 # between the nodes and the lemma.
218 set_relationships( $app, \@lemma_set, $variant_objects );
220 # Now create the splice-edit objects that will be used
221 # to reconstruct each witness.
223 foreach my $rkey ( keys %$variant_objects ) {
224 # Object is argument list for splice, so:
225 # offset, length, replacements
226 my $edit_object = [ $base_text_index{$lemma_start->name},
227 scalar( @lemma_set ),
228 $variant_objects->{$rkey}->{reading} ];
229 foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
230 # Is this a p.c. entry?
231 my $base = _is_post_corr( $ms );
232 if( $base ) { # this is a post-corr witness
233 my $pc_key = $base . "_post";
234 _add_hash_entry( $edits_required, $pc_key, $edit_object );
235 } else { # this is an ante-corr witness
236 my $pc_key = $ms . "_post";
237 _add_hash_entry( $edits_required, $ms, $edit_object );
238 unless( $pc_seen{$ms} ) {
239 # If this witness carries no correction, add this
240 # same object to its post-corrected state.
241 _add_hash_entry( $edits_required, $pc_key,
247 } # Finished going through the apparatus entries
249 # Now make the witness objects, and create their text sequences
250 foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) {
251 print STDERR "Creating witness $w\n";
252 my $witness_obj = $collation->tradition->add_witness( sigil => $w );
253 my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w} );
254 my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"} )
255 if exists( $edits_required->{$w."_post"} );
257 my @repeated = _check_for_repeated( @ante_corr_seq );
258 warn "Repeated elements @repeated in $w a.c."
260 @repeated = _check_for_repeated( @post_corr_seq );
261 warn "Repeated elements @repeated in $w p.c."
264 # Now save these paths in my witness object
265 if( @post_corr_seq ) {
266 $witness_obj->path( \@post_corr_seq );
267 $witness_obj->uncorrected_path( \@ante_corr_seq );
269 $witness_obj->path( \@ante_corr_seq );
273 # Now remove our 'base text' edges, which is to say, the only
274 # ones we have created so far. Also remove any nodes that didn't
275 # appear in any witnesses.
276 foreach ( $collation->paths() ) {
277 $collation->del_path( $_ );
279 foreach( @unwitnessed_lemma_nodes ) {
280 $collation->del_reading( $_ );
283 # Now walk paths and calculate positions.
284 my @common_readings =
285 $collation->make_witness_paths();
286 $collation->calculate_positions( @common_readings );
289 sub _check_for_repeated {
294 if( exists $unique{$_->name} ) {
295 push( @repeated, $_->name );
297 $unique{$_->name} = 1;
305 my @line_beginnings = read_base( 'reference.txt', $collation );
307 Takes a text file and a (presumed empty) collation object, adds the
308 words as simple linear readings to the collation, and returns a
309 list of readings that represent the beginning of lines. This collation
310 is now the starting point for application of apparatus entries in
311 merge_base, e.g. from a CSV file or a Classical Text Editor file.
316 my( $base_file, $collation ) = @_;
318 # This array gives the first reading for each line. We put the
319 # common starting point in line zero.
320 my $last_reading = $collation->start();
321 $base_text_index{$last_reading->name} = 0;
322 my $lineref_array = [ $last_reading ]; # There is no line zero.
324 open( BASE, $base_file ) or die "Could not open file $base_file: $!";
327 # Make the readings, and connect them up for the base, but
328 # also save the first reading of each line in an array for the
330 # TODO use configurable reading separator
335 my $lineref = scalar @$lineref_array;
336 last if $SHORTEND && $lineref > $SHORTEND;
337 foreach my $w ( @words ) {
338 my $readingref = join( ',', $lineref, ++$wordref );
339 my $reading = $collation->add_reading( $readingref );
340 $reading->text( $w );
342 push( @$lineref_array, $reading );
345 # Add edge paths in the graph, for easier tracking when
346 # we start applying corrections. These paths will be
347 # removed when we're done.
348 my $path = $collation->add_path( $last_reading, $reading,
349 $collation->baselabel );
350 $last_reading = $reading;
352 # Note an array index for the reading, for later correction splices.
353 $base_text_index{$readingref} = $i++;
357 # Ending point for all texts
358 my $endpoint = $collation->add_reading( '#END#' );
359 $collation->add_path( $last_reading, $endpoint, $collation->baselabel );
360 push( @$lineref_array, $endpoint );
361 $base_text_index{$endpoint->name} = $i;
363 return( @$lineref_array );
366 =item B<collate_variants>
368 collate_variants( $collation, @reading_ranges )
370 Given a set of readings in the form
371 ( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
372 walks through each to identify those readings that are identical. The
373 collation is a Text::Tradition::Collation object; the elements of
374 @readings are Text::Tradition::Collation::Reading objects that appear
375 on the collation graph.
377 TODO: Handle collapsed and non-collapsed transpositions.
381 sub collate_variants {
382 my( $collation, @reading_sets ) = @_;
384 # Merge the nodes across the sets so that there is only one node
385 # for any given reading. Use diff to identify the 'same' nodes.
387 my $lemma_set = shift @reading_sets;
390 push( @unique, @$lemma_set );
392 while( @reading_sets ) {
393 my $variant_set = shift @reading_sets;
394 if( $collation->linear ) {
395 # Use diff to do this job
396 my $diff = Algorithm::Diff->new( \@unique, $variant_set,
397 {'keyGen' => \&_collation_hash} );
400 while( $diff->Next ) {
403 my @l = $diff->Items( 1 );
404 my @v = $diff->Items( 2 );
405 foreach my $i ( 0 .. $#l ) {
406 if( !$merged{$l[$i]->name} ) {
407 $collation->merge_readings( $l[$i], $v[$i] );
408 $merged{$l[$i]->name} = 1;
410 print STDERR "Would have double merged " . $l[$i]->name . "\n";
413 # splice the lemma nodes into the variant set
414 my( $offset ) = $diff->Get( 'min2' );
415 splice( @$variant_set, $offset, scalar( @l ), @l );
416 push( @new_unique, @l );
418 # Keep the old unique readings
419 push( @new_unique, $diff->Items( 1 ) ) if $diff->Items( 1 );
420 # Add the new readings to the 'unique' list
421 push( @new_unique, $diff->Items( 2 ) ) if $diff->Items( 2 );
424 @unique = @new_unique;
426 # It becomes a much simpler job
430 foreach my $idx ( 0 .. $#{$variant_set} ) {
431 my $vw = $variant_set->[$idx];
432 my @same = grep { cmp_str( $_ ) eq $vw->label } @unique;
435 foreach my $i ( 0 .. $#same ) {
436 unless( $merged{$same[$i]->name} ) {
437 print STDERR sprintf( "Merging %s into %s\n",
440 $collation->merge_readings( $same[$i], $vw );
441 $merged{$same[$i]->name} = 1;
443 $variant_set->[$idx] = $same[$i];
447 unless( @same && defined($matched) ) {
448 push( @distinct, $vw );
451 push( @unique, @distinct );
459 sub _collation_hash {
461 return cmp_str( $node );
464 sub set_relationships {
465 my( $app, $lemma, $variants ) = @_;
466 foreach my $rkey ( keys %$variants ) {
467 my $var = $variants->{$rkey}->{'reading'};
468 my $typekey = sprintf( "_%s_type", $rkey );
469 my $type = $app->{$typekey};
471 # Transposition: look for nodes with the same label but different IDs
472 # and mark them as transposed-identical.
474 # Lexical / Grammatical / Spelling: look for non-identical nodes.
475 # Need to work out how to handle many-to-many mapping.
482 my( $collation, $edit_sequence ) = @_;
483 my @lemma_names = sort { $base_text_index{$a} <=> $base_text_index{$b} }
484 keys %base_text_index;
485 my @lemma_text = map { $collation->reading( $_ ) } @lemma_names;
488 foreach my $correction ( @$edit_sequence ) {
489 my( $offset, $length, $items ) = @$correction;
490 my $realoffset = $offset + $drift;
491 splice( @lemma_text, $realoffset, $length, @$items );
492 $drift += @$items - $length;
498 # Helper function. Given a witness sigil, if it is a post-correctione
499 # sigil,return the base witness. If not, return a false value.
502 if( $sigil =~ /^(.*?)(\s*\(?p\.\s*c\.\)?)$/ ) {
508 sub _add_hash_entry {
509 my( $hash, $key, $entry ) = @_;
510 if( exists $hash->{$key} ) {
511 push( @{$hash->{$key}}, $entry );
513 $hash->{$key} = [ $entry ];
520 Pretend you never saw this method. Really it needs to not be hardcoded.
526 my $word = $reading->label();
532 $word =~ s/quatuor/quattuor/g;
533 $word =~ s/ioannes/iohannes/g;
541 This package is free software and is provided "as is" without express
542 or implied warranty. You can redistribute it and/or modify it under
543 the same terms as Perl itself.
547 Tara L Andrews, aurum@cpan.org