1 package Text::Tradition::Parser::BaseText;
10 Text::Tradition::Parser::BaseText
14 use Text::Tradition::Parser::BaseText qw( merge_base );
15 merge_base( $graph, 'reference.txt', @apparatus_entries )
19 For an overview of the package, see the documentation for the
20 Text::Tradition::Graph module.
22 This module is meant for use with certain of the other Parser classes
23 - whenever a list of variants is given with reference to a base text,
24 these must be joined into a single collation. The parser should
25 therefore make a list of variants and their locations, and BaseText
26 will join those listed variants onto the reference text.
34 parse( $graph, %opts );
36 Takes an initialized graph and a set of options, which must include:
37 - 'base' - the base text referenced by the variants
38 - 'format' - the format of the variant list
39 - 'data' - the variants, in the given format.
44 my( $tradition, %opts ) = @_;
46 my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'};
48 my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} );
49 merge_base( $tradition->collation, $opts{'base'}, @apparatus_entries );
54 merge_base( $graph, 'reference.txt', @apparatus_entries )
56 Takes three arguments: a newly-initialized Text::Tradition::Graph
57 object, a text file containing the reference text, and a list of
58 variants (apparatus entries). Adds the base text to the graph, and
59 joins the variants to that.
61 The list of variants is an array of hash references; each hash takes
63 { '_id' => line reference,
64 'rdg_0' => lemma reading,
65 'rdg_1' => first variant,
66 ... # and so on until all distinct readings are listed
67 'WitnessA' => 'rdg_0',
68 'WitnessB' => 'rdg_1',
69 ... # and so on until all witnesses are listed with their readings
72 Any hash key that is not of the form /^rdg_\d+$/ and that does not
73 begin with an underscore is assumed to be a witness name. Any 'meta'
74 information to be passed must be passed in a key with a leading
75 underscore in its name.
79 my $SHORT = undef; # Debug var - set this to limit the number of lines parsed
84 # edits_required -> wit -> [ { start_idx, end_idx, items } ]
87 my( $collation, $base_file, @app_entries ) = @_;
88 my @base_line_starts = read_base( $base_file, $collation );
91 foreach my $app ( @app_entries ) {
92 my( $line, $num ) = split( /\./, $app->{_id} );
93 # DEBUG with a short graph
94 last if $SHORT && $line > $SHORT;
95 # DEBUG for problematic entries
97 my $first_line_reading = $base_line_starts[ $line ];
98 my $too_far = $base_line_starts[ $line+1 ];
100 my $lemma = $app->{rdg_0};
102 # Is this the Nth occurrence of this reading in the line?
103 if( $lemma =~ s/(_)?(\d)$// ) {
106 my @lemma_words = split( /\s+/, $lemma );
108 # Now search for the lemma words within this line.
109 my $lemma_start = $first_line_reading;
112 while( $lemma_start ne $too_far ) {
114 if( $seen{ $lemma_start->name() } ) {
115 warn "Detected loop at " . $lemma_start->name() .
119 $seen{ $lemma_start->name() } = 1;
121 # Try to match the lemma.
123 print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
124 $lemma_words[0] . "...\n"
125 if "$line.$num" eq $scrutinize;
126 if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
127 # Skip it if we need a match that is not the first.
129 # Now we have to compare the rest of the words here.
130 if( scalar( @lemma_words ) > 1 ) {
132 $collation->next_reading( $lemma_start );
133 foreach my $w ( @lemma_words[1..$#lemma_words] ) {
134 printf STDERR "Now matching %s against %s\n",
135 cmp_str($next_reading), $w
136 if "$line.$num" eq $scrutinize;
137 if( $w ne cmp_str($next_reading) ) {
141 $lemma_end = $next_reading;
143 $collation->next_reading( $lemma_end );
147 $lemma_end = $lemma_start;
153 last unless ( $unmatch || !defined( $lemma_end ) );
155 $lemma_start = $collation->next_reading( $lemma_start );
158 unless( $lemma_end ) {
159 warn "No match found for @lemma_words at $line.$num";
163 # Now we have found the lemma; we will record an 'edit', in
164 # terms of a splice operation, for each subsequent reading.
165 # We also note which witnesses take the given edit.
167 my @lemma_set = $collation->reading_sequence( $lemma_start, $lemma_end );
168 my @reading_sets = [ @lemma_set ];
170 # For each reading that is not rdg_0, we create the variant
171 # reading nodes, and store the range as an edit operation on
174 my %pc_lemma; # Keep track of mss that have been corrected back to lemma
175 my %pc_variant; # Keep track of mss with other corrections
176 foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
177 my @mss = grep { $app->{$_} eq $k } keys( %$app );
178 # Keep track of what witnesses we have seen.
179 @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
180 my $pc_hash = $k eq 'rdg_0' ? \%pc_lemma : \%pc_variant;
182 # Keep track of which witnesses bear corrected readings here.
183 foreach my $m ( @mss ) {
184 my $base = _is_post_corr( $m );
186 $pc_hash->{$base} = 1;
188 next if $k eq 'rdg_0';
190 # TODO don't hardcode the reading split operation
191 my @variant = split( /\s+/, $app->{$k} );
192 @variant = () if $app->{$k} eq '/'; # This is an omission.
194 # Make the variant into a set of readings.
195 my @variant_readings;
197 foreach my $vw ( @variant ) {
198 my $vwname = "$k/$line.$num.$ctr"; $ctr++;
199 my $vwreading = $collation->add_reading( $vwname );
200 $vwreading->text( $vw );
201 push( @variant_readings, $vwreading );
204 $variant_objects->{$k} = { 'mss' => \@mss,
205 'reading' => \@variant_readings,
207 push( @reading_sets, \@variant_readings );
210 # Now collate and collapse the identical readings within the
211 # collated sets. Modifies the reading sets that were passed.
212 collate_variants( $collation, @reading_sets );
214 # Now create the splice-edit objects that will be used
215 # to reconstruct each witness.
217 foreach my $rkey ( keys %$variant_objects ) {
218 # Object is argument list for splice, so:
219 # offset, length, replacements
220 my $edit_object = [ $base_text_index{$lemma_start->name},
221 scalar( @lemma_set ),
222 $variant_objects->{$rkey}->{reading} ];
223 foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
224 # Is this a p.c. entry?
225 my $base = _is_post_corr( $ms );
226 if( $base ) { # this is a post-corr witness
227 my $pc_key = $base . "_post";
228 _add_hash_entry( $edits_required, $pc_key, $edit_object );
229 } else { # this is an ante-corr witness
230 my $pc_key = $ms . "_post";
231 _add_hash_entry( $edits_required, $_, $edit_object );
232 unless( !$pc_lemma{$ms} && !$pc_variant{$ms} ) {
233 # If this witness carries no correction, add this same object
234 # to its post-corrected state.
235 # TODO combine these hashes?
236 _add_hash_entry( $edits_required, $pc_key, $edit_object );
241 } # Finished going through the apparatus entries
243 # Now make the witness objects, and create their text sequences
244 foreach my $w ( grep { $_ !~ /_base$/ } keys %$edits_required ) {
245 my $witness_obj = $collation->tradition->add_witness( sigil => $w );
246 my @ante_corr_seq = apply_edits( $edits_required->{$w} );
247 my @post_corr_seq = apply_edits( $edits_required->{$w."_post"} )
248 if exists( $edits_required->{$w."_post"} );
250 # Now how to save these paths in my witness object?
251 if( @post_corr_seq ) {
252 $witness_obj->add_path( @post_corr_seq );
253 $witness_obj->add_uncorrected_path( @ante_corr_seq );
255 $witness_obj->add_path( @ante_corr_seq );
259 # TODO Now remove all the 'base text' links.
261 # Now walk paths and calculate positions.
262 my @common_readings =
263 $collation->walk_and_expand_base( $collation->reading( '#END#' ) );
264 $collation->calculate_positions( @common_readings );
269 my @line_beginnings = read_base( 'reference.txt', $collation );
271 Takes a text file and a (presumed empty) collation object, adds the
272 words as simple linear readings to the collation, and returns a
273 list of readings that represent the beginning of lines. This collation
274 is now the starting point for application of apparatus entries in
275 merge_base, e.g. from a CSV file or a Classical Text Editor file.
280 my( $base_file, $collation ) = @_;
282 # This array gives the first reading for each line. We put the
283 # common starting point in line zero.
284 my $last_reading = $collation->start();
285 my $lineref_array = [ $last_reading ]; # There is no line zero.
287 open( BASE, $base_file ) or die "Could not open file $base_file: $!";
290 # Make the readings, and connect them up for the base, but
291 # also save the first reading of each line in an array for the
293 # TODO use configurable reading separator
298 my $lineref = scalar @$lineref_array;
299 last if $SHORT && $lineref > $SHORT;
300 foreach my $w ( @words ) {
301 my $readingref = join( ',', $lineref, ++$wordref );
302 my $reading = $collation->add_reading( $readingref );
303 $reading->text( $w );
305 push( @$lineref_array, $reading );
308 # Add edge paths in the graph, for easier tracking when
309 # we start applying corrections. These paths will be
310 # removed when we're done.
311 my $path = $collation->add_path( $last_reading, $reading,
312 $collation->baselabel );
313 $last_reading = $reading;
315 # Note an array index for the reading, for later correction splices.
316 $base_text_index{$readingref} = $i++;
320 # Ending point for all texts
321 my $endpoint = $collation->add_reading( '#END#' );
322 $collation->add_path( $last_reading, $endpoint, $collation->baselabel );
323 push( @$lineref_array, $endpoint );
325 return( @$lineref_array );
328 =item B<collate_variants>
330 collate_variants( $collation, @reading_ranges )
332 Given a set of readings in the form
333 ( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
334 walks through each to identify those readings that are identical. The
335 collation is a Text::Tradition::Collation object; the elements of
336 @readings are Text::Tradition::Collation::Reading objects that appear
337 on the collation graph.
339 TODO: Handle collapsed and non-collapsed transpositions.
343 sub collate_variants {
344 my( $collation, @reading_sets ) = @_;
345 # my $detranspose = 1; # TODO handle merging transposed nodes
347 # Merge the nodes across the sets so that there is only one node
348 # for any given reading. Use diff to identify the 'same' nodes.
350 my $lemma_set = shift @reading_sets;
353 push( @unique, @$lemma_set );
355 while( @reading_sets ) {
356 my $variant_set = shift @reading_sets;
357 my $diff = Algorithm::Diff->new( \@unique, $variant_set, \&_collation_hash );
359 push( @new_unique, @unique );
360 while( $diff->Next ) {
363 my @l = $diff->Items( 1 );
364 my @v = $diff->Items( 2 );
365 foreach my $i ( 0 .. $#l ) {
366 $collation->merge_readings( $l[$i], $v[$i] );
368 # splice the lemma nodes into the variant set
369 splice( @$variant_set, $diff->Get( 'min2' ), scalar( @l ), @l );
370 push( @new_unique, @l );
372 # Keep the old unique readings
373 push( @new_unique, $diff->Items( 1 ) ) if $diff->Items( 1 );
374 # Add the new readings to the 'unique' list
375 push( @new_unique, $diff->Items( 2 ) ) if $diff->Items( 2 );
378 @unique = @new_unique;
385 sub _collation_hash {
387 return _cmp_str( $node->label );
391 my $edit_sequence = shift;
392 my @lemma_text = map { $base_text_index{$_} } sort( keys %base_text_index );
395 foreach my $correction ( @$edit_sequence ) {
396 my( $offset, $length, $items ) = @$correction;
397 my $realoffset = $offset + $drift;
398 splice( @lemma_text, $realoffset, $length, @$items );
399 $drift += @$items - $length;
405 # Helper function. Given a witness sigil, if it is a post-correctione
406 # sigil,return the base witness. If not, return a false value.
409 if( $sigil =~ /^(.*?)(\s*\(?p\.\s*c\.\)?)$/ ) {
417 Pretend you never saw this method. Really it needs to not be hardcoded.
423 my $word = $reading->label();
429 $word =~ s/quatuor/quattuor/g;
430 $word =~ s/ioannes/iohannes/g;
438 This package is free software and is provided "as is" without express
439 or implied warranty. You can redistribute it and/or modify it under
440 the same terms as Perl itself.
444 Tara L Andrews, aurum@cpan.org