1 package Text::Tradition::Parser::BaseText;
9 Text::Tradition::Parser::BaseText
13 use Text::Tradition::Parser::BaseText qw( merge_base );
14 merge_base( $graph, 'reference.txt', @apparatus_entries )
18 For an overview of the package, see the documentation for the
19 Text::Tradition::Graph module.
21 This module is meant for use with certain of the other Parser classes
22 - whenever a list of variants is given with reference to a base text,
23 these must be joined into a single collation. The parser should
24 therefore make a list of variants and their locations, and BaseText
25 will join those listed variants onto the reference text.
33 parse( $graph, %opts );
35 Takes an initialized graph and a set of options, which must include:
36 - 'base' - the base text referenced by the variants
37 - 'format' - the format of the variant list
38 - 'data' - the variants, in the given format.
43 my( $tradition, %opts ) = @_;
45 my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'};
47 my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} );
48 merge_base( $tradition->collation, $opts{'base'}, @apparatus_entries );
53 merge_base( $graph, 'reference.txt', @apparatus_entries )
55 Takes three arguments: a newly-initialized Text::Tradition::Graph
56 object, a text file containing the reference text, and a list of
57 variants (apparatus entries). Adds the base text to the graph, and
58 joins the variants to that.
60 The list of variants is an array of hash references; each hash takes
62 { '_id' => line reference,
63 'rdg_0' => lemma reading,
64 'rdg_1' => first variant,
65 ... # and so on until all distinct readings are listed
66 'WitnessA' => 'rdg_0',
67 'WitnessB' => 'rdg_1',
68 ... # and so on until all witnesses are listed with their readings
71 Any hash key that is not of the form /^rdg_\d+$/ and that does not
72 begin with an underscore is assumed to be a witness name. Any 'meta'
73 information to be passed must be passed in a key with a leading
74 underscore in its name.
79 my( $collation, $base_file, @app_entries ) = @_;
80 my @base_line_starts = read_base( $base_file, $collation );
83 foreach my $app ( @app_entries ) {
84 my( $line, $num ) = split( /\./, $app->{_id} );
85 # DEBUG with a short graph
87 # DEBUG for problematic entries
88 my $scrutinize = "7.3";
89 my $first_line_reading = $base_line_starts[ $line ];
90 my $too_far = $base_line_starts[ $line+1 ];
92 my $lemma = $app->{rdg_0};
94 # Is this the Nth occurrence of this reading in the line?
95 if( $lemma =~ s/(_)?(\d)$// ) {
98 my @lemma_words = split( /\s+/, $lemma );
100 # Now search for the lemma words within this line.
101 my $lemma_start = $first_line_reading;
104 while( $lemma_start ne $too_far ) {
106 if( $seen{ $lemma_start->name() } ) {
107 warn "Detected loop at " . $lemma_start->name() .
111 $seen{ $lemma_start->name() } = 1;
113 # Try to match the lemma.
115 print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
116 $lemma_words[0] . "...\n"
117 if "$line.$num" eq $scrutinize;
118 if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
119 # Skip it if we need a match that is not the first.
121 # Now we have to compare the rest of the words here.
122 if( scalar( @lemma_words ) > 1 ) {
124 $collation->next_reading( $lemma_start );
125 foreach my $w ( @lemma_words[1..$#lemma_words] ) {
126 printf STDERR "Now matching %s against %s\n",
127 cmp_str($next_reading), $w
128 if "$line.$num" eq $scrutinize;
129 if( $w ne cmp_str($next_reading) ) {
133 $lemma_end = $next_reading;
135 $collation->next_reading( $lemma_end );
139 $lemma_end = $lemma_start;
145 last unless ( $unmatch || !defined( $lemma_end ) );
147 $lemma_start = $collation->next_reading( $lemma_start );
150 unless( $lemma_end ) {
151 warn "No match found for @lemma_words at $line.$num";
154 # These are no longer common readings; unmark them as such.
155 my @lemma_readings = $collation->reading_sequence( $lemma_start,
157 map { $_->set_attribute( 'class', 'lemma' ) } @lemma_readings;
160 # Now we have our lemma readings; we add the variant readings
163 # Keep track of the start and end point of each reading for later
165 my @readings = ( $lemma_start, $lemma_end );
167 # For each reading that is not rdg_0, we make a chain of readings
168 # and connect them to the anchor. Edges are named after the mss
170 foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
171 if( $k eq 'rdg_0' ) { # that's the lemma
172 # The lemma is already in the graph, but we need to look for
173 # any explicit post-correctione readings and add the
175 my @mss = grep { $app->{$_} eq $k } keys( %$app );
176 foreach my $m ( @mss ) {
177 my $base = _is_post_corr( $m );
179 my @lem = $collation->reading_sequence( $lemma_start, $lemma_end );
180 foreach my $i ( 0 .. $#lem-1 ) {
181 $collation->add_path( $lem[$i], $lem[$i++], $m );
185 my @variant = split( /\s+/, $app->{$k} );
186 @variant = () if $app->{$k} eq '/'; # This is an omission.
187 my @mss = grep { $app->{$_} eq $k } keys( %$app );
190 print STDERR "Skipping '@variant' at $line.$num: no mss\n";
194 # Keep track of what witnesses we have seen.
195 @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
197 # Make the variant into a set of readings.
199 my $last_reading = $collation->prior_reading( $lemma_start );
201 foreach my $vw ( @variant ) {
202 my $vwname = "$k/$line.$num.$ctr"; $ctr++;
203 my $vwreading = $collation->add_reading( $vwname );
204 $vwreading->text( $vw );
205 $vwreading->make_variant();
207 $collation->add_path( $last_reading, $vwreading, $_ );
209 $var_start = $vwreading unless $var_start;
210 $last_reading = $vwreading;
212 # Now hook it up at the end.
214 $collation->add_path( $last_reading,
215 $collation->next_reading( $lemma_end ),
219 if( $var_start ) { # if it wasn't an empty reading
220 push( @readings, $var_start, $last_reading );
224 # Now collate and collapse the identical readings within the collation.
225 collate_variants( $collation, @readings );
228 # Now make the witness objects
229 foreach my $w ( keys %all_witnesses ) {
230 my $base = _is_post_corr( $w );
232 my $pctag = substr( $w, length( $base ) );
233 my $existing_wit = $collation->tradition->witness( $base );
234 unless( $existing_wit ) {
235 $existing_wit = $collation->tradition->add_witness( sigil => $base );
237 $existing_wit->post_correctione( $pctag );
239 $collation->tradition->add_witness( sigil => $w )
240 unless $collation->tradition->witness( $w );
244 # Now walk paths and calculate positions.
245 my @common_readings =
246 $collation->walk_and_expand_base( $collation->reading( '#END#' ) );
247 $collation->calculate_positions( @common_readings );
252 my @line_beginnings = read_base( 'reference.txt', $collation );
254 Takes a text file and a (presumed empty) collation object, adds the
255 words as simple linear readings to the collation, and returns a
256 list of readings that represent the beginning of lines. This collation
257 is now the starting point for application of apparatus entries in
258 merge_base, e.g. from a CSV file or a Classical Text Editor file.
263 my( $base_file, $collation ) = @_;
265 # This array gives the first reading for each line. We put the
266 # common starting point in line zero.
267 my $last_reading = $collation->start();
268 my $lineref_array = [ $last_reading ]; # There is no line zero.
270 open( BASE, $base_file ) or die "Could not open file $base_file: $!";
272 # Make the readings, and connect them up for the base, but
273 # also save the first reading of each line in an array for the
275 # TODO use configurable reading separator
280 my $lineref = scalar @$lineref_array;
281 foreach my $w ( @words ) {
282 my $readingref = join( ',', $lineref, ++$wordref );
283 my $reading = $collation->add_reading( $readingref );
284 $reading->text( $w );
285 $reading->make_common();
287 push( @$lineref_array, $reading );
290 if( $last_reading ) {
291 my $path = $collation->add_path( $last_reading, $reading,
293 $path->set_attribute( 'class', 'basetext' );
294 $last_reading = $reading;
295 } # TODO there should be no else here...
299 # Ending point for all texts
300 my $endpoint = $collation->add_reading( '#END#' );
301 $collation->add_path( $last_reading, $endpoint, "base text" );
302 push( @$lineref_array, $endpoint );
304 return( @$lineref_array );
307 =item B<collate_variants>
309 collate_variants( $collation, @readings )
311 Given a set of readings in the form
312 ( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
313 walks through each to identify those readings that are identical. The
314 collation is a Text::Tradition::Collation object; the elements of
315 @readings are Text::Tradition::Collation::Reading objects that appear
316 on the collation graph.
318 TODO: Handle collapsed and non-collapsed transpositions.
322 sub collate_variants {
323 my( $collation, @readings ) = @_;
324 my $lemma_start = shift @readings;
325 my $lemma_end = shift @readings;
328 # Start the list of distinct readings with those readings in the lemma.
329 my @distinct_readings;
330 while( $lemma_start ne $lemma_end ) {
331 push( @distinct_readings, [ $lemma_start, 'base text' ] );
332 $lemma_start = $collation->next_reading( $lemma_start );
334 push( @distinct_readings, [ $lemma_end, 'base text' ] );
337 while( scalar @readings ) {
338 my( $var_start, $var_end ) = splice( @readings, 0, 2 );
340 # I want to look at the readings in the variant and lemma, and
341 # collapse readings that are the same word. This is mini-collation.
342 # Each word in the 'main' list can only be collapsed once with a
343 # word from the current reading.
346 # Get the variant witnesses. They will all be going along the
347 # same path, so just use the first one as representative for
348 # the purpose of following the path.
349 my @var_wits = map { $_->label } $var_start->outgoing();
350 my $rep_wit = $var_wits[0];
352 my @variant_readings;
353 while( $var_start ne $var_end ) {
354 push( @variant_readings, $var_start );
355 $var_start = $collation->next_reading( $var_start, $rep_wit );
357 push( @variant_readings, $var_end );
359 # Go through the variant readings, and if we find a lemma reading that
360 # hasn't yet been collapsed with a reading, equate them. If we do
361 # not, keep them to push onto the end of all_readings.
362 # TODO replace this with proper mini-collation
363 my @remaining_readings;
366 foreach my $w ( @variant_readings ) {
367 my $word = $w->label();
369 foreach my $idx ( $last_index .. $#distinct_readings ) {
370 my( $l, $pathlabel ) = @{$distinct_readings[$idx]};
371 if( $word eq cmp_str( $l ) ) {
372 next if exists( $collapsed{ $l->label } )
373 && $collapsed{ $l->label } eq $l;
375 $last_index = $idx if $detranspose;
376 # Collapse the readings.
377 printf STDERR "Merging readings %s/%s and %s/%s\n",
378 $l->name, $l->label, $w->name, $w->label;
379 $collation->merge_readings( $l, $w );
380 $collapsed{ $l->label } = $l;
381 # Now collapse any multiple paths to and from the reading.
382 remove_duplicate_paths( $collation,
383 $collation->prior_reading( $l, $rep_wit ), $l );
384 remove_duplicate_paths( $collation, $l,
385 $collation->next_reading( $l, $rep_wit ) );
389 push( @remaining_readings, [ $w, $rep_wit ] ) unless $matched;
391 push( @distinct_readings, @remaining_readings) if scalar( @remaining_readings );
395 =item B<remove_duplicate_paths>
397 remove_duplicate_paths( $collation, $from, $to );
399 Given two readings, reduce the number of paths between those readings to
400 a set of unique paths.
404 # TODO wonder if this is necessary
405 sub remove_duplicate_paths {
406 my( $collation, $from, $to ) = @_;
408 foreach my $p ( $from->edges_to( $to ) ) {
409 if( exists $seen_paths{$p->name} ) {
410 $collation->del_path( $p );
412 $seen_paths{$p->name} = 1;
417 # Helper function. Given a witness sigil, if it is a post-correctione
418 # sigil,return the base witness. If not, return a false value.
421 if( $sigil =~ /^(.*?)(\s*\(p\.\s*c\.\))$/ ) {
429 Pretend you never saw this method. Really it needs to not be hardcoded.
435 my $word = $reading->label();
441 $word =~ s/quatuor/quattuor/g;
442 $word =~ s/ioannes/iohannes/g;
450 This package is free software and is provided "as is" without express
451 or implied warranty. You can redistribute it and/or modify it under
452 the same terms as Perl itself.
456 Tara L Andrews, aurum@cpan.org