1 package Text::Tradition::Parser::BaseText;
9 Text::Tradition::Parser::BaseText
13 use Text::Tradition::Parser::BaseText qw( merge_base );
14 merge_base( $graph, 'reference.txt', @apparatus_entries )
18 For an overview of the package, see the documentation for the
19 Text::Tradition::Graph module.
21 This module is meant for use with certain of the other Parser classes
22 - whenever a list of variants is given with reference to a base text,
23 these must be joined into a single collation. The parser should
24 therefore make a list of variants and their locations, and BaseText
25 will join those listed variants onto the reference text.
33 parse( $graph, %opts );
35 Takes an initialized graph and a set of options, which must include:
36 - 'base' - the base text referenced by the variants
37 - 'format' - the format of the variant list
38 - 'data' - the variants, in the given format.
43 my( $tradition, %opts ) = @_;
45 my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'};
47 my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} );
48 merge_base( $tradition->collation, $opts{'base'}, @apparatus_entries );
53 merge_base( $graph, 'reference.txt', @apparatus_entries )
55 Takes three arguments: a newly-initialized Text::Tradition::Graph
56 object, a text file containing the reference text, and a list of
57 variants (apparatus entries). Adds the base text to the graph, and
58 joins the variants to that.
60 The list of variants is an array of hash references; each hash takes
62 { '_id' => line reference,
63 'rdg_0' => lemma reading,
64 'rdg_1' => first variant,
65 ... # and so on until all distinct readings are listed
66 'WitnessA' => 'rdg_0',
67 'WitnessB' => 'rdg_1',
68 ... # and so on until all witnesses are listed with their readings
71 Any hash key that is not of the form /^rdg_\d+$/ and that does not
72 begin with an underscore is assumed to be a witness name. Any 'meta'
73 information to be passed must be passed in a key with a leading
74 underscore in its name.
81 my( $collation, $base_file, @app_entries ) = @_;
82 my @base_line_starts = read_base( $base_file, $collation );
85 foreach my $app ( @app_entries ) {
86 my( $line, $num ) = split( /\./, $app->{_id} );
87 # DEBUG with a short graph
88 last if $SHORT && $line > $SHORT;
89 # DEBUG for problematic entries
91 my $first_line_reading = $base_line_starts[ $line ];
92 my $too_far = $base_line_starts[ $line+1 ];
94 my $lemma = $app->{rdg_0};
96 # Is this the Nth occurrence of this reading in the line?
97 if( $lemma =~ s/(_)?(\d)$// ) {
100 my @lemma_words = split( /\s+/, $lemma );
102 # Now search for the lemma words within this line.
103 my $lemma_start = $first_line_reading;
106 while( $lemma_start ne $too_far ) {
108 if( $seen{ $lemma_start->name() } ) {
109 warn "Detected loop at " . $lemma_start->name() .
113 $seen{ $lemma_start->name() } = 1;
115 # Try to match the lemma.
117 print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
118 $lemma_words[0] . "...\n"
119 if "$line.$num" eq $scrutinize;
120 if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
121 # Skip it if we need a match that is not the first.
123 # Now we have to compare the rest of the words here.
124 if( scalar( @lemma_words ) > 1 ) {
126 $collation->next_reading( $lemma_start );
127 foreach my $w ( @lemma_words[1..$#lemma_words] ) {
128 printf STDERR "Now matching %s against %s\n",
129 cmp_str($next_reading), $w
130 if "$line.$num" eq $scrutinize;
131 if( $w ne cmp_str($next_reading) ) {
135 $lemma_end = $next_reading;
137 $collation->next_reading( $lemma_end );
141 $lemma_end = $lemma_start;
147 last unless ( $unmatch || !defined( $lemma_end ) );
149 $lemma_start = $collation->next_reading( $lemma_start );
152 unless( $lemma_end ) {
153 warn "No match found for @lemma_words at $line.$num";
156 # These are no longer common readings; unmark them as such.
157 my @lemma_readings = $collation->reading_sequence( $lemma_start,
159 map { $_->make_variant } @lemma_readings;
162 # Now we have our lemma readings; we add the variant readings
165 # Keep track of the start and end point of each reading for later
167 my @readings = ( $lemma_start, $lemma_end );
169 # For each reading that is not rdg_0, we make a chain of readings
170 # and connect them to the anchor. Edges are named after the mss
172 foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
173 if( $k eq 'rdg_0' ) { # that's the lemma
174 # The lemma is already in the graph, but we need to look for
175 # any explicit post-correctione readings and add the
177 my @mss = grep { $app->{$_} eq $k } keys( %$app );
178 # Keep track of what witnesses we have seen.
179 @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
180 foreach my $m ( @mss ) {
181 my $base = _is_post_corr( $m );
183 my @lem = $collation->reading_sequence( $lemma_start, $lemma_end );
184 $collation->add_path( $collation->prior_reading( $lem[0] ), $lem[0], $m );
185 foreach my $i ( 0 .. $#lem-1 ) {
186 $collation->add_path( $lem[$i], $lem[++$i], $m );
188 $collation->add_path( $lem[-1], $collation->next_reading( $lem[-1] ), $m );
192 my @variant = split( /\s+/, $app->{$k} );
193 @variant = () if $app->{$k} eq '/'; # This is an omission.
194 my @mss = grep { $app->{$_} eq $k } keys( %$app );
197 print STDERR "Skipping '@variant' at $line.$num: no mss\n";
201 # Keep track of what witnesses we have seen.
202 @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
204 # Make the variant into a set of readings.
206 my $last_reading = $collation->prior_reading( $lemma_start );
208 foreach my $vw ( @variant ) {
209 my $vwname = "$k/$line.$num.$ctr"; $ctr++;
210 my $vwreading = $collation->add_reading( $vwname );
211 $vwreading->text( $vw );
212 $vwreading->make_variant();
214 $collation->add_path( $last_reading, $vwreading, $_ );
216 $var_start = $vwreading unless $var_start;
217 $last_reading = $vwreading;
219 # Now hook it up at the end.
221 $collation->add_path( $last_reading,
222 $collation->next_reading( $lemma_end ),
226 if( $var_start ) { # if it wasn't an empty reading
227 push( @readings, $var_start, $last_reading );
231 # Now collate and collapse the identical readings within the collation.
232 collate_variants( $collation, @readings );
235 # Now make the witness objects
236 foreach my $w ( keys %all_witnesses ) {
237 my $base = _is_post_corr( $w );
239 my $pctag = substr( $w, length( $base ) );
240 my $existing_wit = $collation->tradition->witness( $base );
241 unless( $existing_wit ) {
242 $existing_wit = $collation->tradition->add_witness( sigil => $base );
244 $existing_wit->post_correctione( $pctag );
246 $collation->tradition->add_witness( sigil => $w )
247 unless $collation->tradition->witness( $w );
251 # Now walk paths and calculate positions.
252 my @common_readings =
253 $collation->walk_and_expand_base( $collation->reading( '#END#' ) );
254 $collation->calculate_positions( @common_readings );
259 my @line_beginnings = read_base( 'reference.txt', $collation );
261 Takes a text file and a (presumed empty) collation object, adds the
262 words as simple linear readings to the collation, and returns a
263 list of readings that represent the beginning of lines. This collation
264 is now the starting point for application of apparatus entries in
265 merge_base, e.g. from a CSV file or a Classical Text Editor file.
270 my( $base_file, $collation ) = @_;
272 # This array gives the first reading for each line. We put the
273 # common starting point in line zero.
274 my $last_reading = $collation->start();
275 my $lineref_array = [ $last_reading ]; # There is no line zero.
277 open( BASE, $base_file ) or die "Could not open file $base_file: $!";
279 # Make the readings, and connect them up for the base, but
280 # also save the first reading of each line in an array for the
282 # TODO use configurable reading separator
287 my $lineref = scalar @$lineref_array;
288 last if $SHORT && $lineref > $SHORT;
289 foreach my $w ( @words ) {
290 my $readingref = join( ',', $lineref, ++$wordref );
291 my $reading = $collation->add_reading( $readingref );
292 $reading->text( $w );
293 $reading->make_common();
295 push( @$lineref_array, $reading );
298 if( $last_reading ) {
299 my $path = $collation->add_path( $last_reading, $reading,
300 $collation->baselabel );
301 $path->set_attribute( 'class', 'basetext' );
302 $last_reading = $reading;
303 } # TODO there should be no else here...
307 # Ending point for all texts
308 my $endpoint = $collation->add_reading( '#END#' );
309 $collation->add_path( $last_reading, $endpoint, $collation->baselabel );
310 push( @$lineref_array, $endpoint );
312 return( @$lineref_array );
315 =item B<collate_variants>
317 collate_variants( $collation, @readings )
319 Given a set of readings in the form
320 ( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
321 walks through each to identify those readings that are identical. The
322 collation is a Text::Tradition::Collation object; the elements of
323 @readings are Text::Tradition::Collation::Reading objects that appear
324 on the collation graph.
326 TODO: Handle collapsed and non-collapsed transpositions.
330 sub collate_variants {
331 my( $collation, @readings ) = @_;
332 my $lemma_start = shift @readings;
333 my $lemma_end = shift @readings;
336 # Start the list of distinct readings with those readings in the lemma.
337 my @distinct_readings;
338 while( $lemma_start ne $lemma_end ) {
339 push( @distinct_readings, [ $lemma_start, $collation->baselabel ] );
340 $lemma_start = $collation->next_reading( $lemma_start );
342 push( @distinct_readings, [ $lemma_end, $collation->baselabel ] );
345 while( scalar @readings ) {
346 my( $var_start, $var_end ) = splice( @readings, 0, 2 );
348 # I want to look at the readings in the variant and lemma, and
349 # collapse readings that are the same word. This is mini-collation.
350 # Each word in the 'main' list can only be collapsed once with a
351 # word from the current reading.
354 # Get the variant witnesses. They will all be going along the
355 # same path, so just use the first one as representative for
356 # the purpose of following the path.
357 my @var_wits = map { $_->label } $var_start->outgoing();
358 my $rep_wit = $var_wits[0];
360 my @variant_readings;
361 while( $var_start ne $var_end ) {
362 push( @variant_readings, $var_start );
363 $var_start = $collation->next_reading( $var_start, $rep_wit );
365 push( @variant_readings, $var_end );
367 # Go through the variant readings, and if we find a lemma reading that
368 # hasn't yet been collapsed with a reading, equate them. If we do
369 # not, keep them to push onto the end of all_readings.
370 # TODO replace this with proper mini-collation
371 my @remaining_readings;
374 foreach my $w ( @variant_readings ) {
375 my $word = $w->label();
377 foreach my $idx ( $last_index .. $#distinct_readings ) {
378 my( $l, $pathlabel ) = @{$distinct_readings[$idx]};
379 if( $word eq cmp_str( $l ) ) {
380 next if exists( $collapsed{ $l->label } )
381 && $collapsed{ $l->label } eq $l;
383 $last_index = $idx if $detranspose;
384 # Collapse the readings.
385 printf STDERR "Merging readings %s/%s and %s/%s\n",
386 $l->name, $l->label, $w->name, $w->label;
387 $collation->merge_readings( $l, $w );
388 $collapsed{ $l->label } = $l;
389 # Now collapse any multiple paths to and from the reading.
390 remove_duplicate_paths( $collation,
391 $collation->prior_reading( $l, $rep_wit ), $l );
392 remove_duplicate_paths( $collation, $l,
393 $collation->next_reading( $l, $rep_wit ) );
397 push( @remaining_readings, [ $w, $rep_wit ] ) unless $matched;
399 push( @distinct_readings, @remaining_readings) if scalar( @remaining_readings );
403 =item B<remove_duplicate_paths>
405 remove_duplicate_paths( $collation, $from, $to );
407 Given two readings, reduce the number of paths between those readings to
408 a set of unique paths.
412 # TODO wonder if this is necessary
413 sub remove_duplicate_paths {
414 my( $collation, $from, $to ) = @_;
416 foreach my $p ( $from->edges_to( $to ) ) {
417 if( exists $seen_paths{$p->name} ) {
418 $collation->del_path( $p );
420 $seen_paths{$p->name} = 1;
425 # Helper function. Given a witness sigil, if it is a post-correctione
426 # sigil,return the base witness. If not, return a false value.
429 if( $sigil =~ /^(.*?)(\s*\(?p\.\s*c\.\)?)$/ ) {
437 Pretend you never saw this method. Really it needs to not be hardcoded.
443 my $word = $reading->label();
449 $word =~ s/quatuor/quattuor/g;
450 $word =~ s/ioannes/iohannes/g;
458 This package is free software and is provided "as is" without express
459 or implied warranty. You can redistribute it and/or modify it under
460 the same terms as Perl itself.
464 Tara L Andrews, aurum@cpan.org