1 package Text::Tradition::Parser::BaseText;
10 Text::Tradition::Parser::BaseText
14 use Text::Tradition::Parser::BaseText qw( merge_base );
15 merge_base( $graph, 'reference.txt', @apparatus_entries )
19 For an overview of the package, see the documentation for the
20 Text::Tradition::Graph module.
22 This module is meant for use with certain of the other Parser classes
23 - whenever a list of variants is given with reference to a base text,
24 these must be joined into a single collation. The parser should
25 therefore make a list of variants and their locations, and BaseText
26 will join those listed variants onto the reference text.
34 parse( $graph, %opts );
36 Takes an initialized graph and a set of options, which must include:
37 - 'base' - the base text referenced by the variants
38 - 'format' - the format of the variant list
39 - 'data' - the variants, in the given format.
44 my( $tradition, %opts ) = @_;
46 my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'};
48 my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} );
49 merge_base( $tradition->collation, $opts{'base'}, @apparatus_entries );
54 merge_base( $graph, 'reference.txt', @apparatus_entries )
56 Takes three arguments: a newly-initialized Text::Tradition::Graph
57 object, a text file containing the reference text, and a list of
58 variants (apparatus entries). Adds the base text to the graph, and
59 joins the variants to that.
61 The list of variants is an array of hash references; each hash takes
63 { '_id' => line reference,
64 'rdg_0' => lemma reading,
65 'rdg_1' => first variant,
66 ... # and so on until all distinct readings are listed
67 'WitnessA' => 'rdg_0',
68 'WitnessB' => 'rdg_1',
69 ... # and so on until all witnesses are listed with their readings
72 Any hash key that is not of the form /^rdg_\d+$/ and that does not
73 begin with an underscore is assumed to be a witness name. Any 'meta'
74 information to be passed must be passed in a key with a leading
75 underscore in its name.
79 my $SHORTEND = 20; # Debug var - set this to limit the number of lines parsed
82 my $edits_required = {};
84 # edits_required -> wit -> [ { start_idx, end_idx, items } ]
87 my( $collation, $base_file, @app_entries ) = @_;
88 my @base_line_starts = read_base( $base_file, $collation );
91 my @unwitnessed_lemma_nodes;
92 foreach my $app ( @app_entries ) {
93 my( $line, $num ) = split( /\./, $app->{_id} );
94 # DEBUG with a short graph
95 last if $SHORTEND && $line > $SHORTEND;
96 # DEBUG for problematic entries
98 my $first_line_reading = $base_line_starts[ $line ];
99 my $too_far = $base_line_starts[ $line+1 ];
101 my $lemma = $app->{rdg_0};
103 # Is this the Nth occurrence of this reading in the line?
104 if( $lemma =~ s/(_)?(\d)$// ) {
107 my @lemma_words = split( /\s+/, $lemma );
109 # Now search for the lemma words within this line.
110 my $lemma_start = $first_line_reading;
113 while( $lemma_start ne $too_far ) {
115 if( $seen{ $lemma_start->name() } ) {
116 warn "Detected loop at " . $lemma_start->name() .
120 $seen{ $lemma_start->name() } = 1;
122 # Try to match the lemma.
124 print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
125 $lemma_words[0] . "...\n"
126 if "$line.$num" eq $scrutinize;
127 if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
128 # Skip it if we need a match that is not the first.
130 # Now we have to compare the rest of the words here.
131 if( scalar( @lemma_words ) > 1 ) {
133 $collation->next_reading( $lemma_start );
134 foreach my $w ( @lemma_words[1..$#lemma_words] ) {
135 printf STDERR "Now matching %s against %s\n",
136 cmp_str($next_reading), $w
137 if "$line.$num" eq $scrutinize;
138 if( $w ne cmp_str($next_reading) ) {
142 $lemma_end = $next_reading;
144 $collation->next_reading( $lemma_end );
148 $lemma_end = $lemma_start;
154 last unless ( $unmatch || !defined( $lemma_end ) );
156 $lemma_start = $collation->next_reading( $lemma_start );
159 unless( $lemma_end ) {
160 warn "No match found for @lemma_words at $line.$num";
164 # Now we have found the lemma; we will record an 'edit', in
165 # terms of a splice operation, for each subsequent reading.
166 # We also note which witnesses take the given edit.
168 my @lemma_set = $collation->reading_sequence( $lemma_start,
170 my @reading_sets = [ @lemma_set ];
172 # For each reading that is not rdg_0, we create the variant
173 # reading nodes, and store the range as an edit operation on
176 my %pc_seen; # Keep track of mss with explicit post-corr data
177 foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
178 my @mss = grep { $app->{$_} eq $k } keys( %$app );
179 push( @unwitnessed_lemma_nodes, @lemma_set )
180 if !@mss && $k eq 'rdg_0';
182 # Keep track of what witnesses we have seen.
183 @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
184 # Keep track of which witnesses bear corrected readings here.
185 foreach my $m ( @mss ) {
186 my $base = _is_post_corr( $m );
190 next if $k eq 'rdg_0';
192 # TODO don't hardcode the reading split operation
193 my @variant = split( /\s+/, $app->{$k} );
194 @variant = () if $app->{$k} eq '/'; # This is an omission.
196 # Make the variant into a set of readings.
197 my @variant_readings;
199 foreach my $vw ( @variant ) {
200 my $vwname = "$k/$line.$num.$ctr"; $ctr++;
201 my $vwreading = $collation->add_reading( $vwname );
202 $vwreading->text( $vw );
203 push( @variant_readings, $vwreading );
206 $variant_objects->{$k} = { 'mss' => \@mss,
207 'reading' => \@variant_readings,
209 push( @reading_sets, \@variant_readings );
212 # Now collate and collapse the identical readings within the
213 # collated sets. Modifies the reading sets that were passed.
214 collate_variants( $collation, @reading_sets );
216 # TODO Here would be a very good place to set up relationships
217 # between the nodes and the lemma.
218 set_relationships( $collation, $app, \@lemma_set, $variant_objects );
220 # Now create the splice-edit objects that will be used
221 # to reconstruct each witness.
223 foreach my $rkey ( keys %$variant_objects ) {
224 # Object is argument list for splice, so:
225 # offset, length, replacements
226 my $edit_object = [ $lemma_start->name,
227 scalar( @lemma_set ),
228 $variant_objects->{$rkey}->{reading} ];
229 foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
230 # Is this a p.c. entry?
231 my $base = _is_post_corr( $ms );
232 if( $base ) { # this is a post-corr witness
233 my $pc_key = $base . "_post";
234 _add_hash_entry( $edits_required, $pc_key, $edit_object );
235 } else { # this is an ante-corr witness
236 my $pc_key = $ms . "_post";
237 _add_hash_entry( $edits_required, $ms, $edit_object );
238 unless( $pc_seen{$ms} ) {
239 # If this witness carries no correction, add this
240 # same object to its post-corrected state.
241 _add_hash_entry( $edits_required, $pc_key,
247 } # Finished going through the apparatus entries
249 # Now make the witness objects, and create their text sequences
250 foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) {
251 print STDERR "Creating witness $w\n";
252 my $witness_obj = $collation->tradition->add_witness( sigil => $w );
253 my $debug = undef; # $w eq 'Vb10';
254 my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w}, $debug );
255 my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"}, $debug )
256 if exists( $edits_required->{$w."_post"} );
258 my @repeated = _check_for_repeated( @ante_corr_seq );
259 warn "Repeated elements @repeated in $w a.c."
261 @repeated = _check_for_repeated( @post_corr_seq );
262 warn "Repeated elements @repeated in $w p.c."
265 # Now save these paths in my witness object
266 if( @post_corr_seq ) {
267 $witness_obj->path( \@post_corr_seq );
268 $witness_obj->uncorrected_path( \@ante_corr_seq );
270 $witness_obj->path( \@ante_corr_seq );
274 # Now remove our 'base text' edges, which is to say, the only
275 # ones we have created so far. Also remove any nodes that didn't
276 # appear in any witnesses.
277 foreach ( $collation->paths() ) {
278 $collation->del_path( $_ );
280 foreach( @unwitnessed_lemma_nodes ) {
281 $collation->del_reading( $_ );
284 # Now walk paths and calculate positions.
285 my @common_readings =
286 $collation->make_witness_paths();
287 $collation->calculate_positions( @common_readings );
290 sub _check_for_repeated {
295 if( exists $unique{$_->name} ) {
296 push( @repeated, $_->name );
298 $unique{$_->name} = 1;
306 my @line_beginnings = read_base( 'reference.txt', $collation );
308 Takes a text file and a (presumed empty) collation object, adds the
309 words as simple linear readings to the collation, and returns a
310 list of readings that represent the beginning of lines. This collation
311 is now the starting point for application of apparatus entries in
312 merge_base, e.g. from a CSV file or a Classical Text Editor file.
317 my( $base_file, $collation ) = @_;
319 # This array gives the first reading for each line. We put the
320 # common starting point in line zero.
321 my $last_reading = $collation->start();
322 $base_text_index{$last_reading->name} = 0;
323 my $lineref_array = [ $last_reading ]; # There is no line zero.
325 open( BASE, $base_file ) or die "Could not open file $base_file: $!";
328 # Make the readings, and connect them up for the base, but
329 # also save the first reading of each line in an array for the
331 # TODO use configurable reading separator
336 my $lineref = scalar @$lineref_array;
337 last if $SHORTEND && $lineref > $SHORTEND;
338 foreach my $w ( @words ) {
339 my $readingref = join( ',', $lineref, ++$wordref );
340 my $reading = $collation->add_reading( $readingref );
341 $reading->text( $w );
343 push( @$lineref_array, $reading );
346 # Add edge paths in the graph, for easier tracking when
347 # we start applying corrections. These paths will be
348 # removed when we're done.
349 my $path = $collation->add_path( $last_reading, $reading,
350 $collation->baselabel );
351 $last_reading = $reading;
353 # Note an array index for the reading, for later correction splices.
354 $base_text_index{$readingref} = $i++;
358 # Ending point for all texts
359 my $endpoint = $collation->add_reading( '#END#' );
360 $collation->add_path( $last_reading, $endpoint, $collation->baselabel );
361 push( @$lineref_array, $endpoint );
362 $base_text_index{$endpoint->name} = $i;
364 return( @$lineref_array );
367 =item B<collate_variants>
369 collate_variants( $collation, @reading_ranges )
371 Given a set of readings in the form
372 ( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
373 walks through each to identify those readings that are identical. The
374 collation is a Text::Tradition::Collation object; the elements of
375 @readings are Text::Tradition::Collation::Reading objects that appear
376 on the collation graph.
378 TODO: Handle collapsed and non-collapsed transpositions.
382 sub collate_variants {
383 my( $collation, @reading_sets ) = @_;
385 # Two different ways to do this, depending on whether we want
386 # transposed reading nodes to be merged into one (producing a
387 # nonlinear, bidirectional graph) or not (producing a relatively
388 # linear, unidirectional graph.)
389 return $collation->linear ? collate_linearly( @_ )
390 : collate_nonlinearly( @_ );
393 sub collate_linearly {
394 my( $collation, $lemma_set, @variant_sets ) = @_;
397 push( @unique, @$lemma_set );
398 while( @variant_sets ) {
399 my $variant_set = shift @variant_sets;
400 # Use diff to do this job
401 my $diff = Algorithm::Diff->new( \@unique, $variant_set,
402 {'keyGen' => \&_collation_hash} );
405 while( $diff->Next ) {
408 my @l = $diff->Items( 1 );
409 my @v = $diff->Items( 2 );
410 foreach my $i ( 0 .. $#l ) {
411 if( !$merged{$l[$i]->name} ) {
412 $collation->merge_readings( $l[$i], $v[$i] );
413 $merged{$l[$i]->name} = 1;
415 print STDERR "Would have double merged " . $l[$i]->name . "\n";
418 # splice the lemma nodes into the variant set
419 my( $offset ) = $diff->Get( 'min2' );
420 splice( @$variant_set, $offset, scalar( @l ), @l );
421 push( @new_unique, @l );
423 # Keep the old unique readings
424 push( @new_unique, $diff->Items( 1 ) ) if $diff->Items( 1 );
425 # Add the new readings to the 'unique' list
426 push( @new_unique, $diff->Items( 2 ) ) if $diff->Items( 2 );
429 @unique = @new_unique;
433 sub collate_nonlinearly {
434 my( $collation, $lemma_set, @variant_sets ) = @_;
437 push( @unique, @$lemma_set );
438 while( @variant_sets ) {
439 my $variant_set = shift @variant_sets;
440 # Simply match the first reading that carries the same word, so
441 # long as that reading has not yet been used to match another
442 # word in this variant. That way lies loopy madness.
445 foreach my $idx ( 0 .. $#{$variant_set} ) {
446 my $vw = $variant_set->[$idx];
447 my @same = grep { cmp_str( $_ ) eq $vw->label } @unique;
450 foreach my $i ( 0 .. $#same ) {
451 unless( $merged{$same[$i]->name} ) {
452 print STDERR sprintf( "Merging %s into %s\n",
455 $collation->merge_readings( $same[$i], $vw );
456 $merged{$same[$i]->name} = 1;
458 $variant_set->[$idx] = $same[$i];
462 unless( @same && defined($matched) ) {
463 push( @distinct, $vw );
466 push( @unique, @distinct );
472 sub _collation_hash {
474 return cmp_str( $node );
477 sub set_relationships {
478 my( $collation, $app, $lemma, $variants ) = @_;
479 foreach my $rkey ( keys %$variants ) {
480 my $var = $variants->{$rkey}->{'reading'};
481 my $typekey = sprintf( "_%s_type", $rkey );
482 my $type = $app->{$typekey};
484 if( $type =~ /^(inv|tr|rep)$/i ) {
485 # Transposition or repetition: look for nodes with the
486 # same label but different IDs and mark them.
487 $type = 'repetition' if $type =~ /^rep/i;
488 $DB::single = 1 if $type eq 'repetition';
490 foreach my $r ( @$lemma ) {
491 $labels{cmp_str( $r )} = $r;
493 foreach my $r( @$var ) {
494 if( exists $labels{$r->label} &&
495 $r->name ne $labels{$r->label}->name ) {
496 if( $type eq 'repetition' ) {
498 $collation->add_relationship( $type, $r, $labels{$r->label} );
501 $r->set_identical( $labels{$r->label} );
505 } elsif( $type =~ /^(gr|sp(el)?)$/i ) {
506 # Grammar/spelling: this can be a one-to-one or one-to-many
507 # mapping. We should think about merging readings if it is
509 $type = 'grammatical' if $type =~ /gr/i;
510 $type = 'spelling' if $type =~ /sp/i;
511 $type = 'repetition' if $type =~ /rep/i;
512 if( @$lemma == @$var ) {
513 foreach my $i ( 0 .. $#{$lemma} ) {
514 $collation->add_relationship( $type, $var->[$i],
517 } elsif ( @$lemma > @$var && @$var == 1 ) {
518 # Merge the lemma readings into one
519 ## TODO This is a bad solution. We need a real one-to-many
521 my $ln1 = shift @$lemma;
522 foreach my $ln ( @$lemma ) {
523 $collation->merge_readings( $ln1, $ln, ' ' );
526 $collation->add_relationship( $type, $var->[0], $lemma->[0] );
527 } elsif ( @$lemma < @$var && @$lemma == 1 ) {
528 my $vn1 = shift @$var;
529 foreach my $vn ( @$var ) {
530 $collation->merge_readings( $vn1, $vn, ' ' );
533 $collation->add_relationship( $type, $var->[0], $lemma->[0] );
535 warn "Cannot set $type relationship on a many-to-many variant";
537 } elsif( $type !~ /^(lex|add|om)$/i ) {
538 warn "Unrecognized type $type";
546 my( $collation, $edit_sequence, $debug ) = @_;
547 my @lemma_text = $collation->reading_sequence( $collation->start,
548 $collation->reading( '#END#' ) );
550 foreach my $correction ( @$edit_sequence ) {
551 my( $lemma_start, $length, $items ) = @$correction;
552 my $offset = $base_text_index{$lemma_start};
553 my $realoffset = $offset + $drift;
555 $lemma_text[$realoffset]->name ne $lemma_start ) {
556 my @this_phrase = @lemma_text[$realoffset..$realoffset+$length-1];
559 my $l = $collation->reading( $lemma_start );
560 while( $i < $realoffset+$length ) {
561 push( @base_phrase, $l );
562 $l = $collation->next_reading( $l );
566 print STDERR sprintf( "Trying to replace %s (%s) starting at %d " .
567 "with %s (%s) with drift %d\n",
568 join( ' ', map {$_->label} @base_phrase ),
569 join( ' ', map {$_->name} @base_phrase ),
571 join( ' ', map {$_->label} @$items ),
572 join( ' ', map {$_->name} @$items ),
576 warn( sprintf( "Should be replacing %s (%s) with %s (%s) " .
577 "but %s (%s) is there instead",
578 join( ' ', map {$_->label} @base_phrase ),
579 join( ' ', map {$_->name} @base_phrase ),
580 join( ' ', map {$_->label} @$items ),
581 join( ' ', map {$_->name} @$items ),
582 join( ' ', map {$_->label} @this_phrase ),
583 join( ' ', map {$_->name} @this_phrase ),
585 if $lemma_text[$realoffset]->name ne $lemma_start;
587 splice( @lemma_text, $realoffset, $length, @$items );
588 $drift += @$items - $length;
594 # Helper function. Given a witness sigil, if it is a post-correctione
595 # sigil,return the base witness. If not, return a false value.
598 if( $sigil =~ /^(.*?)(\s*\(?p\.\s*c\.\)?)$/ ) {
604 sub _add_hash_entry {
605 my( $hash, $key, $entry ) = @_;
606 if( exists $hash->{$key} ) {
607 push( @{$hash->{$key}}, $entry );
609 $hash->{$key} = [ $entry ];
616 Pretend you never saw this method. Really it needs to not be hardcoded.
622 my $word = $reading->label();
628 $word =~ s/quatuor/quattuor/g;
629 $word =~ s/ioannes/iohannes/g;
637 This package is free software and is provided "as is" without express
638 or implied warranty. You can redistribute it and/or modify it under
639 the same terms as Perl itself.
643 Tara L Andrews, aurum@cpan.org