Merge branch 'master' of github.com:tla/stemmatology
[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / BaseText.pm
CommitLineData
e58153d6 1package Text::Tradition::Parser::BaseText;
b49c4318 2
3use strict;
4use warnings;
52ce987f 5use Module::Load;
4ca00eca 6use Algorithm::Diff;
b49c4318 7
2ceca8c3 8=head1 NAME
9
10Text::Tradition::Parser::BaseText
11
12=head1 SYNOPSIS
13
14use Text::Tradition::Parser::BaseText qw( merge_base );
15merge_base( $graph, 'reference.txt', @apparatus_entries )
16
17=head1 DESCRIPTION
18
19For an overview of the package, see the documentation for the
20Text::Tradition::Graph module.
21
22This module is meant for use with certain of the other Parser classes
23- whenever a list of variants is given with reference to a base text,
24these must be joined into a single collation. The parser should
25therefore make a list of variants and their locations, and BaseText
26will join those listed variants onto the reference text.
27
28=head1 SUBROUTINES
29
30=over
31
52ce987f 32=item B<parse>
33
34parse( $graph, %opts );
35
36Takes an initialized graph and a set of options, which must include:
37- 'base' - the base text referenced by the variants
38- 'format' - the format of the variant list
39- 'data' - the variants, in the given format.
40
41=cut
42
43sub parse {
e2902068 44 my( $tradition, %opts ) = @_;
52ce987f 45
46 my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'};
47 load( $format_mod );
48 my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} );
e2902068 49 merge_base( $tradition->collation, $opts{'base'}, @apparatus_entries );
52ce987f 50}
51
2ceca8c3 52=item B<merge_base>
53
54merge_base( $graph, 'reference.txt', @apparatus_entries )
55
56Takes three arguments: a newly-initialized Text::Tradition::Graph
57object, a text file containing the reference text, and a list of
58variants (apparatus entries). Adds the base text to the graph, and
59joins the variants to that.
60
61The list of variants is an array of hash references; each hash takes
62the form
63 { '_id' => line reference,
64 'rdg_0' => lemma reading,
65 'rdg_1' => first variant,
66 ... # and so on until all distinct readings are listed
67 'WitnessA' => 'rdg_0',
68 'WitnessB' => 'rdg_1',
69 ... # and so on until all witnesses are listed with their readings
70 }
71
72Any hash key that is not of the form /^rdg_\d+$/ and that does not
73begin with an underscore is assumed to be a witness name. Any 'meta'
74information to be passed must be passed in a key with a leading
75underscore in its name.
76
77=cut
78
3265b0ce 79my $SHORTEND = 20; # Debug var - set this to limit the number of lines parsed
4ca00eca 80
81my %base_text_index;
6a222840 82my $edits_required = {};
4ca00eca 83
84# edits_required -> wit -> [ { start_idx, end_idx, items } ]
930ff666 85
b49c4318 86sub merge_base {
e2902068 87 my( $collation, $base_file, @app_entries ) = @_;
88 my @base_line_starts = read_base( $base_file, $collation );
b49c4318 89
52ce987f 90 my %all_witnesses;
6a222840 91 my @unwitnessed_lemma_nodes;
b49c4318 92 foreach my $app ( @app_entries ) {
93 my( $line, $num ) = split( /\./, $app->{_id} );
94 # DEBUG with a short graph
15d2d3df 95 last if $SHORTEND && $line > $SHORTEND;
2ceca8c3 96 # DEBUG for problematic entries
4ca00eca 97 my $scrutinize = '';
e2902068 98 my $first_line_reading = $base_line_starts[ $line ];
b49c4318 99 my $too_far = $base_line_starts[ $line+1 ];
100
101 my $lemma = $app->{rdg_0};
102 my $seq = 1;
103 # Is this the Nth occurrence of this reading in the line?
104 if( $lemma =~ s/(_)?(\d)$// ) {
105 $seq = $2;
106 }
107 my @lemma_words = split( /\s+/, $lemma );
108
109 # Now search for the lemma words within this line.
e2902068 110 my $lemma_start = $first_line_reading;
b49c4318 111 my $lemma_end;
112 my %seen;
113 while( $lemma_start ne $too_far ) {
114 # Loop detection
115 if( $seen{ $lemma_start->name() } ) {
116 warn "Detected loop at " . $lemma_start->name() .
117 ", ref $line,$num";
118 last;
119 }
120 $seen{ $lemma_start->name() } = 1;
121
122 # Try to match the lemma.
123 my $unmatch = 0;
124 print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
125 $lemma_words[0] . "...\n"
126 if "$line.$num" eq $scrutinize;
127 if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
128 # Skip it if we need a match that is not the first.
129 if( --$seq < 1 ) {
130 # Now we have to compare the rest of the words here.
131 if( scalar( @lemma_words ) > 1 ) {
e2902068 132 my $next_reading =
133 $collation->next_reading( $lemma_start );
b49c4318 134 foreach my $w ( @lemma_words[1..$#lemma_words] ) {
135 printf STDERR "Now matching %s against %s\n",
e2902068 136 cmp_str($next_reading), $w
b49c4318 137 if "$line.$num" eq $scrutinize;
e2902068 138 if( $w ne cmp_str($next_reading) ) {
b49c4318 139 $unmatch = 1;
140 last;
141 } else {
e2902068 142 $lemma_end = $next_reading;
143 $next_reading =
144 $collation->next_reading( $lemma_end );
b49c4318 145 }
146 }
147 } else {
148 $lemma_end = $lemma_start;
149 }
150 } else {
151 $unmatch = 1;
152 }
153 }
154 last unless ( $unmatch || !defined( $lemma_end ) );
155 $lemma_end = undef;
e2902068 156 $lemma_start = $collation->next_reading( $lemma_start );
b49c4318 157 }
158
159 unless( $lemma_end ) {
160 warn "No match found for @lemma_words at $line.$num";
161 next;
b49c4318 162 }
163
4ca00eca 164 # Now we have found the lemma; we will record an 'edit', in
165 # terms of a splice operation, for each subsequent reading.
166 # We also note which witnesses take the given edit.
167
6a222840 168 my @lemma_set = $collation->reading_sequence( $lemma_start,
169 $lemma_end );
4ca00eca 170 my @reading_sets = [ @lemma_set ];
171
172 # For each reading that is not rdg_0, we create the variant
173 # reading nodes, and store the range as an edit operation on
174 # the base text.
175 my $variant_objects;
6a222840 176 my %pc_seen; # Keep track of mss with explicit post-corr data
b49c4318 177 foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
b49c4318 178 my @mss = grep { $app->{$_} eq $k } keys( %$app );
1ed3973e 179
180 # Keep track of lemma nodes that don't actually appear in
181 # any MSS; we will want to remove them from the collation.
6a222840 182 push( @unwitnessed_lemma_nodes, @lemma_set )
183 if !@mss && $k eq 'rdg_0';
184
e2902068 185 # Keep track of what witnesses we have seen.
52ce987f 186 @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
4ca00eca 187 # Keep track of which witnesses bear corrected readings here.
188 foreach my $m ( @mss ) {
189 my $base = _is_post_corr( $m );
190 next unless $base;
6a222840 191 $pc_seen{$base} = 1;
4ca00eca 192 }
193 next if $k eq 'rdg_0';
194
1ed3973e 195 # Parse the variant into reading tokens.
4ca00eca 196 # TODO don't hardcode the reading split operation
197 my @variant = split( /\s+/, $app->{$k} );
198 @variant = () if $app->{$k} eq '/'; # This is an omission.
b49c4318 199
4ca00eca 200 my @variant_readings;
b49c4318 201 my $ctr = 0;
b49c4318 202 foreach my $vw ( @variant ) {
203 my $vwname = "$k/$line.$num.$ctr"; $ctr++;
e2902068 204 my $vwreading = $collation->add_reading( $vwname );
205 $vwreading->text( $vw );
4ca00eca 206 push( @variant_readings, $vwreading );
b49c4318 207 }
e49731d7 208
4ca00eca 209 $variant_objects->{$k} = { 'mss' => \@mss,
210 'reading' => \@variant_readings,
211 };
212 push( @reading_sets, \@variant_readings );
213 }
b49c4318 214
4ca00eca 215 # Now collate and collapse the identical readings within the
216 # collated sets. Modifies the reading sets that were passed.
217 collate_variants( $collation, @reading_sets );
218
1ed3973e 219 # Record any stated relationships between the nodes and the lemma.
3265b0ce 220 set_relationships( $collation, $app, \@lemma_set, $variant_objects );
15d2d3df 221
4ca00eca 222 # Now create the splice-edit objects that will be used
223 # to reconstruct each witness.
224
225 foreach my $rkey ( keys %$variant_objects ) {
226 # Object is argument list for splice, so:
227 # offset, length, replacements
c78feb69 228 my $edit_object = [ $lemma_start->name,
4ca00eca 229 scalar( @lemma_set ),
230 $variant_objects->{$rkey}->{reading} ];
231 foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
232 # Is this a p.c. entry?
233 my $base = _is_post_corr( $ms );
234 if( $base ) { # this is a post-corr witness
235 my $pc_key = $base . "_post";
236 _add_hash_entry( $edits_required, $pc_key, $edit_object );
237 } else { # this is an ante-corr witness
238 my $pc_key = $ms . "_post";
6a222840 239 _add_hash_entry( $edits_required, $ms, $edit_object );
240 unless( $pc_seen{$ms} ) {
241 # If this witness carries no correction, add this
242 # same object to its post-corrected state.
243 _add_hash_entry( $edits_required, $pc_key,
244 $edit_object );
4ca00eca 245 }
246 }
e2902068 247 }
4ca00eca 248 }
249 } # Finished going through the apparatus entries
250
251 # Now make the witness objects, and create their text sequences
6a222840 252 foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) {
15d2d3df 253 print STDERR "Creating witness $w\n";
4ca00eca 254 my $witness_obj = $collation->tradition->add_witness( sigil => $w );
3265b0ce 255 my $debug = undef; # $w eq 'Vb10';
1ed3973e 256 my ( $text_seq, $ac ) = apply_edits( $collation,
257 $edits_required->{$w},
258 $edits_required->{$w."_post"},
259 $debug );
4ca00eca 260
1ed3973e 261 my @repeated = _check_for_repeated( @$text_seq );
262 warn "Repeated elements @repeated in $w"
15d2d3df 263 if @repeated;
6a222840 264 # Now save these paths in my witness object
1ed3973e 265 $witness_obj->path( $text_seq );
266 if( $ac ) {
267 $witness_obj->uncorrected( $ac );
52ce987f 268 }
b49c4318 269 }
e2902068 270
6a222840 271 # Now remove our 'base text' edges, which is to say, the only
1ed3973e 272 # ones we have created so far. Also remove any unwitnessed
273 # lemma nodes (TODO unless we are treating base as witness)
6a222840 274 foreach ( $collation->paths() ) {
275 $collation->del_path( $_ );
276 }
277 foreach( @unwitnessed_lemma_nodes ) {
278 $collation->del_reading( $_ );
279 }
4ca00eca 280
e2902068 281 # Now walk paths and calculate positions.
282 my @common_readings =
6a222840 283 $collation->make_witness_paths();
e2902068 284 $collation->calculate_positions( @common_readings );
b49c4318 285}
286
15d2d3df 287sub _check_for_repeated {
288 my @seq = @_;
289 my %unique;
290 my @repeated;
291 foreach ( @seq ) {
292 if( exists $unique{$_->name} ) {
293 push( @repeated, $_->name );
294 } else {
295 $unique{$_->name} = 1;
296 }
297 }
298 return @repeated;
299}
300
2ceca8c3 301=item B<read_base>
302
e2902068 303my @line_beginnings = read_base( 'reference.txt', $collation );
2ceca8c3 304
e2902068 305Takes a text file and a (presumed empty) collation object, adds the
306words as simple linear readings to the collation, and returns a
307list of readings that represent the beginning of lines. This collation
308is now the starting point for application of apparatus entries in
309merge_base, e.g. from a CSV file or a Classical Text Editor file.
2ceca8c3 310
311=cut
b49c4318 312
313sub read_base {
e2902068 314 my( $base_file, $collation ) = @_;
b49c4318 315
e2902068 316 # This array gives the first reading for each line. We put the
b49c4318 317 # common starting point in line zero.
e2902068 318 my $last_reading = $collation->start();
6a222840 319 $base_text_index{$last_reading->name} = 0;
e2902068 320 my $lineref_array = [ $last_reading ]; # There is no line zero.
b49c4318 321
322 open( BASE, $base_file ) or die "Could not open file $base_file: $!";
6a222840 323 my $i = 1;
b49c4318 324 while(<BASE>) {
e2902068 325 # Make the readings, and connect them up for the base, but
326 # also save the first reading of each line in an array for the
327 # purpose.
328 # TODO use configurable reading separator
b49c4318 329 chomp;
330 my @words = split;
331 my $started = 0;
332 my $wordref = 0;
333 my $lineref = scalar @$lineref_array;
15d2d3df 334 last if $SHORTEND && $lineref > $SHORTEND;
b49c4318 335 foreach my $w ( @words ) {
e2902068 336 my $readingref = join( ',', $lineref, ++$wordref );
337 my $reading = $collation->add_reading( $readingref );
338 $reading->text( $w );
b49c4318 339 unless( $started ) {
e2902068 340 push( @$lineref_array, $reading );
b49c4318 341 $started = 1;
342 }
4ca00eca 343 # Add edge paths in the graph, for easier tracking when
344 # we start applying corrections. These paths will be
345 # removed when we're done.
346 my $path = $collation->add_path( $last_reading, $reading,
347 $collation->baselabel );
348 $last_reading = $reading;
349
350 # Note an array index for the reading, for later correction splices.
351 $base_text_index{$readingref} = $i++;
b49c4318 352 }
353 }
354 close BASE;
355 # Ending point for all texts
e2902068 356 my $endpoint = $collation->add_reading( '#END#' );
930ff666 357 $collation->add_path( $last_reading, $endpoint, $collation->baselabel );
b49c4318 358 push( @$lineref_array, $endpoint );
6a222840 359 $base_text_index{$endpoint->name} = $i;
b49c4318 360
361 return( @$lineref_array );
362}
363
e49731d7 364=item B<collate_variants>
2ceca8c3 365
4ca00eca 366collate_variants( $collation, @reading_ranges )
2ceca8c3 367
e49731d7 368Given a set of readings in the form
369( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
e2902068 370walks through each to identify those readings that are identical. The
371collation is a Text::Tradition::Collation object; the elements of
372@readings are Text::Tradition::Collation::Reading objects that appear
373on the collation graph.
b49c4318 374
2ceca8c3 375TODO: Handle collapsed and non-collapsed transpositions.
376
377=cut
b49c4318 378
e49731d7 379sub collate_variants {
4ca00eca 380 my( $collation, @reading_sets ) = @_;
4ca00eca 381
c78feb69 382 # Two different ways to do this, depending on whether we want
383 # transposed reading nodes to be merged into one (producing a
384 # nonlinear, bidirectional graph) or not (producing a relatively
385 # linear, unidirectional graph.)
386 return $collation->linear ? collate_linearly( @_ )
387 : collate_nonlinearly( @_ );
388}
4ca00eca 389
c78feb69 390sub collate_linearly {
391 my( $collation, $lemma_set, @variant_sets ) = @_;
4ca00eca 392
393 my @unique;
394 push( @unique, @$lemma_set );
c78feb69 395 while( @variant_sets ) {
396 my $variant_set = shift @variant_sets;
397 # Use diff to do this job
398 my $diff = Algorithm::Diff->new( \@unique, $variant_set,
399 {'keyGen' => \&_collation_hash} );
400 my @new_unique;
401 my %merged;
402 while( $diff->Next ) {
403 if( $diff->Same ) {
404 # merge the nodes
405 my @l = $diff->Items( 1 );
406 my @v = $diff->Items( 2 );
407 foreach my $i ( 0 .. $#l ) {
408 if( !$merged{$l[$i]->name} ) {
409 $collation->merge_readings( $l[$i], $v[$i] );
410 $merged{$l[$i]->name} = 1;
411 } else {
412 print STDERR "Would have double merged " . $l[$i]->name . "\n";
6a222840 413 }
e49731d7 414 }
c78feb69 415 # splice the lemma nodes into the variant set
416 my( $offset ) = $diff->Get( 'min2' );
417 splice( @$variant_set, $offset, scalar( @l ), @l );
418 push( @new_unique, @l );
419 } else {
420 # Keep the old unique readings
421 push( @new_unique, $diff->Items( 1 ) ) if $diff->Items( 1 );
422 # Add the new readings to the 'unique' list
423 push( @new_unique, $diff->Items( 2 ) ) if $diff->Items( 2 );
b49c4318 424 }
c78feb69 425 }
426 @unique = @new_unique;
427 }
428}
429
430sub collate_nonlinearly {
431 my( $collation, $lemma_set, @variant_sets ) = @_;
432
433 my @unique;
434 push( @unique, @$lemma_set );
435 while( @variant_sets ) {
436 my $variant_set = shift @variant_sets;
437 # Simply match the first reading that carries the same word, so
438 # long as that reading has not yet been used to match another
439 # word in this variant. That way lies loopy madness.
440 my @distinct;
441 my %merged;
442 foreach my $idx ( 0 .. $#{$variant_set} ) {
443 my $vw = $variant_set->[$idx];
444 my @same = grep { cmp_str( $_ ) eq $vw->label } @unique;
445 my $matched;
446 if( @same ) {
447 foreach my $i ( 0 .. $#same ) {
448 unless( $merged{$same[$i]->name} ) {
449 print STDERR sprintf( "Merging %s into %s\n",
450 $vw->name,
451 $same[$i]->name );
452 $collation->merge_readings( $same[$i], $vw );
453 $merged{$same[$i]->name} = 1;
454 $matched = $i;
455 $variant_set->[$idx] = $same[$i];
15d2d3df 456 }
457 }
6a222840 458 }
c78feb69 459 unless( @same && defined($matched) ) {
460 push( @distinct, $vw );
461 }
b49c4318 462 }
c78feb69 463 push( @unique, @distinct );
b49c4318 464 }
4ca00eca 465}
2ceca8c3 466
c78feb69 467
4ca00eca 468
469sub _collation_hash {
470 my $node = shift;
6a222840 471 return cmp_str( $node );
4ca00eca 472}
2ceca8c3 473
15d2d3df 474sub set_relationships {
3265b0ce 475 my( $collation, $app, $lemma, $variants ) = @_;
15d2d3df 476 foreach my $rkey ( keys %$variants ) {
477 my $var = $variants->{$rkey}->{'reading'};
478 my $typekey = sprintf( "_%s_type", $rkey );
479 my $type = $app->{$typekey};
480
051ddba3 481 if( $type =~ /^(inv|tr|rep)$/i ) {
482 # Transposition or repetition: look for nodes with the
483 # same label but different IDs and mark them.
484 $type = 'repetition' if $type =~ /^rep/i;
3265b0ce 485 my %labels;
486 foreach my $r ( @$lemma ) {
051ddba3 487 $labels{cmp_str( $r )} = $r;
3265b0ce 488 }
489 foreach my $r( @$var ) {
490 if( exists $labels{$r->label} &&
491 $r->name ne $labels{$r->label}->name ) {
051ddba3 492 if( $type eq 'repetition' ) {
493 # Repetition
494 $collation->add_relationship( $type, $r, $labels{$r->label} );
495 } else {
496 # Transposition
497 $r->set_identical( $labels{$r->label} );
498 }
3265b0ce 499 }
500 }
051ddba3 501 } elsif( $type =~ /^(gr|sp(el)?)$/i ) {
3265b0ce 502 # Grammar/spelling: this can be a one-to-one or one-to-many
503 # mapping. We should think about merging readings if it is
504 # one-to-many.
505 $type = 'grammatical' if $type =~ /gr/i;
506 $type = 'spelling' if $type =~ /sp/i;
507 $type = 'repetition' if $type =~ /rep/i;
508 if( @$lemma == @$var ) {
509 foreach my $i ( 0 .. $#{$lemma} ) {
510 $collation->add_relationship( $type, $var->[$i],
511 $lemma->[$i] );
512 }
513 } elsif ( @$lemma > @$var && @$var == 1 ) {
514 # Merge the lemma readings into one
051ddba3 515 ## TODO This is a bad solution. We need a real one-to-many
516 ## mapping.
3265b0ce 517 my $ln1 = shift @$lemma;
518 foreach my $ln ( @$lemma ) {
519 $collation->merge_readings( $ln1, $ln, ' ' );
520 }
521 $lemma = [ $ln1 ];
522 $collation->add_relationship( $type, $var->[0], $lemma->[0] );
523 } elsif ( @$lemma < @$var && @$lemma == 1 ) {
524 my $vn1 = shift @$var;
525 foreach my $vn ( @$var ) {
526 $collation->merge_readings( $vn1, $vn, ' ' );
527 }
528 $var = [ $vn1 ];
529 $collation->add_relationship( $type, $var->[0], $lemma->[0] );
530 } else {
531 warn "Cannot set $type relationship on a many-to-many variant";
532 }
533 } elsif( $type !~ /^(lex|add|om)$/i ) {
534 warn "Unrecognized type $type";
535 }
15d2d3df 536 }
537}
538
539
540
4ca00eca 541sub apply_edits {
1ed3973e 542 my( $collation, $edit_sequence, $corrected_edit_sequence, $debug ) = @_;
543
544 # Index the ante- and post-correctione edits that we have, so that
545 # for each spot in the text we can apply the original witness
546 # state and then apply its corrected state, if applicable.
547 my $all_edits = {};
548 foreach my $c ( @$edit_sequence ) {
549 my $lemma_index = $base_text_index{$c->[0]};
550 $all_edits->{$lemma_index}->{'ac'} = $c;
551 # If the text carries no corrections, pc == ac.
552 $all_edits->{$lemma_index}->{'pc'} = $c
553 unless $corrected_edit_sequence;
554 }
555 foreach my $c ( @$corrected_edit_sequence ) {
556 my $lemma_index = $base_text_index{$c->[0]};
557 $all_edits->{$lemma_index}->{'pc'} = $c;
558 }
559
c78feb69 560 my @lemma_text = $collation->reading_sequence( $collation->start,
561 $collation->reading( '#END#' ) );
4ca00eca 562 my $drift = 0;
1ed3973e 563 my @ac_sequence;
564 foreach my $lemma_index ( sort keys %$all_edits ) {
565 my $ac = $all_edits->{$lemma_index}->{'ac'};
566 my $pc = $all_edits->{$lemma_index}->{'pc'};
567 my $realoffset = $lemma_index + $drift;
568 if( $ac && $pc && $ac eq $pc ) {
569 # No correction, just apply the edit
570 my( $lemma_start, $length, $items ) = @$pc;
571 splice( @lemma_text, $realoffset, $length, @$items );
572 $drift += @$items + $length;
573 } elsif ( !$pc ) {
574 # Lemma text is unaltered, save a.c. as an 'uncorrection'
575 my( $lemma_start, $length, $items ) = @$ac;
576 push( @ac_sequence, [ $realoffset, $length, $items ] );
577 } elsif ( !$ac ) {
578 # Apply the edit, save lemma text as an 'uncorrection'
579 my( $lemma_start, $length, $items ) = @$pc;
580 my @old = splice( @lemma_text, $realoffset, $length, @$items );
581 $drift += @$items + $length;
582 push( @ac_sequence, [ $realoffset, scalar( @$items ), \@old ] );
583 } else {
584 # Apply the p.c. edit, then save the a.c. edit as an
585 # 'uncorrection' on the p.c. text
586 my( $lemma_start, $length, $items ) = @$pc;
587 my @old = splice( @lemma_text, $realoffset, $length, @$items );
588 $drift += @$items + $length;
589 push( @ac_sequence, [ $realoffset, scalar( @$items ), \@old ] );
c78feb69 590 }
b49c4318 591 }
1ed3973e 592 return( \@lemma_text, \@ac_sequence );
b49c4318 593}
1ed3973e 594
595# sub _apply_sequence_splice {
596# my( $collation, $sequence, $correction
597
4ca00eca 598
e2902068 599# Helper function. Given a witness sigil, if it is a post-correctione
600# sigil,return the base witness. If not, return a false value.
601sub _is_post_corr {
602 my( $sigil ) = @_;
930ff666 603 if( $sigil =~ /^(.*?)(\s*\(?p\.\s*c\.\)?)$/ ) {
e2902068 604 return $1;
605 }
606 return undef;
607}
608
6a222840 609sub _add_hash_entry {
610 my( $hash, $key, $entry ) = @_;
611 if( exists $hash->{$key} ) {
612 push( @{$hash->{$key}}, $entry );
613 } else {
614 $hash->{$key} = [ $entry ];
615 }
616}
617
618
2ceca8c3 619=item B<cmp_str>
620
621Pretend you never saw this method. Really it needs to not be hardcoded.
622
623=cut
624
b49c4318 625sub cmp_str {
e2902068 626 my( $reading ) = @_;
627 my $word = $reading->label();
b49c4318 628 $word = lc( $word );
629 $word =~ s/\W//g;
630 $word =~ s/v/u/g;
631 $word =~ s/j/i/g;
632 $word =~ s/cha/ca/g;
633 $word =~ s/quatuor/quattuor/g;
634 $word =~ s/ioannes/iohannes/g;
635 return $word;
636}
637
2ceca8c3 638=back
639
640=head1 LICENSE
641
642This package is free software and is provided "as is" without express
643or implied warranty. You can redistribute it and/or modify it under
644the same terms as Perl itself.
645
646=head1 AUTHOR
647
648Tara L Andrews, aurum@cpan.org
649
650=cut
651
b49c4318 6521;