Commit | Line | Data |
e58153d6 |
1 | package Text::Tradition::Parser::BaseText; |
b49c4318 |
2 | |
3 | use strict; |
4 | use warnings; |
52ce987f |
5 | use Module::Load; |
4ca00eca |
6 | use Algorithm::Diff; |
b49c4318 |
7 | |
2ceca8c3 |
8 | =head1 NAME |
9 | |
10 | Text::Tradition::Parser::BaseText |
11 | |
12 | =head1 SYNOPSIS |
13 | |
14 | use Text::Tradition::Parser::BaseText qw( merge_base ); |
15 | merge_base( $graph, 'reference.txt', @apparatus_entries ) |
16 | |
17 | =head1 DESCRIPTION |
18 | |
19 | For an overview of the package, see the documentation for the |
20 | Text::Tradition::Graph module. |
21 | |
22 | This module is meant for use with certain of the other Parser classes |
23 | - whenever a list of variants is given with reference to a base text, |
24 | these must be joined into a single collation. The parser should |
25 | therefore make a list of variants and their locations, and BaseText |
26 | will join those listed variants onto the reference text. |
27 | |
28 | =head1 SUBROUTINES |
29 | |
30 | =over |
31 | |
52ce987f |
32 | =item B<parse> |
33 | |
34 | parse( $graph, %opts ); |
35 | |
36 | Takes an initialized graph and a set of options, which must include: |
37 | - 'base' - the base text referenced by the variants |
38 | - 'format' - the format of the variant list |
39 | - 'data' - the variants, in the given format. |
40 | |
41 | =cut |
42 | |
43 | sub parse { |
e2902068 |
44 | my( $tradition, %opts ) = @_; |
52ce987f |
45 | |
46 | my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'}; |
47 | load( $format_mod ); |
48 | my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} ); |
e2902068 |
49 | merge_base( $tradition->collation, $opts{'base'}, @apparatus_entries ); |
52ce987f |
50 | } |
51 | |
2ceca8c3 |
52 | =item B<merge_base> |
53 | |
54 | merge_base( $graph, 'reference.txt', @apparatus_entries ) |
55 | |
56 | Takes three arguments: a newly-initialized Text::Tradition::Graph |
57 | object, a text file containing the reference text, and a list of |
58 | variants (apparatus entries). Adds the base text to the graph, and |
59 | joins the variants to that. |
60 | |
61 | The list of variants is an array of hash references; each hash takes |
62 | the form |
63 | { '_id' => line reference, |
64 | 'rdg_0' => lemma reading, |
65 | 'rdg_1' => first variant, |
66 | ... # and so on until all distinct readings are listed |
67 | 'WitnessA' => 'rdg_0', |
68 | 'WitnessB' => 'rdg_1', |
69 | ... # and so on until all witnesses are listed with their readings |
70 | } |
71 | |
72 | Any hash key that is not of the form /^rdg_\d+$/ and that does not |
73 | begin with an underscore is assumed to be a witness name. Any 'meta' |
74 | information to be passed must be passed in a key with a leading |
75 | underscore in its name. |
76 | |
77 | =cut |
78 | |
b15511bf |
79 | my $SHORTEND = ''; # Debug var - set this to limit the number of lines parsed |
4ca00eca |
80 | |
81 | my %base_text_index; |
6a222840 |
82 | my $edits_required = {}; |
4ca00eca |
83 | |
84 | # edits_required -> wit -> [ { start_idx, end_idx, items } ] |
930ff666 |
85 | |
b49c4318 |
86 | sub merge_base { |
e2902068 |
87 | my( $collation, $base_file, @app_entries ) = @_; |
88 | my @base_line_starts = read_base( $base_file, $collation ); |
b49c4318 |
89 | |
52ce987f |
90 | my %all_witnesses; |
6a222840 |
91 | my @unwitnessed_lemma_nodes; |
b49c4318 |
92 | foreach my $app ( @app_entries ) { |
93 | my( $line, $num ) = split( /\./, $app->{_id} ); |
94 | # DEBUG with a short graph |
15d2d3df |
95 | last if $SHORTEND && $line > $SHORTEND; |
2ceca8c3 |
96 | # DEBUG for problematic entries |
4ca00eca |
97 | my $scrutinize = ''; |
e2902068 |
98 | my $first_line_reading = $base_line_starts[ $line ]; |
b49c4318 |
99 | my $too_far = $base_line_starts[ $line+1 ]; |
100 | |
101 | my $lemma = $app->{rdg_0}; |
102 | my $seq = 1; |
103 | # Is this the Nth occurrence of this reading in the line? |
104 | if( $lemma =~ s/(_)?(\d)$// ) { |
105 | $seq = $2; |
106 | } |
107 | my @lemma_words = split( /\s+/, $lemma ); |
108 | |
109 | # Now search for the lemma words within this line. |
e2902068 |
110 | my $lemma_start = $first_line_reading; |
b49c4318 |
111 | my $lemma_end; |
112 | my %seen; |
113 | while( $lemma_start ne $too_far ) { |
114 | # Loop detection |
115 | if( $seen{ $lemma_start->name() } ) { |
116 | warn "Detected loop at " . $lemma_start->name() . |
117 | ", ref $line,$num"; |
118 | last; |
119 | } |
120 | $seen{ $lemma_start->name() } = 1; |
121 | |
122 | # Try to match the lemma. |
123 | my $unmatch = 0; |
124 | print STDERR "Matching " . cmp_str( $lemma_start) . " against " . |
125 | $lemma_words[0] . "...\n" |
126 | if "$line.$num" eq $scrutinize; |
127 | if( cmp_str( $lemma_start ) eq $lemma_words[0] ) { |
128 | # Skip it if we need a match that is not the first. |
129 | if( --$seq < 1 ) { |
130 | # Now we have to compare the rest of the words here. |
131 | if( scalar( @lemma_words ) > 1 ) { |
e2902068 |
132 | my $next_reading = |
133 | $collation->next_reading( $lemma_start ); |
b49c4318 |
134 | foreach my $w ( @lemma_words[1..$#lemma_words] ) { |
135 | printf STDERR "Now matching %s against %s\n", |
e2902068 |
136 | cmp_str($next_reading), $w |
b49c4318 |
137 | if "$line.$num" eq $scrutinize; |
e2902068 |
138 | if( $w ne cmp_str($next_reading) ) { |
b49c4318 |
139 | $unmatch = 1; |
140 | last; |
141 | } else { |
e2902068 |
142 | $lemma_end = $next_reading; |
143 | $next_reading = |
144 | $collation->next_reading( $lemma_end ); |
b49c4318 |
145 | } |
146 | } |
147 | } else { |
148 | $lemma_end = $lemma_start; |
149 | } |
150 | } else { |
151 | $unmatch = 1; |
152 | } |
153 | } |
154 | last unless ( $unmatch || !defined( $lemma_end ) ); |
155 | $lemma_end = undef; |
e2902068 |
156 | $lemma_start = $collation->next_reading( $lemma_start ); |
b49c4318 |
157 | } |
158 | |
159 | unless( $lemma_end ) { |
160 | warn "No match found for @lemma_words at $line.$num"; |
161 | next; |
b49c4318 |
162 | } |
163 | |
4ca00eca |
164 | # Now we have found the lemma; we will record an 'edit', in |
165 | # terms of a splice operation, for each subsequent reading. |
166 | # We also note which witnesses take the given edit. |
167 | |
6a222840 |
168 | my @lemma_set = $collation->reading_sequence( $lemma_start, |
169 | $lemma_end ); |
4ca00eca |
170 | my @reading_sets = [ @lemma_set ]; |
171 | |
172 | # For each reading that is not rdg_0, we create the variant |
173 | # reading nodes, and store the range as an edit operation on |
174 | # the base text. |
175 | my $variant_objects; |
6a222840 |
176 | my %pc_seen; # Keep track of mss with explicit post-corr data |
b49c4318 |
177 | foreach my $k ( grep { /^rdg/ } keys( %$app ) ) { |
b49c4318 |
178 | my @mss = grep { $app->{$_} eq $k } keys( %$app ); |
1ed3973e |
179 | |
180 | # Keep track of lemma nodes that don't actually appear in |
181 | # any MSS; we will want to remove them from the collation. |
6a222840 |
182 | push( @unwitnessed_lemma_nodes, @lemma_set ) |
183 | if !@mss && $k eq 'rdg_0'; |
184 | |
e2902068 |
185 | # Keep track of what witnesses we have seen. |
52ce987f |
186 | @all_witnesses{ @mss } = ( 1 ) x scalar( @mss ); |
4ca00eca |
187 | # Keep track of which witnesses bear corrected readings here. |
188 | foreach my $m ( @mss ) { |
189 | my $base = _is_post_corr( $m ); |
190 | next unless $base; |
6a222840 |
191 | $pc_seen{$base} = 1; |
4ca00eca |
192 | } |
193 | next if $k eq 'rdg_0'; |
194 | |
1ed3973e |
195 | # Parse the variant into reading tokens. |
4ca00eca |
196 | # TODO don't hardcode the reading split operation |
197 | my @variant = split( /\s+/, $app->{$k} ); |
198 | @variant = () if $app->{$k} eq '/'; # This is an omission. |
b49c4318 |
199 | |
4ca00eca |
200 | my @variant_readings; |
b49c4318 |
201 | my $ctr = 0; |
b49c4318 |
202 | foreach my $vw ( @variant ) { |
203 | my $vwname = "$k/$line.$num.$ctr"; $ctr++; |
e2902068 |
204 | my $vwreading = $collation->add_reading( $vwname ); |
205 | $vwreading->text( $vw ); |
4ca00eca |
206 | push( @variant_readings, $vwreading ); |
b49c4318 |
207 | } |
e49731d7 |
208 | |
4ca00eca |
209 | $variant_objects->{$k} = { 'mss' => \@mss, |
210 | 'reading' => \@variant_readings, |
211 | }; |
212 | push( @reading_sets, \@variant_readings ); |
213 | } |
b49c4318 |
214 | |
4ca00eca |
215 | # Now collate and collapse the identical readings within the |
216 | # collated sets. Modifies the reading sets that were passed. |
217 | collate_variants( $collation, @reading_sets ); |
218 | |
1ed3973e |
219 | # Record any stated relationships between the nodes and the lemma. |
3265b0ce |
220 | set_relationships( $collation, $app, \@lemma_set, $variant_objects ); |
15d2d3df |
221 | |
4ca00eca |
222 | # Now create the splice-edit objects that will be used |
223 | # to reconstruct each witness. |
224 | |
225 | foreach my $rkey ( keys %$variant_objects ) { |
226 | # Object is argument list for splice, so: |
227 | # offset, length, replacements |
c78feb69 |
228 | my $edit_object = [ $lemma_start->name, |
4ca00eca |
229 | scalar( @lemma_set ), |
230 | $variant_objects->{$rkey}->{reading} ]; |
231 | foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) { |
232 | # Is this a p.c. entry? |
233 | my $base = _is_post_corr( $ms ); |
234 | if( $base ) { # this is a post-corr witness |
235 | my $pc_key = $base . "_post"; |
236 | _add_hash_entry( $edits_required, $pc_key, $edit_object ); |
237 | } else { # this is an ante-corr witness |
238 | my $pc_key = $ms . "_post"; |
6a222840 |
239 | _add_hash_entry( $edits_required, $ms, $edit_object ); |
240 | unless( $pc_seen{$ms} ) { |
241 | # If this witness carries no correction, add this |
242 | # same object to its post-corrected state. |
243 | _add_hash_entry( $edits_required, $pc_key, |
244 | $edit_object ); |
4ca00eca |
245 | } |
246 | } |
e2902068 |
247 | } |
4ca00eca |
248 | } |
249 | } # Finished going through the apparatus entries |
250 | |
251 | # Now make the witness objects, and create their text sequences |
6a222840 |
252 | foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) { |
15d2d3df |
253 | print STDERR "Creating witness $w\n"; |
4ca00eca |
254 | my $witness_obj = $collation->tradition->add_witness( sigil => $w ); |
b15511bf |
255 | my $debug; # = $w eq 'Vb11'; |
256 | my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w}, $debug ); |
257 | my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"}, $debug ) |
258 | if exists( $edits_required->{$w."_post"} ); |
259 | |
260 | my @repeated = _check_for_repeated( @ante_corr_seq ); |
261 | warn "Repeated elements @repeated in $w a.c." |
262 | if @repeated; |
263 | @repeated = _check_for_repeated( @post_corr_seq ); |
264 | warn "Repeated elements @repeated in $w p.c." |
15d2d3df |
265 | if @repeated; |
b15511bf |
266 | |
6a222840 |
267 | # Now save these paths in my witness object |
b15511bf |
268 | if( @post_corr_seq ) { |
269 | $witness_obj->path( \@post_corr_seq ); |
270 | $witness_obj->uncorrected_path( \@ante_corr_seq ); |
271 | } else { |
272 | $witness_obj->path( \@ante_corr_seq ); |
52ce987f |
273 | } |
b49c4318 |
274 | } |
e2902068 |
275 | |
6a222840 |
276 | # Now remove our 'base text' edges, which is to say, the only |
1ed3973e |
277 | # ones we have created so far. Also remove any unwitnessed |
278 | # lemma nodes (TODO unless we are treating base as witness) |
6a222840 |
279 | foreach ( $collation->paths() ) { |
280 | $collation->del_path( $_ ); |
281 | } |
282 | foreach( @unwitnessed_lemma_nodes ) { |
283 | $collation->del_reading( $_ ); |
284 | } |
4ca00eca |
285 | |
b15511bf |
286 | ### HACKY HACKY Do some one-off path corrections here. |
287 | if( $collation->linear ) { |
4cdd82f1 |
288 | my $c = $collation; |
289 | my $end = $SHORTEND ? $SHORTEND : 155; |
290 | my $path = $c->tradition->witness('Vb11')->path; |
291 | if( $end > 16 ) { |
292 | $c->merge_readings( $c->reading('rdg_1/16.3.0'), $c->reading('rdg_1/16.2.1') ); |
293 | splice( @$path, 209, 2, $c->reading( 'rdg_1/16.3.0' ), $c->reading( 'rdg_1/16.2.2' ) ); |
294 | } |
295 | # What else? |
b15511bf |
296 | } else { |
297 | my $c = $collation; |
4cdd82f1 |
298 | my $end = $SHORTEND ? $SHORTEND : 155; |
b15511bf |
299 | # Vb5: |
300 | my $path = $c->tradition->witness('Vb5')->path; |
4cdd82f1 |
301 | splice( @$path, 1436, 0, $c->reading('106,14') ) if $end > 106; |
b15511bf |
302 | # Vb11: |
303 | $path = $c->tradition->witness('Vb11')->path; |
4cdd82f1 |
304 | if( $end > 16 ) { |
305 | $c->merge_readings( $c->reading('rdg_1/16.3.0'), $c->reading('rdg_1/16.2.1') ); |
306 | splice( @$path, 209, 2, $c->reading( 'rdg_1/16.3.0' ), $c->reading( '16,1' ) ); |
307 | } |
b15511bf |
308 | # Vb12 a.c.: |
309 | $path = $c->tradition->witness('Vb12')->uncorrected_path; |
4cdd82f1 |
310 | splice( @$path, 1828, 1, $c->reading('rdg_2/137.5.0') ) if $end > 137; |
b15511bf |
311 | # Vb13: |
312 | $path = $c->tradition->witness('Vb13')->path; |
4cdd82f1 |
313 | splice( @$path, 782, 0, $c->reading( '58,5' ) ) if $end > 58; |
b15511bf |
314 | # Vb20 a.c.: |
315 | $path = $c->tradition->witness('Vb20')->uncorrected_path; |
4cdd82f1 |
316 | splice( @$path, 1251, 1, $c->reading( '94,6' ) ) if $end > 94; |
b15511bf |
317 | # Vb26: |
318 | $path = $c->tradition->witness('Vb26')->path; |
4cdd82f1 |
319 | splice( @$path, 618, 0, $c->reading('46,2') ) if $end > 46; |
b15511bf |
320 | } |
321 | |
e2902068 |
322 | # Now walk paths and calculate positions. |
323 | my @common_readings = |
6a222840 |
324 | $collation->make_witness_paths(); |
e2902068 |
325 | $collation->calculate_positions( @common_readings ); |
b49c4318 |
326 | } |
327 | |
15d2d3df |
328 | sub _check_for_repeated { |
329 | my @seq = @_; |
330 | my %unique; |
331 | my @repeated; |
332 | foreach ( @seq ) { |
333 | if( exists $unique{$_->name} ) { |
334 | push( @repeated, $_->name ); |
335 | } else { |
336 | $unique{$_->name} = 1; |
337 | } |
338 | } |
339 | return @repeated; |
340 | } |
341 | |
2ceca8c3 |
342 | =item B<read_base> |
343 | |
e2902068 |
344 | my @line_beginnings = read_base( 'reference.txt', $collation ); |
2ceca8c3 |
345 | |
e2902068 |
346 | Takes a text file and a (presumed empty) collation object, adds the |
347 | words as simple linear readings to the collation, and returns a |
348 | list of readings that represent the beginning of lines. This collation |
349 | is now the starting point for application of apparatus entries in |
350 | merge_base, e.g. from a CSV file or a Classical Text Editor file. |
2ceca8c3 |
351 | |
352 | =cut |
b49c4318 |
353 | |
354 | sub read_base { |
e2902068 |
355 | my( $base_file, $collation ) = @_; |
b49c4318 |
356 | |
e2902068 |
357 | # This array gives the first reading for each line. We put the |
b49c4318 |
358 | # common starting point in line zero. |
e2902068 |
359 | my $last_reading = $collation->start(); |
6a222840 |
360 | $base_text_index{$last_reading->name} = 0; |
e2902068 |
361 | my $lineref_array = [ $last_reading ]; # There is no line zero. |
b49c4318 |
362 | |
363 | open( BASE, $base_file ) or die "Could not open file $base_file: $!"; |
6a222840 |
364 | my $i = 1; |
b49c4318 |
365 | while(<BASE>) { |
e2902068 |
366 | # Make the readings, and connect them up for the base, but |
367 | # also save the first reading of each line in an array for the |
368 | # purpose. |
369 | # TODO use configurable reading separator |
b49c4318 |
370 | chomp; |
371 | my @words = split; |
372 | my $started = 0; |
373 | my $wordref = 0; |
374 | my $lineref = scalar @$lineref_array; |
15d2d3df |
375 | last if $SHORTEND && $lineref > $SHORTEND; |
b49c4318 |
376 | foreach my $w ( @words ) { |
e2902068 |
377 | my $readingref = join( ',', $lineref, ++$wordref ); |
378 | my $reading = $collation->add_reading( $readingref ); |
379 | $reading->text( $w ); |
b49c4318 |
380 | unless( $started ) { |
e2902068 |
381 | push( @$lineref_array, $reading ); |
b49c4318 |
382 | $started = 1; |
383 | } |
4ca00eca |
384 | # Add edge paths in the graph, for easier tracking when |
385 | # we start applying corrections. These paths will be |
386 | # removed when we're done. |
387 | my $path = $collation->add_path( $last_reading, $reading, |
388 | $collation->baselabel ); |
389 | $last_reading = $reading; |
390 | |
391 | # Note an array index for the reading, for later correction splices. |
392 | $base_text_index{$readingref} = $i++; |
b49c4318 |
393 | } |
394 | } |
395 | close BASE; |
396 | # Ending point for all texts |
e2902068 |
397 | my $endpoint = $collation->add_reading( '#END#' ); |
930ff666 |
398 | $collation->add_path( $last_reading, $endpoint, $collation->baselabel ); |
b49c4318 |
399 | push( @$lineref_array, $endpoint ); |
6a222840 |
400 | $base_text_index{$endpoint->name} = $i; |
b49c4318 |
401 | |
402 | return( @$lineref_array ); |
403 | } |
404 | |
e49731d7 |
405 | =item B<collate_variants> |
2ceca8c3 |
406 | |
4ca00eca |
407 | collate_variants( $collation, @reading_ranges ) |
2ceca8c3 |
408 | |
e49731d7 |
409 | Given a set of readings in the form |
410 | ( lemma_start, lemma_end, rdg1_start, rdg1_end, ... ) |
e2902068 |
411 | walks through each to identify those readings that are identical. The |
412 | collation is a Text::Tradition::Collation object; the elements of |
413 | @readings are Text::Tradition::Collation::Reading objects that appear |
414 | on the collation graph. |
b49c4318 |
415 | |
2ceca8c3 |
416 | TODO: Handle collapsed and non-collapsed transpositions. |
417 | |
418 | =cut |
b49c4318 |
419 | |
e49731d7 |
420 | sub collate_variants { |
4ca00eca |
421 | my( $collation, @reading_sets ) = @_; |
4ca00eca |
422 | |
c78feb69 |
423 | # Two different ways to do this, depending on whether we want |
424 | # transposed reading nodes to be merged into one (producing a |
425 | # nonlinear, bidirectional graph) or not (producing a relatively |
426 | # linear, unidirectional graph.) |
427 | return $collation->linear ? collate_linearly( @_ ) |
428 | : collate_nonlinearly( @_ ); |
429 | } |
4ca00eca |
430 | |
c78feb69 |
431 | sub collate_linearly { |
432 | my( $collation, $lemma_set, @variant_sets ) = @_; |
4ca00eca |
433 | |
434 | my @unique; |
435 | push( @unique, @$lemma_set ); |
c78feb69 |
436 | while( @variant_sets ) { |
437 | my $variant_set = shift @variant_sets; |
438 | # Use diff to do this job |
439 | my $diff = Algorithm::Diff->new( \@unique, $variant_set, |
440 | {'keyGen' => \&_collation_hash} ); |
441 | my @new_unique; |
442 | my %merged; |
443 | while( $diff->Next ) { |
444 | if( $diff->Same ) { |
445 | # merge the nodes |
446 | my @l = $diff->Items( 1 ); |
447 | my @v = $diff->Items( 2 ); |
448 | foreach my $i ( 0 .. $#l ) { |
449 | if( !$merged{$l[$i]->name} ) { |
450 | $collation->merge_readings( $l[$i], $v[$i] ); |
451 | $merged{$l[$i]->name} = 1; |
452 | } else { |
453 | print STDERR "Would have double merged " . $l[$i]->name . "\n"; |
6a222840 |
454 | } |
e49731d7 |
455 | } |
c78feb69 |
456 | # splice the lemma nodes into the variant set |
457 | my( $offset ) = $diff->Get( 'min2' ); |
458 | splice( @$variant_set, $offset, scalar( @l ), @l ); |
459 | push( @new_unique, @l ); |
460 | } else { |
461 | # Keep the old unique readings |
462 | push( @new_unique, $diff->Items( 1 ) ) if $diff->Items( 1 ); |
463 | # Add the new readings to the 'unique' list |
464 | push( @new_unique, $diff->Items( 2 ) ) if $diff->Items( 2 ); |
b49c4318 |
465 | } |
c78feb69 |
466 | } |
467 | @unique = @new_unique; |
468 | } |
469 | } |
470 | |
471 | sub collate_nonlinearly { |
472 | my( $collation, $lemma_set, @variant_sets ) = @_; |
473 | |
474 | my @unique; |
475 | push( @unique, @$lemma_set ); |
476 | while( @variant_sets ) { |
477 | my $variant_set = shift @variant_sets; |
478 | # Simply match the first reading that carries the same word, so |
479 | # long as that reading has not yet been used to match another |
480 | # word in this variant. That way lies loopy madness. |
481 | my @distinct; |
482 | my %merged; |
483 | foreach my $idx ( 0 .. $#{$variant_set} ) { |
484 | my $vw = $variant_set->[$idx]; |
485 | my @same = grep { cmp_str( $_ ) eq $vw->label } @unique; |
486 | my $matched; |
487 | if( @same ) { |
488 | foreach my $i ( 0 .. $#same ) { |
489 | unless( $merged{$same[$i]->name} ) { |
b15511bf |
490 | #print STDERR sprintf( "Merging %s into %s\n", |
491 | # $vw->name, |
492 | # $same[$i]->name ); |
c78feb69 |
493 | $collation->merge_readings( $same[$i], $vw ); |
494 | $merged{$same[$i]->name} = 1; |
495 | $matched = $i; |
496 | $variant_set->[$idx] = $same[$i]; |
15d2d3df |
497 | } |
498 | } |
6a222840 |
499 | } |
c78feb69 |
500 | unless( @same && defined($matched) ) { |
501 | push( @distinct, $vw ); |
502 | } |
b49c4318 |
503 | } |
c78feb69 |
504 | push( @unique, @distinct ); |
b49c4318 |
505 | } |
4ca00eca |
506 | } |
2ceca8c3 |
507 | |
c78feb69 |
508 | |
4ca00eca |
509 | |
510 | sub _collation_hash { |
511 | my $node = shift; |
6a222840 |
512 | return cmp_str( $node ); |
4ca00eca |
513 | } |
2ceca8c3 |
514 | |
15d2d3df |
515 | sub set_relationships { |
3265b0ce |
516 | my( $collation, $app, $lemma, $variants ) = @_; |
15d2d3df |
517 | foreach my $rkey ( keys %$variants ) { |
518 | my $var = $variants->{$rkey}->{'reading'}; |
b15511bf |
519 | my $type = $app->{sprintf( "_%s_type", $rkey )}; |
520 | my $noncorr = $app->{sprintf( "_%s_non_corr", $rkey )}; |
521 | my $nonindep = $app->{sprintf( "_%s_non_indep", $rkey )}; |
522 | |
523 | my %rel_options = (); |
524 | $rel_options{'non_correctable'} = $noncorr if $noncorr && $noncorr =~ /^\d$/; |
525 | $rel_options{'non_indep'} = $nonindep if $nonindep && $nonindep =~ /^\d$/; |
15d2d3df |
526 | |
051ddba3 |
527 | if( $type =~ /^(inv|tr|rep)$/i ) { |
528 | # Transposition or repetition: look for nodes with the |
529 | # same label but different IDs and mark them. |
530 | $type = 'repetition' if $type =~ /^rep/i; |
4cdd82f1 |
531 | $rel_options{'type'} = $type; |
3265b0ce |
532 | my %labels; |
533 | foreach my $r ( @$lemma ) { |
051ddba3 |
534 | $labels{cmp_str( $r )} = $r; |
3265b0ce |
535 | } |
536 | foreach my $r( @$var ) { |
537 | if( exists $labels{$r->label} && |
538 | $r->name ne $labels{$r->label}->name ) { |
051ddba3 |
539 | if( $type eq 'repetition' ) { |
540 | # Repetition |
b15511bf |
541 | $collation->add_relationship( $r, $labels{$r->label}, \%rel_options ); |
051ddba3 |
542 | } else { |
543 | # Transposition |
544 | $r->set_identical( $labels{$r->label} ); |
545 | } |
3265b0ce |
546 | } |
547 | } |
b15511bf |
548 | } elsif( $type =~ /^(gr|lex|sp(el)?)$/i ) { |
549 | |
550 | # Grammar/spelling/lexical: this can be a one-to-one or |
551 | # one-to-many mapping. We should think about merging |
552 | # readings if it is one-to-many. |
553 | |
3265b0ce |
554 | $type = 'grammatical' if $type =~ /gr/i; |
555 | $type = 'spelling' if $type =~ /sp/i; |
556 | $type = 'repetition' if $type =~ /rep/i; |
b15511bf |
557 | $type = 'lexical' if $type =~ /lex/i; |
4cdd82f1 |
558 | $rel_options{'type'} = $type; |
3265b0ce |
559 | if( @$lemma == @$var ) { |
560 | foreach my $i ( 0 .. $#{$lemma} ) { |
b15511bf |
561 | $collation->add_relationship( $var->[$i], $lemma->[$i], |
562 | \%rel_options ); |
563 | } |
3265b0ce |
564 | } else { |
b15511bf |
565 | # An uneven many-to-many mapping. Make a segment out of |
566 | # whatever we have. |
567 | my $lemseg = @$lemma > 1 ? $collation->add_segment( @$lemma ) : $lemma->[0]; |
568 | my $varseg = @$var > 1 ? $collation->add_segment( @$var ) : $var->[0]; |
569 | $collation->add_relationship( $varseg, $lemseg, \%rel_options ); |
3265b0ce |
570 | } |
b15511bf |
571 | } elsif( $type !~ /^(add|om)$/i ) { |
3265b0ce |
572 | warn "Unrecognized type $type"; |
573 | } |
15d2d3df |
574 | } |
575 | } |
576 | |
577 | |
578 | |
4ca00eca |
579 | sub apply_edits { |
b15511bf |
580 | my( $collation, $edit_sequence, $debug ) = @_; |
c78feb69 |
581 | my @lemma_text = $collation->reading_sequence( $collation->start, |
582 | $collation->reading( '#END#' ) ); |
4ca00eca |
583 | my $drift = 0; |
b15511bf |
584 | foreach my $correction ( @$edit_sequence ) { |
585 | my( $lemma_start, $length, $items ) = @$correction; |
586 | my $offset = $base_text_index{$lemma_start}; |
587 | my $realoffset = $offset + $drift; |
588 | if( $debug || |
589 | $lemma_text[$realoffset]->name ne $lemma_start ) { |
590 | my @this_phrase = @lemma_text[$realoffset..$realoffset+$length-1]; |
591 | my @base_phrase; |
592 | my $i = $realoffset; |
593 | my $l = $collation->reading( $lemma_start ); |
594 | while( $i < $realoffset+$length ) { |
595 | push( @base_phrase, $l ); |
596 | $l = $collation->next_reading( $l ); |
597 | $i++; |
598 | } |
599 | |
600 | print STDERR sprintf( "Trying to replace %s (%s) starting at %d " . |
601 | "with %s (%s) with drift %d\n", |
602 | join( ' ', map {$_->label} @base_phrase ), |
603 | join( ' ', map {$_->name} @base_phrase ), |
604 | $realoffset, |
605 | join( ' ', map {$_->label} @$items ), |
606 | join( ' ', map {$_->name} @$items ), |
607 | $drift, |
608 | ) if $debug; |
609 | |
610 | if( $lemma_text[$realoffset]->name ne $lemma_start ) { |
611 | warn( sprintf( "Should be replacing %s (%s) with %s (%s) " . |
612 | "but %s (%s) is there instead", |
613 | join( ' ', map {$_->label} @base_phrase ), |
614 | join( ' ', map {$_->name} @base_phrase ), |
615 | join( ' ', map {$_->label} @$items ), |
616 | join( ' ', map {$_->name} @$items ), |
617 | join( ' ', map {$_->label} @this_phrase ), |
618 | join( ' ', map {$_->name} @this_phrase ), |
619 | ) ); |
620 | # next; |
621 | } |
c78feb69 |
622 | } |
b15511bf |
623 | splice( @lemma_text, $realoffset, $length, @$items ); |
624 | $drift += @$items - $length; |
b49c4318 |
625 | } |
b15511bf |
626 | return @lemma_text; |
b49c4318 |
627 | } |
b15511bf |
628 | |
4ca00eca |
629 | |
e2902068 |
630 | # Helper function. Given a witness sigil, if it is a post-correctione |
631 | # sigil,return the base witness. If not, return a false value. |
632 | sub _is_post_corr { |
633 | my( $sigil ) = @_; |
930ff666 |
634 | if( $sigil =~ /^(.*?)(\s*\(?p\.\s*c\.\)?)$/ ) { |
e2902068 |
635 | return $1; |
636 | } |
637 | return undef; |
638 | } |
639 | |
6a222840 |
640 | sub _add_hash_entry { |
641 | my( $hash, $key, $entry ) = @_; |
642 | if( exists $hash->{$key} ) { |
643 | push( @{$hash->{$key}}, $entry ); |
644 | } else { |
645 | $hash->{$key} = [ $entry ]; |
646 | } |
647 | } |
648 | |
649 | |
2ceca8c3 |
650 | =item B<cmp_str> |
651 | |
652 | Pretend you never saw this method. Really it needs to not be hardcoded. |
653 | |
654 | =cut |
655 | |
b49c4318 |
656 | sub cmp_str { |
e2902068 |
657 | my( $reading ) = @_; |
658 | my $word = $reading->label(); |
b49c4318 |
659 | $word = lc( $word ); |
660 | $word =~ s/\W//g; |
661 | $word =~ s/v/u/g; |
662 | $word =~ s/j/i/g; |
663 | $word =~ s/cha/ca/g; |
664 | $word =~ s/quatuor/quattuor/g; |
665 | $word =~ s/ioannes/iohannes/g; |
666 | return $word; |
667 | } |
668 | |
2ceca8c3 |
669 | =back |
670 | |
671 | =head1 LICENSE |
672 | |
673 | This package is free software and is provided "as is" without express |
674 | or implied warranty. You can redistribute it and/or modify it under |
675 | the same terms as Perl itself. |
676 | |
677 | =head1 AUTHOR |
678 | |
679 | Tara L Andrews, aurum@cpan.org |
680 | |
681 | =cut |
682 | |
b49c4318 |
683 | 1; |