Commit | Line | Data |
e58153d6 |
1 | package Text::Tradition::Parser::BaseText; |
b49c4318 |
2 | |
3 | use strict; |
4 | use warnings; |
52ce987f |
5 | use Module::Load; |
b49c4318 |
6 | |
2ceca8c3 |
7 | =head1 NAME |
8 | |
9 | Text::Tradition::Parser::BaseText |
10 | |
11 | =head1 SYNOPSIS |
12 | |
13 | use Text::Tradition::Parser::BaseText qw( merge_base ); |
14 | merge_base( $graph, 'reference.txt', @apparatus_entries ) |
15 | |
16 | =head1 DESCRIPTION |
17 | |
18 | For an overview of the package, see the documentation for the |
19 | Text::Tradition::Graph module. |
20 | |
21 | This module is meant for use with certain of the other Parser classes |
22 | - whenever a list of variants is given with reference to a base text, |
23 | these must be joined into a single collation. The parser should |
24 | therefore make a list of variants and their locations, and BaseText |
25 | will join those listed variants onto the reference text. |
26 | |
27 | =head1 SUBROUTINES |
28 | |
29 | =over |
30 | |
52ce987f |
31 | =item B<parse> |
32 | |
33 | parse( $graph, %opts ); |
34 | |
35 | Takes an initialized graph and a set of options, which must include: |
36 | - 'base' - the base text referenced by the variants |
37 | - 'format' - the format of the variant list |
38 | - 'data' - the variants, in the given format. |
39 | |
40 | =cut |
41 | |
42 | sub parse { |
e2902068 |
43 | my( $tradition, %opts ) = @_; |
52ce987f |
44 | |
45 | my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'}; |
46 | load( $format_mod ); |
47 | my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} ); |
e2902068 |
48 | merge_base( $tradition->collation, $opts{'base'}, @apparatus_entries ); |
52ce987f |
49 | } |
50 | |
2ceca8c3 |
51 | =item B<merge_base> |
52 | |
53 | merge_base( $graph, 'reference.txt', @apparatus_entries ) |
54 | |
55 | Takes three arguments: a newly-initialized Text::Tradition::Graph |
56 | object, a text file containing the reference text, and a list of |
57 | variants (apparatus entries). Adds the base text to the graph, and |
58 | joins the variants to that. |
59 | |
60 | The list of variants is an array of hash references; each hash takes |
61 | the form |
62 | { '_id' => line reference, |
63 | 'rdg_0' => lemma reading, |
64 | 'rdg_1' => first variant, |
65 | ... # and so on until all distinct readings are listed |
66 | 'WitnessA' => 'rdg_0', |
67 | 'WitnessB' => 'rdg_1', |
68 | ... # and so on until all witnesses are listed with their readings |
69 | } |
70 | |
71 | Any hash key that is not of the form /^rdg_\d+$/ and that does not |
72 | begin with an underscore is assumed to be a witness name. Any 'meta' |
73 | information to be passed must be passed in a key with a leading |
74 | underscore in its name. |
75 | |
76 | =cut |
77 | |
1f563ac3 |
78 | my $SHORT = 25; |
930ff666 |
79 | |
b49c4318 |
80 | sub merge_base { |
e2902068 |
81 | my( $collation, $base_file, @app_entries ) = @_; |
82 | my @base_line_starts = read_base( $base_file, $collation ); |
b49c4318 |
83 | |
52ce987f |
84 | my %all_witnesses; |
b49c4318 |
85 | foreach my $app ( @app_entries ) { |
86 | my( $line, $num ) = split( /\./, $app->{_id} ); |
87 | # DEBUG with a short graph |
930ff666 |
88 | last if $SHORT && $line > $SHORT; |
2ceca8c3 |
89 | # DEBUG for problematic entries |
930ff666 |
90 | my $scrutinize = ""; |
e2902068 |
91 | my $first_line_reading = $base_line_starts[ $line ]; |
b49c4318 |
92 | my $too_far = $base_line_starts[ $line+1 ]; |
93 | |
94 | my $lemma = $app->{rdg_0}; |
95 | my $seq = 1; |
96 | # Is this the Nth occurrence of this reading in the line? |
97 | if( $lemma =~ s/(_)?(\d)$// ) { |
98 | $seq = $2; |
99 | } |
100 | my @lemma_words = split( /\s+/, $lemma ); |
101 | |
102 | # Now search for the lemma words within this line. |
e2902068 |
103 | my $lemma_start = $first_line_reading; |
b49c4318 |
104 | my $lemma_end; |
105 | my %seen; |
106 | while( $lemma_start ne $too_far ) { |
107 | # Loop detection |
108 | if( $seen{ $lemma_start->name() } ) { |
109 | warn "Detected loop at " . $lemma_start->name() . |
110 | ", ref $line,$num"; |
111 | last; |
112 | } |
113 | $seen{ $lemma_start->name() } = 1; |
114 | |
115 | # Try to match the lemma. |
116 | my $unmatch = 0; |
117 | print STDERR "Matching " . cmp_str( $lemma_start) . " against " . |
118 | $lemma_words[0] . "...\n" |
119 | if "$line.$num" eq $scrutinize; |
120 | if( cmp_str( $lemma_start ) eq $lemma_words[0] ) { |
121 | # Skip it if we need a match that is not the first. |
122 | if( --$seq < 1 ) { |
123 | # Now we have to compare the rest of the words here. |
124 | if( scalar( @lemma_words ) > 1 ) { |
e2902068 |
125 | my $next_reading = |
126 | $collation->next_reading( $lemma_start ); |
b49c4318 |
127 | foreach my $w ( @lemma_words[1..$#lemma_words] ) { |
128 | printf STDERR "Now matching %s against %s\n", |
e2902068 |
129 | cmp_str($next_reading), $w |
b49c4318 |
130 | if "$line.$num" eq $scrutinize; |
e2902068 |
131 | if( $w ne cmp_str($next_reading) ) { |
b49c4318 |
132 | $unmatch = 1; |
133 | last; |
134 | } else { |
e2902068 |
135 | $lemma_end = $next_reading; |
136 | $next_reading = |
137 | $collation->next_reading( $lemma_end ); |
b49c4318 |
138 | } |
139 | } |
140 | } else { |
141 | $lemma_end = $lemma_start; |
142 | } |
143 | } else { |
144 | $unmatch = 1; |
145 | } |
146 | } |
147 | last unless ( $unmatch || !defined( $lemma_end ) ); |
148 | $lemma_end = undef; |
e2902068 |
149 | $lemma_start = $collation->next_reading( $lemma_start ); |
b49c4318 |
150 | } |
151 | |
152 | unless( $lemma_end ) { |
153 | warn "No match found for @lemma_words at $line.$num"; |
154 | next; |
155 | } else { |
e2902068 |
156 | # These are no longer common readings; unmark them as such. |
157 | my @lemma_readings = $collation->reading_sequence( $lemma_start, |
b49c4318 |
158 | $lemma_end ); |
930ff666 |
159 | map { $_->make_variant } @lemma_readings; |
b49c4318 |
160 | } |
161 | |
e2902068 |
162 | # Now we have our lemma readings; we add the variant readings |
163 | # to the collation. |
b49c4318 |
164 | |
e49731d7 |
165 | # Keep track of the start and end point of each reading for later |
e2902068 |
166 | # reading collapse. |
e49731d7 |
167 | my @readings = ( $lemma_start, $lemma_end ); |
168 | |
e2902068 |
169 | # For each reading that is not rdg_0, we make a chain of readings |
b49c4318 |
170 | # and connect them to the anchor. Edges are named after the mss |
171 | # that are relevant. |
172 | foreach my $k ( grep { /^rdg/ } keys( %$app ) ) { |
7854e12e |
173 | if( $k eq 'rdg_0' ) { # that's the lemma |
174 | # The lemma is already in the graph, but we need to look for |
175 | # any explicit post-correctione readings and add the |
176 | # relevant path. |
177 | my @mss = grep { $app->{$_} eq $k } keys( %$app ); |
930ff666 |
178 | # Keep track of what witnesses we have seen. |
179 | @all_witnesses{ @mss } = ( 1 ) x scalar( @mss ); |
7854e12e |
180 | foreach my $m ( @mss ) { |
181 | my $base = _is_post_corr( $m ); |
182 | next unless $base; |
183 | my @lem = $collation->reading_sequence( $lemma_start, $lemma_end ); |
930ff666 |
184 | $collation->add_path( $collation->prior_reading( $lem[0] ), $lem[0], $m ); |
7854e12e |
185 | foreach my $i ( 0 .. $#lem-1 ) { |
930ff666 |
186 | $collation->add_path( $lem[$i], $lem[++$i], $m ); |
7854e12e |
187 | } |
930ff666 |
188 | $collation->add_path( $lem[-1], $collation->next_reading( $lem[-1] ), $m ); |
7854e12e |
189 | } |
930ff666 |
190 | next; |
7854e12e |
191 | } |
b49c4318 |
192 | my @variant = split( /\s+/, $app->{$k} ); |
193 | @variant = () if $app->{$k} eq '/'; # This is an omission. |
194 | my @mss = grep { $app->{$_} eq $k } keys( %$app ); |
195 | |
196 | unless( @mss ) { |
197 | print STDERR "Skipping '@variant' at $line.$num: no mss\n"; |
198 | next; |
199 | } |
200 | |
e2902068 |
201 | # Keep track of what witnesses we have seen. |
52ce987f |
202 | @all_witnesses{ @mss } = ( 1 ) x scalar( @mss ); |
b49c4318 |
203 | |
e2902068 |
204 | # Make the variant into a set of readings. |
b49c4318 |
205 | my $ctr = 0; |
e2902068 |
206 | my $last_reading = $collation->prior_reading( $lemma_start ); |
b49c4318 |
207 | my $var_start; |
208 | foreach my $vw ( @variant ) { |
209 | my $vwname = "$k/$line.$num.$ctr"; $ctr++; |
e2902068 |
210 | my $vwreading = $collation->add_reading( $vwname ); |
211 | $vwreading->text( $vw ); |
212 | $vwreading->make_variant(); |
213 | foreach ( @mss ) { |
214 | $collation->add_path( $last_reading, $vwreading, $_ ); |
215 | } |
216 | $var_start = $vwreading unless $var_start; |
217 | $last_reading = $vwreading; |
b49c4318 |
218 | } |
219 | # Now hook it up at the end. |
e2902068 |
220 | foreach ( @mss ) { |
221 | $collation->add_path( $last_reading, |
7854e12e |
222 | $collation->next_reading( $lemma_end ), |
e2902068 |
223 | $_ ); |
224 | } |
b49c4318 |
225 | |
e49731d7 |
226 | if( $var_start ) { # if it wasn't an empty reading |
e2902068 |
227 | push( @readings, $var_start, $last_reading ); |
e49731d7 |
228 | } |
b49c4318 |
229 | } |
e49731d7 |
230 | |
e2902068 |
231 | # Now collate and collapse the identical readings within the collation. |
232 | collate_variants( $collation, @readings ); |
b49c4318 |
233 | } |
234 | |
e2902068 |
235 | # Now make the witness objects |
52ce987f |
236 | foreach my $w ( keys %all_witnesses ) { |
e2902068 |
237 | my $base = _is_post_corr( $w ); |
238 | if( $base ) { |
239 | my $pctag = substr( $w, length( $base ) ); |
240 | my $existing_wit = $collation->tradition->witness( $base ); |
241 | unless( $existing_wit ) { |
7854e12e |
242 | $existing_wit = $collation->tradition->add_witness( sigil => $base ); |
e2902068 |
243 | } |
244 | $existing_wit->post_correctione( $pctag ); |
245 | } else { |
7854e12e |
246 | $collation->tradition->add_witness( sigil => $w ) |
e2902068 |
247 | unless $collation->tradition->witness( $w ); |
52ce987f |
248 | } |
b49c4318 |
249 | } |
e2902068 |
250 | |
251 | # Now walk paths and calculate positions. |
252 | my @common_readings = |
1f563ac3 |
253 | $collation->walk_and_expand_base( $collation->reading( '#END#' ) ); |
e2902068 |
254 | $collation->calculate_positions( @common_readings ); |
b49c4318 |
255 | } |
256 | |
2ceca8c3 |
257 | =item B<read_base> |
258 | |
e2902068 |
259 | my @line_beginnings = read_base( 'reference.txt', $collation ); |
2ceca8c3 |
260 | |
e2902068 |
261 | Takes a text file and a (presumed empty) collation object, adds the |
262 | words as simple linear readings to the collation, and returns a |
263 | list of readings that represent the beginning of lines. This collation |
264 | is now the starting point for application of apparatus entries in |
265 | merge_base, e.g. from a CSV file or a Classical Text Editor file. |
2ceca8c3 |
266 | |
267 | =cut |
b49c4318 |
268 | |
269 | sub read_base { |
e2902068 |
270 | my( $base_file, $collation ) = @_; |
b49c4318 |
271 | |
e2902068 |
272 | # This array gives the first reading for each line. We put the |
b49c4318 |
273 | # common starting point in line zero. |
e2902068 |
274 | my $last_reading = $collation->start(); |
275 | my $lineref_array = [ $last_reading ]; # There is no line zero. |
b49c4318 |
276 | |
277 | open( BASE, $base_file ) or die "Could not open file $base_file: $!"; |
278 | while(<BASE>) { |
e2902068 |
279 | # Make the readings, and connect them up for the base, but |
280 | # also save the first reading of each line in an array for the |
281 | # purpose. |
282 | # TODO use configurable reading separator |
b49c4318 |
283 | chomp; |
284 | my @words = split; |
285 | my $started = 0; |
286 | my $wordref = 0; |
287 | my $lineref = scalar @$lineref_array; |
930ff666 |
288 | last if $SHORT && $lineref > $SHORT; |
b49c4318 |
289 | foreach my $w ( @words ) { |
e2902068 |
290 | my $readingref = join( ',', $lineref, ++$wordref ); |
291 | my $reading = $collation->add_reading( $readingref ); |
292 | $reading->text( $w ); |
293 | $reading->make_common(); |
b49c4318 |
294 | unless( $started ) { |
e2902068 |
295 | push( @$lineref_array, $reading ); |
b49c4318 |
296 | $started = 1; |
297 | } |
e2902068 |
298 | if( $last_reading ) { |
299 | my $path = $collation->add_path( $last_reading, $reading, |
930ff666 |
300 | $collation->baselabel ); |
e2902068 |
301 | $path->set_attribute( 'class', 'basetext' ); |
302 | $last_reading = $reading; |
b49c4318 |
303 | } # TODO there should be no else here... |
304 | } |
305 | } |
306 | close BASE; |
307 | # Ending point for all texts |
e2902068 |
308 | my $endpoint = $collation->add_reading( '#END#' ); |
930ff666 |
309 | $collation->add_path( $last_reading, $endpoint, $collation->baselabel ); |
b49c4318 |
310 | push( @$lineref_array, $endpoint ); |
311 | |
312 | return( @$lineref_array ); |
313 | } |
314 | |
e49731d7 |
315 | =item B<collate_variants> |
2ceca8c3 |
316 | |
e2902068 |
317 | collate_variants( $collation, @readings ) |
2ceca8c3 |
318 | |
e49731d7 |
319 | Given a set of readings in the form |
320 | ( lemma_start, lemma_end, rdg1_start, rdg1_end, ... ) |
e2902068 |
321 | walks through each to identify those readings that are identical. The |
322 | collation is a Text::Tradition::Collation object; the elements of |
323 | @readings are Text::Tradition::Collation::Reading objects that appear |
324 | on the collation graph. |
b49c4318 |
325 | |
2ceca8c3 |
326 | TODO: Handle collapsed and non-collapsed transpositions. |
327 | |
328 | =cut |
b49c4318 |
329 | |
e49731d7 |
330 | sub collate_variants { |
e2902068 |
331 | my( $collation, @readings ) = @_; |
e49731d7 |
332 | my $lemma_start = shift @readings; |
333 | my $lemma_end = shift @readings; |
7854e12e |
334 | my $detranspose = 1; |
1b9423d5 |
335 | |
e2902068 |
336 | # Start the list of distinct readings with those readings in the lemma. |
337 | my @distinct_readings; |
b49c4318 |
338 | while( $lemma_start ne $lemma_end ) { |
930ff666 |
339 | push( @distinct_readings, [ $lemma_start, $collation->baselabel ] ); |
7854e12e |
340 | $lemma_start = $collation->next_reading( $lemma_start ); |
b49c4318 |
341 | } |
930ff666 |
342 | push( @distinct_readings, [ $lemma_end, $collation->baselabel ] ); |
b49c4318 |
343 | |
e49731d7 |
344 | |
345 | while( scalar @readings ) { |
346 | my( $var_start, $var_end ) = splice( @readings, 0, 2 ); |
347 | |
e2902068 |
348 | # I want to look at the readings in the variant and lemma, and |
349 | # collapse readings that are the same word. This is mini-collation. |
e49731d7 |
350 | # Each word in the 'main' list can only be collapsed once with a |
351 | # word from the current reading. |
352 | my %collapsed = (); |
353 | |
7854e12e |
354 | # Get the variant witnesses. They will all be going along the |
355 | # same path, so just use the first one as representative for |
356 | # the purpose of following the path. |
357 | my @var_wits = map { $_->label } $var_start->outgoing(); |
358 | my $rep_wit = $var_wits[0]; |
e49731d7 |
359 | |
e2902068 |
360 | my @variant_readings; |
e49731d7 |
361 | while( $var_start ne $var_end ) { |
e2902068 |
362 | push( @variant_readings, $var_start ); |
7854e12e |
363 | $var_start = $collation->next_reading( $var_start, $rep_wit ); |
e49731d7 |
364 | } |
e2902068 |
365 | push( @variant_readings, $var_end ); |
e49731d7 |
366 | |
e2902068 |
367 | # Go through the variant readings, and if we find a lemma reading that |
368 | # hasn't yet been collapsed with a reading, equate them. If we do |
369 | # not, keep them to push onto the end of all_readings. |
7854e12e |
370 | # TODO replace this with proper mini-collation |
e2902068 |
371 | my @remaining_readings; |
e49731d7 |
372 | my $last_index = 0; |
1b9423d5 |
373 | my $curr_pos = 0; |
e2902068 |
374 | foreach my $w ( @variant_readings ) { |
e49731d7 |
375 | my $word = $w->label(); |
376 | my $matched = 0; |
e2902068 |
377 | foreach my $idx ( $last_index .. $#distinct_readings ) { |
7854e12e |
378 | my( $l, $pathlabel ) = @{$distinct_readings[$idx]}; |
e49731d7 |
379 | if( $word eq cmp_str( $l ) ) { |
380 | next if exists( $collapsed{ $l->label } ) |
381 | && $collapsed{ $l->label } eq $l; |
382 | $matched = 1; |
383 | $last_index = $idx if $detranspose; |
e2902068 |
384 | # Collapse the readings. |
385 | printf STDERR "Merging readings %s/%s and %s/%s\n", |
e49731d7 |
386 | $l->name, $l->label, $w->name, $w->label; |
e2902068 |
387 | $collation->merge_readings( $l, $w ); |
e49731d7 |
388 | $collapsed{ $l->label } = $l; |
e2902068 |
389 | # Now collapse any multiple paths to and from the reading. |
390 | remove_duplicate_paths( $collation, |
7854e12e |
391 | $collation->prior_reading( $l, $rep_wit ), $l ); |
e2902068 |
392 | remove_duplicate_paths( $collation, $l, |
7854e12e |
393 | $collation->next_reading( $l, $rep_wit ) ); |
1b9423d5 |
394 | last; |
e49731d7 |
395 | } |
b49c4318 |
396 | } |
7854e12e |
397 | push( @remaining_readings, [ $w, $rep_wit ] ) unless $matched; |
b49c4318 |
398 | } |
e2902068 |
399 | push( @distinct_readings, @remaining_readings) if scalar( @remaining_readings ); |
b49c4318 |
400 | } |
401 | } |
402 | |
e2902068 |
403 | =item B<remove_duplicate_paths> |
2ceca8c3 |
404 | |
e2902068 |
405 | remove_duplicate_paths( $collation, $from, $to ); |
2ceca8c3 |
406 | |
e2902068 |
407 | Given two readings, reduce the number of paths between those readings to |
7854e12e |
408 | a set of unique paths. |
2ceca8c3 |
409 | |
410 | =cut |
411 | |
7854e12e |
412 | # TODO wonder if this is necessary |
e2902068 |
413 | sub remove_duplicate_paths { |
414 | my( $collation, $from, $to ) = @_; |
7854e12e |
415 | my %seen_paths; |
416 | foreach my $p ( $from->edges_to( $to ) ) { |
417 | if( exists $seen_paths{$p->name} ) { |
418 | $collation->del_path( $p ); |
b49c4318 |
419 | } else { |
7854e12e |
420 | $seen_paths{$p->name} = 1; |
b49c4318 |
421 | } |
422 | } |
423 | } |
424 | |
e2902068 |
425 | # Helper function. Given a witness sigil, if it is a post-correctione |
426 | # sigil,return the base witness. If not, return a false value. |
427 | sub _is_post_corr { |
428 | my( $sigil ) = @_; |
930ff666 |
429 | if( $sigil =~ /^(.*?)(\s*\(?p\.\s*c\.\)?)$/ ) { |
e2902068 |
430 | return $1; |
431 | } |
432 | return undef; |
433 | } |
434 | |
2ceca8c3 |
435 | =item B<cmp_str> |
436 | |
437 | Pretend you never saw this method. Really it needs to not be hardcoded. |
438 | |
439 | =cut |
440 | |
b49c4318 |
441 | sub cmp_str { |
e2902068 |
442 | my( $reading ) = @_; |
443 | my $word = $reading->label(); |
b49c4318 |
444 | $word = lc( $word ); |
445 | $word =~ s/\W//g; |
446 | $word =~ s/v/u/g; |
447 | $word =~ s/j/i/g; |
448 | $word =~ s/cha/ca/g; |
449 | $word =~ s/quatuor/quattuor/g; |
450 | $word =~ s/ioannes/iohannes/g; |
451 | return $word; |
452 | } |
453 | |
2ceca8c3 |
454 | =back |
455 | |
456 | =head1 LICENSE |
457 | |
458 | This package is free software and is provided "as is" without express |
459 | or implied warranty. You can redistribute it and/or modify it under |
460 | the same terms as Perl itself. |
461 | |
462 | =head1 AUTHOR |
463 | |
464 | Tara L Andrews, aurum@cpan.org |
465 | |
466 | =cut |
467 | |
b49c4318 |
468 | 1; |