Commit | Line | Data |
fa954f4c |
1 | package Text::Tradition::Parser::CollateText; |
2 | |
3 | use strict; |
4 | use warnings; |
5 | |
6 | =head1 NAME |
7 | |
8 | Text::Tradition::Parser::CollateText |
9 | |
10 | =head1 DESCRIPTION |
11 | |
12 | For an overview of the package, see the documentation for the |
13 | Text::Tradition module. |
14 | |
15 | This module is meant for use with a set of text files saved from Word docs, |
16 | which originated with the COLLATE collation program. |
17 | |
18 | =head1 SUBROUTINES |
19 | |
20 | =over |
21 | |
22 | =item B<parse> |
23 | |
24 | parse( $graph, $opts ); |
25 | |
26 | Takes an initialized graph and a hashref of options, which must include: |
27 | - 'base' - the base text referenced by the variants |
28 | - 'format' - the format of the variant list |
29 | - 'data' - the variants, in the given format. |
30 | |
31 | =cut |
32 | |
33 | my %ALL_SIGLA; |
34 | |
35 | sub parse { |
36 | my( $tradition, $opts ) = @_; |
37 | # String together the base text. |
38 | my $lineref_hash = read_stone_base( $opts->{'base'}, $tradition->collation ); |
39 | # Note the sigla. |
40 | foreach my $sigil ( @{$opts->{'sigla'}} ) { |
41 | $ALL_SIGLA{$sigil} = 1; |
42 | $tradition->add_witness( 'sigil' => $sigil ); |
43 | } |
44 | # Now merge on the apparatus entries. |
45 | merge_stone_apparatus( $tradition->collation, $lineref_hash, $opts->{'input'} ); |
46 | } |
47 | |
48 | =item B<read_stone_base> |
49 | |
50 | my $text_list = read_base( 'reference.txt', $collation ); |
51 | |
52 | Takes a text file and a (presumed empty) collation object, adds the words |
53 | as simple linear readings to the collation, and returns a hash of texts |
54 | with line keys. This collation is now the starting point for application of |
55 | apparatus entries in merge_base, e.g. from a CSV file or a Classical Text |
56 | Editor file. |
57 | |
58 | The hash is of the form |
59 | |
60 | { chapter_name => { line_ref => { start => node, end => node } } } |
61 | |
62 | =cut |
63 | |
64 | sub read_stone_base { |
65 | my( $base_file, $collation ) = @_; |
66 | |
67 | # This array gives the first reading for each line. We put the |
68 | # common starting point in line zero. |
69 | my $last_reading = $collation->start(); |
70 | my $lineref_hash = {}; |
71 | my $last_lineref; |
72 | |
73 | my $curr_text; |
74 | open( BASE, $base_file ) or die "Could not open file $base_file: $!"; |
75 | my $i = 1; |
76 | while(<BASE>) { |
77 | # Make the readings, and connect them up for the base, but |
78 | # also save the first reading of each line in a hash for the |
79 | # purpose. |
80 | chomp; |
81 | next if /^\s+$/; # skip blank lines |
82 | s/^(\d)\x{589}/$1:/; # turn Armenian full stops into colons |
83 | if( /^TESTAMENT/ ) { |
84 | # Initialize the base hash for this section. |
85 | $lineref_hash->{$_} = {}; |
86 | $curr_text = $lineref_hash->{$_}; |
87 | next; |
88 | } |
89 | my @words = split; |
90 | my $lineref; |
91 | if( /^\d/ ) { |
92 | # The first "word" is a line reference; keep it. |
93 | $lineref = shift @words; |
94 | } else { |
95 | # Assume we are dealing with the title. |
96 | $lineref = 'Title:'; |
97 | } |
98 | |
99 | # Now turn the remaining words into readings. |
100 | my $wordref = 0; |
101 | foreach my $w ( @words ) { |
102 | my $readingref = join( ',', $lineref, ++$wordref ); |
103 | my $reading = $collation->add_reading( $readingref ); |
104 | $reading->text( $w ); |
105 | unless( exists $curr_text->{$lineref}->{'start'} ) { |
106 | $curr_text->{$lineref}->{'start'} = $reading; |
107 | } |
108 | # Add edge paths in the graph, for easier tracking when |
109 | # we start applying corrections. These paths will be |
110 | # removed when we're done. |
111 | my $path = $collation->add_path( $last_reading, $reading, |
112 | $collation->baselabel ); |
113 | $last_reading = $reading; |
114 | } |
115 | $curr_text->{$lineref}->{'end'} = $last_reading; |
116 | } |
117 | |
118 | close BASE; |
119 | # Ending point for all texts |
120 | $collation->add_path( $last_reading, $collation->end, $collation->baselabel ); |
121 | return( $lineref_hash ); |
122 | } |
123 | |
124 | =item B<merge_stone_apparatus> |
125 | |
126 | Read an apparatus as output (presumably) by Collate. It should be reasonably |
127 | regular in form, I hope. Merge the apparatus variants onto the appropriate |
128 | lemma readings. |
129 | |
130 | =cut |
131 | |
132 | sub merge_stone_apparatus { |
133 | my( $c, $lineref_hash, $file ) = @_; |
134 | |
135 | my $text_apps = {}; |
136 | my $current_text; |
137 | open( APP, $file ) or die "Could not read apparatus file $file"; |
138 | while( <APP> ) { |
139 | chomp; |
140 | next if /^\s+$/; |
141 | if( /^TESTAMENT/ ) { |
142 | $current_text = $lineref_hash->{$_}; |
143 | next; |
144 | } |
145 | |
146 | # Otherwise, the first word of the line is the base text line reference. |
147 | my $i = 0; |
148 | my $lineref; |
149 | if( s/^(\S+)// ) { |
150 | $lineref = $1; |
151 | } else { |
152 | warn "Unrecognized line $_"; |
153 | } |
154 | my $baseline = $current_text->{$lineref}; |
155 | # The start and end readings for this line are now in $baseline->{start} |
156 | # and $baseline->{end}. |
157 | |
158 | # Now look at the apparatus entries for this line. They are |
159 | # split with |. |
160 | my @apps = split( '|' ); |
161 | foreach my $app ( @apps ) { |
162 | my( $lemma, $rest ) = split( ']', $app ); |
163 | |
164 | # Find the lemma reading. |
165 | my( $lemma_start, $lemma_end ) = |
166 | _find_reading_on_line( $c, $lemma, $baseline ); |
167 | my @lemma_chain = $c->reading_sequence( $lemma_start, $lemma_end ); |
168 | |
169 | # Splice in "start" and "end" placeholders on either |
170 | # side of the lemma. |
171 | my ( $rdg_start, $rdg_end ) = |
172 | _add_reading_placeholders( $c, $lemma_start, $lemma_end ); |
173 | |
174 | # For each reading, attach it to the lemma. |
175 | my @indiv = split( ' ', $rest ); |
176 | foreach my $rdg ( @indiv ) { |
177 | # Parse the string. |
178 | my( $words, $sigla, $recurse ) = parse_app_entry( $rdg ); |
179 | my @readings; |
180 | foreach my $i ( 0 .. $#$words ) { |
181 | next if $i == 0 && $words->[$i] =~ /^__/; |
182 | my $reading_id = $rdg_start->text . '_' . $rdg_end->text . '/' . $i; |
183 | my $reading = $c->add_reading( $reading_id ); |
184 | $reading->text( $words->[$i] ); |
185 | push( @readings, $reading ); |
186 | } |
187 | |
188 | # Deal with any specials. |
189 | my $lemma_sequence; |
190 | if( $words->[0] eq '__LEMMA__' ) { |
191 | $lemma_sequence = [ $lemma_end, $rdg_end ]; |
192 | } elsif ( $rdg->[0] eq '__TRANSPOSE__' ) { |
193 | # Hope it is only two or three words in the lemma. |
194 | # TODO figure out how we really want to handle this |
195 | @readings = reverse @lemma_chain; |
196 | } |
197 | $lemma_sequence = [ $rdg_start, @lemma_chain, $rdg_end ] |
198 | unless $lemma_sequence; |
199 | |
200 | # Now hook up the paths. |
201 | unshift( @readings, $rdg_start ); |
202 | push( @readings, $rdg_end ); |
203 | foreach my $i ( 1 .. $#readings ) { |
204 | if( $recurse->{$i} ) { |
205 | my( $rwords, $rsig ) = parse_app_entry( $recurse->{$i} ); |
206 | # Get the local "lemma" sequence |
207 | my $llseq = [ $readings[$i], $readings[$i+1] ]; |
208 | if( $rwords->[0] ne '__LEMMA__' ) { |
209 | # Treat it as an addition to the last word |
210 | unshift( @$llseq, $readings[$i-1] ); |
211 | } |
212 | # Create the reading nodes in $rwords |
213 | # TODO Hope we don't meet ~ in a recursion |
214 | my $local_rdg = []; |
215 | foreach my $i ( 0 .. $#$rwords ) { |
216 | next if $i == 0 && $rwords->[$i] =~ /^__/; |
217 | my $reading_id = $llseq->[0]->text . '_' . |
218 | $llseq->[-1]->text . '/' . $i; |
219 | my $reading = $c->add_reading( $reading_id ); |
220 | $reading->text( $words->[$i] ); |
221 | push( @$local_rdg, $reading ); |
222 | } |
223 | # Add the path(s) necessary |
224 | _add_sigil_path( $c, $rsig, $local_rdg, $llseq ); |
225 | } |
226 | } |
227 | _add_sigil_path( $c, $sigla, \@readings, $lemma_sequence ); |
228 | } # end processing of $app |
229 | } # end foreach my $app in line |
230 | } # end while <line> |
231 | |
232 | # Now reconcile all the paths in the collation, and delete our |
233 | # temporary anchor nodes. |
234 | expand_all_paths( $c ); |
235 | |
236 | # Finally, calculate the ranks we've got. |
237 | $c->calculate_ranks; |
238 | } |
239 | |
240 | sub _find_reading_on_line { |
241 | my( $c, $lemma, $baseline ) = @_; |
242 | |
243 | my $lemma_start = $baseline->{'start'}; |
244 | my $lemma_end; |
245 | my $too_far = $baseline->{'end'}->next_reading; |
246 | my @lemma_words = split( /\s+/, $lemma ); |
247 | |
248 | my %seen; |
249 | my $scrutinize = ''; # DEBUG variable |
250 | my $seq = 1; |
251 | while( $lemma_start ne $too_far ) { |
252 | # Loop detection |
253 | if( $seen{ $lemma_start->name() } ) { |
254 | warn "Detected loop at " . $lemma_start->name . " for lemma $lemma"; |
255 | last; |
256 | } |
257 | $seen{ $lemma_start->name() } = 1; |
258 | |
259 | # Try to match the lemma. |
260 | # TODO move next/prior reading methods into the reading classes, |
261 | # to make this more self-contained and not need to pass $c. |
262 | my $unmatch = 0; |
263 | my ( $lw, $seq ) = _get_seq( $lemma_words[0] ); |
264 | print STDERR "Matching $lemma_start against $lw...\n" |
265 | if $scrutinize; |
266 | if( $lemma_start->text eq $lw ) { |
267 | # Skip it if we need a match that is not the first. |
268 | if( --$seq < 1 ) { |
269 | # Now we have to compare the rest of the words here. |
270 | if( scalar( @lemma_words ) > 1 ) { |
271 | my $next_reading = |
272 | $c->next_reading( $lemma_start ); |
273 | my $wildcard = 0; |
274 | foreach my $w ( @lemma_words[1..$#lemma_words] ) { |
275 | if( $w eq '---' ) { |
276 | # We match everything to the next word. |
277 | $wildcard = 1; |
278 | next; |
279 | } else { |
280 | $wildcard = 0; |
281 | } |
282 | ( $lw, $seq ) = _get_seq( $w ); |
283 | printf STDERR "Now matching %s against %s\n", |
284 | $next_reading->text, $lw |
285 | if $scrutinize; |
286 | if( !$wildcard && $w ne $next_reading->text) { |
287 | $unmatch = 1; |
288 | last; |
289 | } else { |
290 | $lemma_end = $next_reading; |
291 | $next_reading = |
292 | $c->next_reading( $lemma_end ); |
293 | } |
294 | } |
295 | } else { # single-word match, easy. |
296 | $lemma_end = $lemma_start; |
297 | } |
298 | } else { # we need the Nth match and aren't there yet |
299 | $unmatch = 1; |
300 | } |
301 | } |
302 | last unless ( $unmatch || !defined( $lemma_end ) ); |
303 | $lemma_end = undef; |
304 | $lemma_start = $c->next_reading( $lemma_start ); |
305 | } |
306 | |
307 | unless( $lemma_end ) { |
308 | warn "No match found for @lemma_words"; |
309 | return undef; |
310 | } |
311 | return( $lemma_start, $lemma_end ); |
312 | } |
313 | |
314 | sub _add_reading_placeholders { |
315 | my( $collation, $lemma_start, $lemma_end ) = @_; |
316 | # We will splice in a 'begin' and 'end' marker on either side of the |
317 | # lemma, as sort of a double-endpoint attachment in the graph. |
318 | |
319 | my $attachlabel = "ATTACH"; |
320 | my( $start_node, $end_node ); |
321 | my @start_id = grep { $_->label eq $attachlabel } $lemma_start->incoming; |
322 | if( @start_id ) { |
323 | # There already exists an app-begin node. Use that. |
324 | $start_node = $start_id[0]->from; |
325 | } else { |
326 | $start_node = $collation->add_reading( $app_info->{_id} ); |
327 | $collation->add_path( |
328 | $collation->prior_reading( $lemma_start, $collation->baselabel ), |
329 | $start_node, $attachlabel ); |
330 | $collation->add_path( $start_node, $lemma_start, $attachlabel ); |
331 | } |
332 | # Now the converse for the end. |
333 | my @end_id = grep { $_->label eq $attachlabel } $lemma_end->outgoing; |
334 | if( @end_id ) { |
335 | # There already exists an app-begin node. Use that. |
336 | $end_node = $end_id[0]->to; |
337 | } else { |
338 | $end_node = $collation->add_reading( $app_info->{_id} . "E" ); |
339 | $collation->add_path( $lemma_end, $end_node, $attachlabel ); |
340 | $collation->add_path( $end_node, |
341 | $collation->next_reading( $lemma_end, $collation->baselabel ), |
342 | $attachlabel ); |
343 | } |
344 | return( $start_node, $end_node ); |
345 | } |
346 | |
347 | # Function to parse an apparatus reading string, with reference to no other |
348 | # data. Need to do this separately as readings can include readings (ugh). |
349 | # Try to give whatever information we might need, including recursive app |
350 | # entries that might need to be parsed. |
351 | |
352 | sub parse_app_entry { |
353 | my( $rdg, ) = @_; |
354 | $rdg =~ s/^\s+//; |
355 | $rdg =~ s/\s+$//; |
356 | next unless $rdg; # just in case |
357 | my @words = split( /\s+/, $rdg ); |
358 | # Zero or more sigils e.g. +, followed by Armenian, |
359 | # followed by (possibly modified) sigla, followed by |
360 | # optional : with note. |
361 | my $is_add; |
362 | my $is_omission; |
363 | my $is_transposition; |
364 | my @reading; |
365 | my %reading_sigla; |
366 | my $recursed; |
367 | my $sig_regex = join( '|', keys %ALL_SIGLA ); |
368 | while( @words ) { |
369 | my $bit = shift @words; |
370 | if( $bit eq '+' ) { |
371 | $is_add = 1; |
372 | } elsif( $bit eq 'om' ) { |
373 | $is_omission = 1; |
374 | } elsif( $bit eq '~' ) { |
375 | $is_transposition = 1; |
376 | } elsif( $bit =~ /\p{Armenian}/ ) { |
377 | warn "Found text in omission?!" if $is_omission; |
378 | push( @reading, $bit ); |
379 | } elsif( $bit eq ':' ) { |
380 | # Stop processing. |
381 | last; |
382 | } elsif( $bit =~ /^\($/ ) { |
383 | # It's a recursive reading within a reading. Lemmatize what we |
384 | # have so far and grab the extra. |
385 | my @new = ( $1 ); |
386 | until( $new[-1] =~ /\)$/ ) { |
387 | push( @new, shift @words ); |
388 | } |
389 | my $recursed_reading = join( ' ', @new ); |
390 | $recursed_reading =~ s/^\((.*)\)//; |
391 | # This recursive entry refers to the last reading word(s) we |
392 | # saw. Push its index+1. We will have to come back to parse |
393 | # it when we are dealing with the main reading. |
394 | # TODO handle () as first element |
395 | # TODO handle - as suffix to add, i.e. make new word |
396 | $recursed->{@reading} = $recursed_reading; |
397 | } elsif( $bit =~ /^(\Q$sig_regex\E)(.*)$/ { |
398 | # It must be a sigil. |
399 | my( $sigil, $mod ) = ( $1, $2 ); |
400 | if( $mod eq "\x{80}" ) { |
401 | $reading_sigla->{$sig} = '_PC_'; |
402 | $ALL_SIGLA{$sig} = 2; # a pre- and post-corr version exists |
403 | } elsif( $mod eq '*' ) { |
404 | $reading_sigla->{$sig} = '_AC_'; |
405 | $ALL_SIGLA{$sig} = 2; # a pre- and post-corr version exists |
406 | } else { |
407 | $reading_sigla->{$sig} = 1 unless $mod; # skip secondhand corrections |
408 | } |
409 | } elsif( $bit =~ /transpos/ ) { |
410 | # There are some transpositions not coded rigorously; skip them. |
411 | warn "Found hard transposition in $rdg; fix manually"; |
412 | last; |
413 | } else { |
414 | warn "Not sure what to do with bit $bit in $rdg"; |
415 | } |
416 | } |
417 | |
418 | # Transmogrify the reading if necessary. |
419 | unshift( @reading, '__LEMMA__' ) if $is_add; |
420 | unshift( @reading, '__TRANSPOSE__' ) if $is_transposition; |
421 | @reading = () if $is_omission; |
422 | |
423 | return( \@reading, $reading_sigla, $recursed ); |
424 | } |
425 | |
426 | # Add a path for the specified sigla to connect the reading sequence. |
427 | # Add an a.c. path to the base sequence if we have an explicitly p.c. |
428 | # reading. |
429 | # Also handle the paths for sigla we have already added in recursive |
430 | # apparatus readings (i.e. don't add a path if one already exists.) |
431 | |
432 | sub _add_sigil_path { |
433 | my( $c, $sigla, $base_sequence, $reading_sequence ) = @_; |
434 | my %skip; |
435 | foreach my $sig ( keys %$sigla ) { |
436 | my $use_sig = $sigla->{$sig} eq '_AC_' ? $sig.$c->ac_label : $sig; |
437 | foreach my $i ( 0 .. $#$reading_sequence-1 ) { |
438 | if( $skip{$use_sig} ) { |
439 | next if !_has_prior_reading( $reading_sequence[$i], $use_sig ); |
440 | $skip{$use_sig} = 0; |
441 | if( _has_next_reading( $reading_sequence[$i], $use_sig ) ) { |
442 | $skip{$use_sig} = 1; |
443 | next; |
444 | } |
445 | $c->add_path( $reading_sequence[$i], $reading_sequence[$i+1], $use_sig); |
446 | } |
447 | if( $sigla->{$sig} eq '_PC_') { |
448 | $use_sig = $sig.$c->ac_label |
449 | foreach my $i ( 0 .. @$base_sequence ) { |
450 | if( $skip{$use_sig} ) { |
451 | next if !_has_prior_reading( $reading_sequence[$i], $use_sig ); |
452 | $skip{$use_sig} = 0; |
453 | if( _has_next_reading( $reading_sequence[$i], $use_sig ) ) { |
454 | $skip{$use_sig} = 1; |
455 | next; |
456 | } |
457 | $c->add_path( $base_sequence[$i], $base_sequence[$i+1], $use_sig ); |
458 | } |
459 | } |
460 | } |
461 | } |
462 | |
463 | # Remove all ATTACH* nodes, linking the readings on either side of them. |
464 | # Then walk the collation for all witness paths, and make sure those paths |
465 | # explicitly exist. Then delete all the 'base' paths. |
466 | |
467 | sub expand_all_paths { |
468 | my( $c ) = @_; |
469 | |
470 | # Delete the anchors |
471 | foreach my $anchor ( grep { $_->name =~ /ATTACH/ } $c->readings ) { |
472 | # Map each path to its incoming/outgoing node. |
473 | my %incoming; |
474 | map { $incoming{$_->label} = $_->from } $anchor->incoming(); |
475 | my %outgoing; |
476 | map { $outgoing{$_->label} = $_->to } $anchor->outgoing(); |
477 | $c->del_reading( $anchor ); |
478 | |
479 | # Connect in and out. |
480 | my $aclabel = $c->ac_label; |
481 | foreach my $edge ( keys %incoming ) { |
482 | my $from = $incoming{$edge}; |
483 | my $to = $outgoing{$edge}; |
484 | if( !$to && $edge =~ /^(.*)\Q$aclabel\E$/ ) { |
485 | $to = $outgoing{$1}; |
486 | } |
487 | $to = $outgoing{$c->baselabel} unless $to; |
488 | warn "Have no outbound base link on " . $anchor->name . "!" |
489 | unless $to; |
490 | $c->add_path( $from, $to, $edge ); |
491 | } |
492 | # TODO Think about deleting outgoing/edge as we use them to make this faster. |
493 | foreach my $edge ( keys %outgoing ) { |
494 | my $to = $outgoing{$edge}; |
495 | my $from = incoming{$edge}; |
496 | if( !$from && $edge =~ /^(.*)\Q$aclabel\E$/ ) { |
497 | $from = $incoming{$1}; |
498 | } |
499 | $from = $incoming{$c->baselabel} unless $to; |
500 | warn "Have no inbound base link on " . $anchor->name . "!" |
501 | unless $from; |
502 | $c->add_path( $from, $to, $edge ) |
503 | unless _has_prior_reading( $to, $edge ); |
504 | } |
505 | } |
506 | } |
507 | |
508 | # Walk the collation and add paths if necessary |
509 | foreach my $sig ( keys %ALL_SIGLA ) { |
510 | my $wit = $c->tradition->witness( $sig ); |
511 | my @path = $c->reading_sequence( $c->start, $c->end, $sig ); |
512 | $wit->path( \@path ); |
513 | if( $ALL_SIGLA{$sig} > 1 ) { |
514 | my @ac_path = $c->reading_sequence( $c->start, $c->end, |
515 | $sig.$c->ac_label, $sig ); |
516 | $wit->uncorrected_path( \@path ); |
517 | # a.c. paths are already there by default. |
518 | } |
519 | foreach my $i ( 1 .. $#$path ) { |
520 | # If there is no explicit path for this sigil between n-1 and n, |
521 | # add it. |
522 | unless( grep { $_->label eq $sig } $path[$i]->edges_from( $path[$i-1] ) ) { |
523 | $c->add_path( $path[$i-1], $path[$i], $sig ); |
524 | } |
525 | } |
526 | } |
527 | |
528 | # Delete all baselabel edges |
529 | foreach my $edge ( grep { $_->label eq $c->baselabel } $c->paths ) { |
530 | $c->del_edge( $edge ); |
531 | } |
532 | |
533 | # Calculate ranks on graph nodes |
534 | $c->calculate_ranks(); |
535 | } |
536 | |
537 | sub _get_seq { |
538 | my( $str ) = @_; |
539 | my $seq = 1; |
540 | my $lw = $str; |
541 | if( $str =~ /^(.*)(\d)\x{80}$/ ) { |
542 | ( $lw, $seq) = ( $1, $2 ); |
543 | } |
544 | return( $lw, $seq ); |
545 | } |
546 | |
547 | sub _has_next_reading { |
548 | my( $rdg, $sigil ) = @_; |
549 | return grep { $_->label eq $sigil } $rdg->outgoing(); |
550 | } |
551 | sub _has_prior_reading { |
552 | my( $rdg, $sigil ) = @_; |
553 | return grep { $_->label eq $sigil } $rdg->incoming(); |
554 | } |