Commit | Line | Data |
fa954f4c |
1 | package Text::Tradition::Parser::CollateText; |
2 | |
3 | use strict; |
4 | use warnings; |
5 | |
6 | =head1 NAME |
7 | |
8 | Text::Tradition::Parser::CollateText |
9 | |
10 | =head1 DESCRIPTION |
11 | |
12 | For an overview of the package, see the documentation for the |
13 | Text::Tradition module. |
14 | |
15 | This module is meant for use with a set of text files saved from Word docs, |
16 | which originated with the COLLATE collation program. |
17 | |
18 | =head1 SUBROUTINES |
19 | |
20 | =over |
21 | |
22 | =item B<parse> |
23 | |
24 | parse( $graph, $opts ); |
25 | |
26 | Takes an initialized graph and a hashref of options, which must include: |
27 | - 'base' - the base text referenced by the variants |
28 | - 'format' - the format of the variant list |
29 | - 'data' - the variants, in the given format. |
30 | |
31 | =cut |
32 | |
33 | my %ALL_SIGLA; |
34 | |
35 | sub parse { |
36 | my( $tradition, $opts ) = @_; |
37 | # String together the base text. |
38 | my $lineref_hash = read_stone_base( $opts->{'base'}, $tradition->collation ); |
39 | # Note the sigla. |
40 | foreach my $sigil ( @{$opts->{'sigla'}} ) { |
41 | $ALL_SIGLA{$sigil} = 1; |
82fa4d57 |
42 | $tradition->add_witness( sigil => $sigil, sourcetype => 'collation' ); |
fa954f4c |
43 | } |
44 | # Now merge on the apparatus entries. |
088e4bbe |
45 | merge_stone_apparatus( $tradition->collation, $lineref_hash, $opts->{'file'} ); |
fa954f4c |
46 | } |
47 | |
48 | =item B<read_stone_base> |
49 | |
50 | my $text_list = read_base( 'reference.txt', $collation ); |
51 | |
52 | Takes a text file and a (presumed empty) collation object, adds the words |
53 | as simple linear readings to the collation, and returns a hash of texts |
54 | with line keys. This collation is now the starting point for application of |
55 | apparatus entries in merge_base, e.g. from a CSV file or a Classical Text |
56 | Editor file. |
57 | |
58 | The hash is of the form |
59 | |
60 | { chapter_name => { line_ref => { start => node, end => node } } } |
61 | |
62 | =cut |
63 | |
64 | sub read_stone_base { |
65 | my( $base_file, $collation ) = @_; |
66 | |
67 | # This array gives the first reading for each line. We put the |
68 | # common starting point in line zero. |
69 | my $last_reading = $collation->start(); |
70 | my $lineref_hash = {}; |
71 | my $last_lineref; |
72 | |
73 | my $curr_text; |
74 | open( BASE, $base_file ) or die "Could not open file $base_file: $!"; |
088e4bbe |
75 | binmode BASE, ':utf8'; |
fa954f4c |
76 | my $i = 1; |
77 | while(<BASE>) { |
78 | # Make the readings, and connect them up for the base, but |
79 | # also save the first reading of each line in a hash for the |
80 | # purpose. |
81 | chomp; |
82 | next if /^\s+$/; # skip blank lines |
83 | s/^(\d)\x{589}/$1:/; # turn Armenian full stops into colons |
84 | if( /^TESTAMENT/ ) { |
85 | # Initialize the base hash for this section. |
86 | $lineref_hash->{$_} = {}; |
87 | $curr_text = $lineref_hash->{$_}; |
88 | next; |
89 | } |
90 | my @words = split; |
91 | my $lineref; |
92 | if( /^\d/ ) { |
93 | # The first "word" is a line reference; keep it. |
94 | $lineref = shift @words; |
95 | } else { |
96 | # Assume we are dealing with the title. |
97 | $lineref = 'Title:'; |
98 | } |
99 | |
100 | # Now turn the remaining words into readings. |
101 | my $wordref = 0; |
102 | foreach my $w ( @words ) { |
103 | my $readingref = join( ',', $lineref, ++$wordref ); |
104 | my $reading = $collation->add_reading( $readingref ); |
105 | $reading->text( $w ); |
106 | unless( exists $curr_text->{$lineref}->{'start'} ) { |
107 | $curr_text->{$lineref}->{'start'} = $reading; |
108 | } |
109 | # Add edge paths in the graph, for easier tracking when |
110 | # we start applying corrections. These paths will be |
111 | # removed when we're done. |
112 | my $path = $collation->add_path( $last_reading, $reading, |
113 | $collation->baselabel ); |
114 | $last_reading = $reading; |
115 | } |
116 | $curr_text->{$lineref}->{'end'} = $last_reading; |
117 | } |
118 | |
119 | close BASE; |
120 | # Ending point for all texts |
121 | $collation->add_path( $last_reading, $collation->end, $collation->baselabel ); |
122 | return( $lineref_hash ); |
123 | } |
124 | |
125 | =item B<merge_stone_apparatus> |
126 | |
127 | Read an apparatus as output (presumably) by Collate. It should be reasonably |
128 | regular in form, I hope. Merge the apparatus variants onto the appropriate |
129 | lemma readings. |
130 | |
131 | =cut |
132 | |
133 | sub merge_stone_apparatus { |
134 | my( $c, $lineref_hash, $file ) = @_; |
135 | |
136 | my $text_apps = {}; |
137 | my $current_text; |
138 | open( APP, $file ) or die "Could not read apparatus file $file"; |
088e4bbe |
139 | binmode APP, ':utf8'; |
fa954f4c |
140 | while( <APP> ) { |
141 | chomp; |
088e4bbe |
142 | next if /^\s*$/; |
fa954f4c |
143 | if( /^TESTAMENT/ ) { |
144 | $current_text = $lineref_hash->{$_}; |
145 | next; |
146 | } |
147 | |
148 | # Otherwise, the first word of the line is the base text line reference. |
149 | my $i = 0; |
150 | my $lineref; |
088e4bbe |
151 | if( s/^(\S+)\s+// ) { |
fa954f4c |
152 | $lineref = $1; |
153 | } else { |
154 | warn "Unrecognized line $_"; |
155 | } |
156 | my $baseline = $current_text->{$lineref}; |
157 | # The start and end readings for this line are now in $baseline->{start} |
158 | # and $baseline->{end}. |
159 | |
160 | # Now look at the apparatus entries for this line. They are |
161 | # split with |. |
088e4bbe |
162 | my @apps = split( /\s+\|\s+/ ); |
163 | my $rdg_ctr = 0; |
164 | foreach my $app ( @apps ) { |
165 | my( $lemma, $rest ) = split( /\s+\]\s+/, $app ); |
166 | next unless $rest; # Skip lines e.g. 'Chapter 2' |
fa954f4c |
167 | # Find the lemma reading. |
168 | my( $lemma_start, $lemma_end ) = |
169 | _find_reading_on_line( $c, $lemma, $baseline ); |
088e4bbe |
170 | unless( $lemma_start && $lemma_end ) { |
171 | print STDERR "Lemma $lemma not found; skipping readings $rest\n"; |
172 | next; |
173 | } |
44771cf2 |
174 | my( $rdg_start, $rdg_end, @lemma_chain ); |
175 | if( $lemma_start eq '__PRIOR__' ) { |
176 | # Deal with 'inc' readings: lemma chain should be empty, rdg_start |
177 | # is a placeholder, rdg_end is $lemma_end. |
178 | $rdg_start = _add_reading_placeholders( $c, $lemma_end ); |
179 | $rdg_end = $lemma_end; |
180 | } else { |
181 | @lemma_chain = $c->reading_sequence( $lemma_start, $lemma_end ); |
182 | # Splice in "start" and "end" placeholders on either |
183 | # side of the lemma. |
184 | ( $rdg_start, $rdg_end ) = |
185 | _add_reading_placeholders( $c, $lemma_start, $lemma_end ); |
186 | } |
fa954f4c |
187 | # For each reading, attach it to the lemma. |
088e4bbe |
188 | my @indiv = split( / /, $rest ); |
189 | my $has_rel = 0; |
190 | my %seen_sigla; |
191 | map { $seen_sigla{$_} = 0 } keys %ALL_SIGLA; |
fa954f4c |
192 | foreach my $rdg ( @indiv ) { |
193 | # Parse the string. |
194 | my( $words, $sigla, $recurse ) = parse_app_entry( $rdg ); |
088e4bbe |
195 | |
196 | # Do something really very dodgy indeed. |
197 | if( exists $sigla->{'__REL__'} && !$has_rel ) { |
198 | # Handling this has to be deferred until the end, so push it |
199 | # back onto @indiv and note that we've done so. |
200 | $has_rel = 1; |
201 | push( @indiv, $rdg ); |
202 | next; |
203 | } |
204 | |
fa954f4c |
205 | my @readings; |
088e4bbe |
206 | foreach my $rdg_word ( @$words ) { |
207 | next if $rdg_word =~ /^__/; |
44771cf2 |
208 | my $reading_id = ref( $lemma_start ) |
209 | ? $lemma_start->name : $lemma_start; |
210 | $reading_id .= '_' . $lemma_end->name . '/' . $rdg_ctr++; |
fa954f4c |
211 | my $reading = $c->add_reading( $reading_id ); |
088e4bbe |
212 | $reading->text( $rdg_word ); |
fa954f4c |
213 | push( @readings, $reading ); |
214 | } |
215 | |
216 | # Deal with any specials. |
217 | my $lemma_sequence; |
44771cf2 |
218 | if( @$words && $words->[0] eq '__LEMMA__' |
219 | && $lemma_end ne $rdg_end ) { |
220 | # It's an addition (unless lemma_end eq rdg_end, in which case |
221 | # it's an 'inc'.) Start from lemma rather than from placeholder. |
fa954f4c |
222 | $lemma_sequence = [ $lemma_end, $rdg_end ]; |
088e4bbe |
223 | } elsif ( @$words && $words->[0] eq '__TRANSPOSE__' ) { |
fa954f4c |
224 | # Hope it is only two or three words in the lemma. |
225 | # TODO figure out how we really want to handle this |
226 | @readings = reverse @lemma_chain; |
227 | } |
228 | $lemma_sequence = [ $rdg_start, @lemma_chain, $rdg_end ] |
229 | unless $lemma_sequence; |
230 | |
088e4bbe |
231 | # Note which sigla we are actually dealing with. |
232 | if( $sigla->{'__REL__'} ) { |
233 | delete $sigla->{'__REL__'}; |
234 | map { $sigla->{$_} = 1 } |
235 | grep { $seen_sigla{$_} == 0 } keys %seen_sigla; |
236 | } else { |
237 | map { $seen_sigla{$_} = 1 } keys %$sigla; |
238 | } |
239 | |
fa954f4c |
240 | # Now hook up the paths. |
088e4bbe |
241 | unshift( @readings, $lemma_sequence->[0] ); |
242 | push( @readings, $lemma_sequence->[-1] ); |
fa954f4c |
243 | foreach my $i ( 1 .. $#readings ) { |
244 | if( $recurse->{$i} ) { |
245 | my( $rwords, $rsig ) = parse_app_entry( $recurse->{$i} ); |
246 | # Get the local "lemma" sequence |
247 | my $llseq = [ $readings[$i], $readings[$i+1] ]; |
248 | if( $rwords->[0] ne '__LEMMA__' ) { |
fa954f4c |
249 | unshift( @$llseq, $readings[$i-1] ); |
088e4bbe |
250 | } # Otherwise treat it as an addition to the last word |
fa954f4c |
251 | # Create the reading nodes in $rwords |
252 | # TODO Hope we don't meet ~ in a recursion |
253 | my $local_rdg = []; |
254 | foreach my $i ( 0 .. $#$rwords ) { |
255 | next if $i == 0 && $rwords->[$i] =~ /^__/; |
44771cf2 |
256 | my $reading_id = $llseq->[0]->name . '_' . |
257 | $llseq->[-1]->name . '/' . $i; |
258 | $reading_id =~ s/ATTACH//g; |
fa954f4c |
259 | my $reading = $c->add_reading( $reading_id ); |
088e4bbe |
260 | $reading->text( $rwords->[$i] ); |
fa954f4c |
261 | push( @$local_rdg, $reading ); |
262 | } |
088e4bbe |
263 | unshift( @$local_rdg, $llseq->[0] ); |
264 | push( @$local_rdg, $llseq->[-1] ); |
fa954f4c |
265 | # Add the path(s) necessary |
088e4bbe |
266 | _add_sigil_path( $c, $rsig, $llseq, $local_rdg ); |
fa954f4c |
267 | } |
268 | } |
088e4bbe |
269 | _add_sigil_path( $c, $sigla, $lemma_sequence, \@readings ); |
fa954f4c |
270 | } # end processing of $app |
271 | } # end foreach my $app in line |
272 | } # end while <line> |
273 | |
274 | # Now reconcile all the paths in the collation, and delete our |
275 | # temporary anchor nodes. |
276 | expand_all_paths( $c ); |
277 | |
278 | # Finally, calculate the ranks we've got. |
44771cf2 |
279 | # $c->calculate_ranks; |
861c3e27 |
280 | |
281 | # Save the text for each witness so that we can ensure consistency |
282 | # later on |
283 | $tradition->collation->text_from_paths(); |
fa954f4c |
284 | } |
285 | |
286 | sub _find_reading_on_line { |
088e4bbe |
287 | my( $c, $lemma, $baseline, $prior ) = @_; |
288 | |
088e4bbe |
289 | if( $lemma eq 'totum' ) { |
44771cf2 |
290 | # We want the whole line. |
088e4bbe |
291 | return( $baseline->{'start'}, $baseline->{'end'} ); |
44771cf2 |
292 | } elsif( $lemma eq 'inc' ) { |
293 | # We want to shove things in before the line begins. |
294 | return( '__PRIOR__', $baseline->{'start'} ); |
088e4bbe |
295 | } |
fa954f4c |
296 | |
297 | my $lemma_start = $baseline->{'start'}; |
298 | my $lemma_end; |
088e4bbe |
299 | my $too_far = $c->next_reading( $baseline->{'end'} ); |
fa954f4c |
300 | my @lemma_words = split( /\s+/, $lemma ); |
301 | |
302 | my %seen; |
303 | my $scrutinize = ''; # DEBUG variable |
088e4bbe |
304 | my ( $lw, $seq ) = _get_seq( $lemma_words[0] ); |
fa954f4c |
305 | while( $lemma_start ne $too_far ) { |
306 | # Loop detection |
307 | if( $seen{ $lemma_start->name() } ) { |
308 | warn "Detected loop at " . $lemma_start->name . " for lemma $lemma"; |
309 | last; |
310 | } |
311 | $seen{ $lemma_start->name() } = 1; |
312 | |
313 | # Try to match the lemma. |
314 | # TODO move next/prior reading methods into the reading classes, |
315 | # to make this more self-contained and not need to pass $c. |
316 | my $unmatch = 0; |
088e4bbe |
317 | print STDERR "Matching ".$lemma_start->text." against $lw...\n" |
fa954f4c |
318 | if $scrutinize; |
088e4bbe |
319 | if( _norm( $lemma_start->text ) eq _norm( $lw ) ) { |
fa954f4c |
320 | # Skip it if we need a match that is not the first. |
321 | if( --$seq < 1 ) { |
322 | # Now we have to compare the rest of the words here. |
323 | if( scalar( @lemma_words ) > 1 ) { |
088e4bbe |
324 | my $next_reading = next_real_reading( $c, $lemma_start ); |
fa954f4c |
325 | my $wildcard = 0; |
326 | foreach my $w ( @lemma_words[1..$#lemma_words] ) { |
327 | if( $w eq '---' ) { |
fa954f4c |
328 | $wildcard = 1; |
329 | next; |
fa954f4c |
330 | } |
088e4bbe |
331 | if( $wildcard ) { |
332 | # This should be the word after a --- now, and the |
333 | # last lemma word. |
334 | my( $wst, $wend ) = _find_reading_on_line( $c, $w, |
335 | $baseline, $lemma_start ); |
336 | warn "Something unexpected" unless $wst eq $wend; |
337 | $lemma_end = $wend; |
338 | next; |
339 | } |
340 | |
341 | # If we got this far, there is no wildcard. We must |
342 | # match each word in sequence. |
343 | my( $nlw, $nseq ) = _get_seq( $w ); |
fa954f4c |
344 | printf STDERR "Now matching %s against %s\n", |
088e4bbe |
345 | $next_reading->text, $nlw |
fa954f4c |
346 | if $scrutinize; |
088e4bbe |
347 | if( _norm( $nlw ) eq _norm( $next_reading->text ) ) { |
348 | $lemma_end = $next_reading; |
349 | $next_reading = $c->next_reading( $lemma_end ); |
350 | } else { |
fa954f4c |
351 | $unmatch = 1; |
352 | last; |
fa954f4c |
353 | } |
354 | } |
355 | } else { # single-word match, easy. |
356 | $lemma_end = $lemma_start; |
357 | } |
358 | } else { # we need the Nth match and aren't there yet |
359 | $unmatch = 1; |
360 | } |
088e4bbe |
361 | $unmatch = 1 if $prior && !$seen{$prior->name}; |
fa954f4c |
362 | } |
363 | last unless ( $unmatch || !defined( $lemma_end ) ); |
364 | $lemma_end = undef; |
365 | $lemma_start = $c->next_reading( $lemma_start ); |
366 | } |
367 | |
368 | unless( $lemma_end ) { |
369 | warn "No match found for @lemma_words"; |
370 | return undef; |
371 | } |
372 | return( $lemma_start, $lemma_end ); |
373 | } |
374 | |
375 | sub _add_reading_placeholders { |
376 | my( $collation, $lemma_start, $lemma_end ) = @_; |
377 | # We will splice in a 'begin' and 'end' marker on either side of the |
378 | # lemma, as sort of a double-endpoint attachment in the graph. |
088e4bbe |
379 | # Note that all of this assumes we have a linear base graph at this |
380 | # point, and no diverging readings on the lemmas. |
381 | |
382 | my $start_node = $collation->prior_reading( $lemma_start ); |
383 | unless( $start_node->name =~ /ATTACH/ ) { |
384 | my $sn_id = '#ATTACH_' . $lemma_start->name . '_START#'; |
385 | my $prior = $start_node; |
386 | $start_node = $collation->add_reading( $sn_id ); |
387 | $start_node->is_meta( 1 ); |
388 | $collation->graph->del_edge( $collation->graph->edge( $prior, $lemma_start ) ); |
389 | $collation->add_path( $prior, $start_node, $collation->baselabel ); |
390 | $collation->add_path( $start_node, $lemma_start, $collation->baselabel ); |
fa954f4c |
391 | } |
44771cf2 |
392 | return $start_node unless $lemma_end; |
393 | |
fa954f4c |
394 | # Now the converse for the end. |
088e4bbe |
395 | my $end_node = $collation->next_reading( $lemma_end ); |
396 | unless( $end_node->name =~ /ATTACH/ ) { |
397 | my $en_id = '#ATTACH_' . $lemma_end->name . '_END#'; |
398 | my $next = $end_node; |
399 | $end_node = $collation->add_reading( $en_id ); |
400 | $end_node->is_meta( 1 ); |
401 | $collation->graph->del_edge( $collation->graph->edge( $lemma_end, $next ) ); |
402 | $collation->add_path( $lemma_end, $end_node, $collation->baselabel ); |
403 | $collation->add_path( $end_node, $next, $collation->baselabel ); |
fa954f4c |
404 | } |
088e4bbe |
405 | |
406 | return( $start_node, $end_node ); |
fa954f4c |
407 | } |
408 | |
409 | # Function to parse an apparatus reading string, with reference to no other |
410 | # data. Need to do this separately as readings can include readings (ugh). |
411 | # Try to give whatever information we might need, including recursive app |
412 | # entries that might need to be parsed. |
413 | |
414 | sub parse_app_entry { |
415 | my( $rdg, ) = @_; |
416 | $rdg =~ s/^\s+//; |
417 | $rdg =~ s/\s+$//; |
418 | next unless $rdg; # just in case |
419 | my @words = split( /\s+/, $rdg ); |
420 | # Zero or more sigils e.g. +, followed by Armenian, |
421 | # followed by (possibly modified) sigla, followed by |
422 | # optional : with note. |
423 | my $is_add; |
424 | my $is_omission; |
425 | my $is_transposition; |
088e4bbe |
426 | my $is_base; |
427 | my $skip; |
fa954f4c |
428 | my @reading; |
088e4bbe |
429 | my $reading_sigla = {}; |
fa954f4c |
430 | my $recursed; |
088e4bbe |
431 | my $sig_regex = join( '|', sort { length $b <=> length $a } keys %ALL_SIGLA ); |
fa954f4c |
432 | while( @words ) { |
433 | my $bit = shift @words; |
434 | if( $bit eq '+' ) { |
435 | $is_add = 1; |
436 | } elsif( $bit eq 'om' ) { |
437 | $is_omission = 1; |
438 | } elsif( $bit eq '~' ) { |
439 | $is_transposition = 1; |
440 | } elsif( $bit =~ /\p{Armenian}/ ) { |
441 | warn "Found text in omission?!" if $is_omission; |
442 | push( @reading, $bit ); |
443 | } elsif( $bit eq ':' ) { |
444 | # Stop processing. |
445 | last; |
088e4bbe |
446 | } elsif( $bit =~ /^\(/ ) { |
fa954f4c |
447 | # It's a recursive reading within a reading. Lemmatize what we |
448 | # have so far and grab the extra. |
088e4bbe |
449 | my @new = ( $bit ); |
fa954f4c |
450 | until( $new[-1] =~ /\)$/ ) { |
451 | push( @new, shift @words ); |
452 | } |
453 | my $recursed_reading = join( ' ', @new ); |
088e4bbe |
454 | $recursed_reading =~ s/^\((.*)\)/$1/; |
fa954f4c |
455 | # This recursive entry refers to the last reading word(s) we |
456 | # saw. Push its index+1. We will have to come back to parse |
457 | # it when we are dealing with the main reading. |
458 | # TODO handle () as first element |
459 | # TODO handle - as suffix to add, i.e. make new word |
460 | $recursed->{@reading} = $recursed_reading; |
088e4bbe |
461 | } elsif( $bit =~ /^($sig_regex)(.*)$/ ) { |
fa954f4c |
462 | # It must be a sigil. |
463 | my( $sigil, $mod ) = ( $1, $2 ); |
464 | if( $mod eq "\x{80}" ) { |
088e4bbe |
465 | $reading_sigla->{$sigil} = '_PC_'; |
466 | $ALL_SIGLA{$sigil} = 2; # a pre- and post-corr version exists |
fa954f4c |
467 | } elsif( $mod eq '*' ) { |
088e4bbe |
468 | $reading_sigla->{$sigil} = '_AC_'; |
469 | $ALL_SIGLA{$sigil} = 2; # a pre- and post-corr version exists |
fa954f4c |
470 | } else { |
088e4bbe |
471 | $reading_sigla->{$sigil} = 1 unless $mod; # skip secondhand corrections |
fa954f4c |
472 | } |
088e4bbe |
473 | } elsif( $bit eq 'rel' ) { |
474 | # The anti-reading. All sigla except those cited. |
475 | $reading_sigla->{'__REL__'} = 1; |
476 | } elsif( $bit eq 'ed' ) { |
477 | # An emendation. TODO make sure all other sigla appear in readings? |
478 | $skip = 1; |
479 | last; |
fa954f4c |
480 | } elsif( $bit =~ /transpos/ ) { |
481 | # There are some transpositions not coded rigorously; skip them. |
482 | warn "Found hard transposition in $rdg; fix manually"; |
483 | last; |
484 | } else { |
485 | warn "Not sure what to do with bit $bit in $rdg"; |
088e4bbe |
486 | $skip = 1; |
487 | last; |
fa954f4c |
488 | } |
489 | } |
088e4bbe |
490 | |
491 | return( [], {}, {} ) if $skip; |
fa954f4c |
492 | # Transmogrify the reading if necessary. |
493 | unshift( @reading, '__LEMMA__' ) if $is_add; |
494 | unshift( @reading, '__TRANSPOSE__' ) if $is_transposition; |
495 | @reading = () if $is_omission; |
088e4bbe |
496 | unless( @reading || $is_omission ) { |
497 | # It was just sigla on a line, meaning the base changed. Thus |
498 | # the reading is the lemma. |
499 | unshift( @reading, '__LEMMA__' ); |
500 | } |
fa954f4c |
501 | |
502 | return( \@reading, $reading_sigla, $recursed ); |
503 | } |
504 | |
505 | # Add a path for the specified sigla to connect the reading sequence. |
506 | # Add an a.c. path to the base sequence if we have an explicitly p.c. |
507 | # reading. |
508 | # Also handle the paths for sigla we have already added in recursive |
509 | # apparatus readings (i.e. don't add a path if one already exists.) |
510 | |
511 | sub _add_sigil_path { |
512 | my( $c, $sigla, $base_sequence, $reading_sequence ) = @_; |
513 | my %skip; |
514 | foreach my $sig ( keys %$sigla ) { |
515 | my $use_sig = $sigla->{$sig} eq '_AC_' ? $sig.$c->ac_label : $sig; |
088e4bbe |
516 | foreach my $i ( 0 .. $#{$reading_sequence}-1 ) { |
fa954f4c |
517 | if( $skip{$use_sig} ) { |
088e4bbe |
518 | next if !_has_prior_reading( $reading_sequence->[$i], $use_sig ); |
fa954f4c |
519 | $skip{$use_sig} = 0; |
088e4bbe |
520 | } |
521 | if( _has_next_reading( $reading_sequence->[$i], $use_sig ) ) { |
fa954f4c |
522 | $skip{$use_sig} = 1; |
523 | next; |
524 | } |
088e4bbe |
525 | $c->add_path( $reading_sequence->[$i], $reading_sequence->[$i+1], $use_sig ); |
fa954f4c |
526 | } |
527 | if( $sigla->{$sig} eq '_PC_') { |
088e4bbe |
528 | $use_sig = $sig.$c->ac_label; |
529 | foreach my $i ( 0 .. $#{$base_sequence}-1 ) { |
fa954f4c |
530 | if( $skip{$use_sig} ) { |
088e4bbe |
531 | next if !_has_prior_reading( $reading_sequence->[$i], $use_sig ); |
fa954f4c |
532 | $skip{$use_sig} = 0; |
088e4bbe |
533 | } |
534 | if( _has_next_reading( $reading_sequence->[$i], $use_sig ) ) { |
fa954f4c |
535 | $skip{$use_sig} = 1; |
536 | next; |
537 | } |
088e4bbe |
538 | $c->add_path( $base_sequence->[$i], $base_sequence->[$i+1], $use_sig ); |
fa954f4c |
539 | } |
540 | } |
541 | } |
542 | } |
543 | |
44771cf2 |
544 | # Walk the collation for all witness paths, delete the ATTACH anchor nodes, |
545 | # and then nuke and re-draw all edges (thus getting rid of the base). |
fa954f4c |
546 | |
547 | sub expand_all_paths { |
548 | my( $c ) = @_; |
549 | |
44771cf2 |
550 | # Walk the collation and fish out the paths for each witness |
fa954f4c |
551 | foreach my $sig ( keys %ALL_SIGLA ) { |
552 | my $wit = $c->tradition->witness( $sig ); |
44771cf2 |
553 | my @path = grep { $_->name !~ /ATTACH/ } |
554 | $c->reading_sequence( $c->start, $c->end, $sig ); |
fa954f4c |
555 | $wit->path( \@path ); |
556 | if( $ALL_SIGLA{$sig} > 1 ) { |
44771cf2 |
557 | my @ac_path = grep { $_->name !~ /ATTACH/ } |
861c3e27 |
558 | $c->reading_sequence( $c->start, $c->end, $sig.$c->ac_label ); |
44771cf2 |
559 | $wit->uncorrected_path( \@ac_path ); |
fa954f4c |
560 | } |
44771cf2 |
561 | } |
fa954f4c |
562 | |
44771cf2 |
563 | # Delete the anchors |
564 | foreach my $anchor ( grep { $_->name =~ /ATTACH/ } $c->readings ) { |
565 | $c->del_reading( $anchor ); |
fa954f4c |
566 | } |
44771cf2 |
567 | # Delete all edges |
568 | map { $c->del_path( $_ ) } $c->paths; |
fa954f4c |
569 | |
44771cf2 |
570 | # Make the path edges |
571 | $c->make_witness_paths(); |
fa954f4c |
572 | } |
573 | |
574 | sub _get_seq { |
575 | my( $str ) = @_; |
576 | my $seq = 1; |
577 | my $lw = $str; |
088e4bbe |
578 | if( $str =~ /^(.*)(\d)\x{b0}$/ ) { |
fa954f4c |
579 | ( $lw, $seq) = ( $1, $2 ); |
580 | } |
581 | return( $lw, $seq ); |
582 | } |
583 | |
088e4bbe |
584 | # Normalize to lowercase, no punct |
585 | sub _norm { |
586 | my( $str ) = @_; |
587 | $str =~ s/[^[:alnum:]]//g; |
588 | return lc( $str ); |
589 | } |
590 | |
fa954f4c |
591 | sub _has_next_reading { |
592 | my( $rdg, $sigil ) = @_; |
593 | return grep { $_->label eq $sigil } $rdg->outgoing(); |
594 | } |
595 | sub _has_prior_reading { |
596 | my( $rdg, $sigil ) = @_; |
597 | return grep { $_->label eq $sigil } $rdg->incoming(); |
088e4bbe |
598 | } |
599 | sub next_real_reading { |
600 | my( $c, $rdg ) = @_; |
601 | while( my $r = $c->next_reading( $rdg ) ) { |
602 | return $r unless $r->is_meta; |
603 | return $r if $r eq $c->end; |
604 | $rdg = $r; |
605 | } |
606 | } |
607 | # For debugging |
608 | sub rstr { |
609 | my @l = @_; |
610 | if( ref( $_[0] ) eq 'ARRAY' ) { |
44771cf2 |
611 | @l = @{$_[0]}; |
088e4bbe |
612 | } |
613 | my $str = join( ' ', map { $_->text } @l ); |
614 | return $str; |
615 | } |
616 | |
617 | 1; |