Commit | Line | Data |
fa954f4c |
1 | package Text::Tradition::Parser::CollateText; |
2 | |
3 | use strict; |
4 | use warnings; |
5 | |
6 | =head1 NAME |
7 | |
8 | Text::Tradition::Parser::CollateText |
9 | |
10 | =head1 DESCRIPTION |
11 | |
12 | For an overview of the package, see the documentation for the |
13 | Text::Tradition module. |
14 | |
15 | This module is meant for use with a set of text files saved from Word docs, |
16 | which originated with the COLLATE collation program. |
17 | |
18 | =head1 SUBROUTINES |
19 | |
20 | =over |
21 | |
22 | =item B<parse> |
23 | |
24 | parse( $graph, $opts ); |
25 | |
26 | Takes an initialized graph and a hashref of options, which must include: |
27 | - 'base' - the base text referenced by the variants |
28 | - 'format' - the format of the variant list |
29 | - 'data' - the variants, in the given format. |
30 | |
31 | =cut |
32 | |
33 | my %ALL_SIGLA; |
34 | |
35 | sub parse { |
36 | my( $tradition, $opts ) = @_; |
37 | # String together the base text. |
38 | my $lineref_hash = read_stone_base( $opts->{'base'}, $tradition->collation ); |
39 | # Note the sigla. |
40 | foreach my $sigil ( @{$opts->{'sigla'}} ) { |
41 | $ALL_SIGLA{$sigil} = 1; |
42 | $tradition->add_witness( 'sigil' => $sigil ); |
43 | } |
44 | # Now merge on the apparatus entries. |
088e4bbe |
45 | merge_stone_apparatus( $tradition->collation, $lineref_hash, $opts->{'file'} ); |
fa954f4c |
46 | } |
47 | |
48 | =item B<read_stone_base> |
49 | |
50 | my $text_list = read_base( 'reference.txt', $collation ); |
51 | |
52 | Takes a text file and a (presumed empty) collation object, adds the words |
53 | as simple linear readings to the collation, and returns a hash of texts |
54 | with line keys. This collation is now the starting point for application of |
55 | apparatus entries in merge_base, e.g. from a CSV file or a Classical Text |
56 | Editor file. |
57 | |
58 | The hash is of the form |
59 | |
60 | { chapter_name => { line_ref => { start => node, end => node } } } |
61 | |
62 | =cut |
63 | |
64 | sub read_stone_base { |
65 | my( $base_file, $collation ) = @_; |
66 | |
67 | # This array gives the first reading for each line. We put the |
68 | # common starting point in line zero. |
69 | my $last_reading = $collation->start(); |
70 | my $lineref_hash = {}; |
71 | my $last_lineref; |
72 | |
73 | my $curr_text; |
74 | open( BASE, $base_file ) or die "Could not open file $base_file: $!"; |
088e4bbe |
75 | binmode BASE, ':utf8'; |
fa954f4c |
76 | my $i = 1; |
77 | while(<BASE>) { |
78 | # Make the readings, and connect them up for the base, but |
79 | # also save the first reading of each line in a hash for the |
80 | # purpose. |
81 | chomp; |
82 | next if /^\s+$/; # skip blank lines |
83 | s/^(\d)\x{589}/$1:/; # turn Armenian full stops into colons |
84 | if( /^TESTAMENT/ ) { |
85 | # Initialize the base hash for this section. |
86 | $lineref_hash->{$_} = {}; |
87 | $curr_text = $lineref_hash->{$_}; |
88 | next; |
89 | } |
90 | my @words = split; |
91 | my $lineref; |
92 | if( /^\d/ ) { |
93 | # The first "word" is a line reference; keep it. |
94 | $lineref = shift @words; |
95 | } else { |
96 | # Assume we are dealing with the title. |
97 | $lineref = 'Title:'; |
98 | } |
99 | |
100 | # Now turn the remaining words into readings. |
101 | my $wordref = 0; |
102 | foreach my $w ( @words ) { |
103 | my $readingref = join( ',', $lineref, ++$wordref ); |
104 | my $reading = $collation->add_reading( $readingref ); |
105 | $reading->text( $w ); |
106 | unless( exists $curr_text->{$lineref}->{'start'} ) { |
107 | $curr_text->{$lineref}->{'start'} = $reading; |
108 | } |
109 | # Add edge paths in the graph, for easier tracking when |
110 | # we start applying corrections. These paths will be |
111 | # removed when we're done. |
112 | my $path = $collation->add_path( $last_reading, $reading, |
113 | $collation->baselabel ); |
114 | $last_reading = $reading; |
115 | } |
116 | $curr_text->{$lineref}->{'end'} = $last_reading; |
117 | } |
118 | |
119 | close BASE; |
120 | # Ending point for all texts |
121 | $collation->add_path( $last_reading, $collation->end, $collation->baselabel ); |
122 | return( $lineref_hash ); |
123 | } |
124 | |
125 | =item B<merge_stone_apparatus> |
126 | |
127 | Read an apparatus as output (presumably) by Collate. It should be reasonably |
128 | regular in form, I hope. Merge the apparatus variants onto the appropriate |
129 | lemma readings. |
130 | |
131 | =cut |
132 | |
133 | sub merge_stone_apparatus { |
134 | my( $c, $lineref_hash, $file ) = @_; |
135 | |
136 | my $text_apps = {}; |
137 | my $current_text; |
138 | open( APP, $file ) or die "Could not read apparatus file $file"; |
088e4bbe |
139 | binmode APP, ':utf8'; |
fa954f4c |
140 | while( <APP> ) { |
141 | chomp; |
088e4bbe |
142 | next if /^\s*$/; |
fa954f4c |
143 | if( /^TESTAMENT/ ) { |
144 | $current_text = $lineref_hash->{$_}; |
145 | next; |
146 | } |
147 | |
148 | # Otherwise, the first word of the line is the base text line reference. |
149 | my $i = 0; |
150 | my $lineref; |
088e4bbe |
151 | if( s/^(\S+)\s+// ) { |
fa954f4c |
152 | $lineref = $1; |
153 | } else { |
154 | warn "Unrecognized line $_"; |
155 | } |
156 | my $baseline = $current_text->{$lineref}; |
157 | # The start and end readings for this line are now in $baseline->{start} |
158 | # and $baseline->{end}. |
159 | |
160 | # Now look at the apparatus entries for this line. They are |
161 | # split with |. |
088e4bbe |
162 | my @apps = split( /\s+\|\s+/ ); |
163 | my $rdg_ctr = 0; |
164 | foreach my $app ( @apps ) { |
165 | my( $lemma, $rest ) = split( /\s+\]\s+/, $app ); |
166 | next unless $rest; # Skip lines e.g. 'Chapter 2' |
fa954f4c |
167 | # Find the lemma reading. |
168 | my( $lemma_start, $lemma_end ) = |
169 | _find_reading_on_line( $c, $lemma, $baseline ); |
088e4bbe |
170 | unless( $lemma_start && $lemma_end ) { |
171 | print STDERR "Lemma $lemma not found; skipping readings $rest\n"; |
172 | next; |
173 | } |
fa954f4c |
174 | my @lemma_chain = $c->reading_sequence( $lemma_start, $lemma_end ); |
175 | |
176 | # Splice in "start" and "end" placeholders on either |
177 | # side of the lemma. |
178 | my ( $rdg_start, $rdg_end ) = |
179 | _add_reading_placeholders( $c, $lemma_start, $lemma_end ); |
180 | |
181 | # For each reading, attach it to the lemma. |
088e4bbe |
182 | my @indiv = split( / /, $rest ); |
183 | my $has_rel = 0; |
184 | my %seen_sigla; |
185 | map { $seen_sigla{$_} = 0 } keys %ALL_SIGLA; |
fa954f4c |
186 | foreach my $rdg ( @indiv ) { |
187 | # Parse the string. |
188 | my( $words, $sigla, $recurse ) = parse_app_entry( $rdg ); |
088e4bbe |
189 | |
190 | # Do something really very dodgy indeed. |
191 | if( exists $sigla->{'__REL__'} && !$has_rel ) { |
192 | # Handling this has to be deferred until the end, so push it |
193 | # back onto @indiv and note that we've done so. |
194 | $has_rel = 1; |
195 | push( @indiv, $rdg ); |
196 | next; |
197 | } |
198 | |
fa954f4c |
199 | my @readings; |
088e4bbe |
200 | foreach my $rdg_word ( @$words ) { |
201 | next if $rdg_word =~ /^__/; |
202 | my $reading_id = $lemma_start->name . '_' . $lemma_end->name |
203 | . '/' . $rdg_ctr++; |
fa954f4c |
204 | my $reading = $c->add_reading( $reading_id ); |
088e4bbe |
205 | $reading->text( $rdg_word ); |
fa954f4c |
206 | push( @readings, $reading ); |
207 | } |
208 | |
209 | # Deal with any specials. |
210 | my $lemma_sequence; |
088e4bbe |
211 | if( @$words && $words->[0] eq '__LEMMA__' ) { |
fa954f4c |
212 | $lemma_sequence = [ $lemma_end, $rdg_end ]; |
088e4bbe |
213 | } elsif ( @$words && $words->[0] eq '__TRANSPOSE__' ) { |
fa954f4c |
214 | # Hope it is only two or three words in the lemma. |
215 | # TODO figure out how we really want to handle this |
216 | @readings = reverse @lemma_chain; |
217 | } |
218 | $lemma_sequence = [ $rdg_start, @lemma_chain, $rdg_end ] |
219 | unless $lemma_sequence; |
220 | |
088e4bbe |
221 | # Note which sigla we are actually dealing with. |
222 | if( $sigla->{'__REL__'} ) { |
223 | delete $sigla->{'__REL__'}; |
224 | map { $sigla->{$_} = 1 } |
225 | grep { $seen_sigla{$_} == 0 } keys %seen_sigla; |
226 | } else { |
227 | map { $seen_sigla{$_} = 1 } keys %$sigla; |
228 | } |
229 | |
fa954f4c |
230 | # Now hook up the paths. |
088e4bbe |
231 | unshift( @readings, $lemma_sequence->[0] ); |
232 | push( @readings, $lemma_sequence->[-1] ); |
fa954f4c |
233 | foreach my $i ( 1 .. $#readings ) { |
234 | if( $recurse->{$i} ) { |
235 | my( $rwords, $rsig ) = parse_app_entry( $recurse->{$i} ); |
236 | # Get the local "lemma" sequence |
237 | my $llseq = [ $readings[$i], $readings[$i+1] ]; |
238 | if( $rwords->[0] ne '__LEMMA__' ) { |
fa954f4c |
239 | unshift( @$llseq, $readings[$i-1] ); |
088e4bbe |
240 | } # Otherwise treat it as an addition to the last word |
fa954f4c |
241 | # Create the reading nodes in $rwords |
242 | # TODO Hope we don't meet ~ in a recursion |
243 | my $local_rdg = []; |
088e4bbe |
244 | $DB::single = 1; |
fa954f4c |
245 | foreach my $i ( 0 .. $#$rwords ) { |
246 | next if $i == 0 && $rwords->[$i] =~ /^__/; |
247 | my $reading_id = $llseq->[0]->text . '_' . |
248 | $llseq->[-1]->text . '/' . $i; |
088e4bbe |
249 | $DB::single = 1 if $reading_id =~ /ATTACH/; |
fa954f4c |
250 | my $reading = $c->add_reading( $reading_id ); |
088e4bbe |
251 | $reading->text( $rwords->[$i] ); |
fa954f4c |
252 | push( @$local_rdg, $reading ); |
253 | } |
088e4bbe |
254 | unshift( @$local_rdg, $llseq->[0] ); |
255 | push( @$local_rdg, $llseq->[-1] ); |
fa954f4c |
256 | # Add the path(s) necessary |
088e4bbe |
257 | _add_sigil_path( $c, $rsig, $llseq, $local_rdg ); |
fa954f4c |
258 | } |
259 | } |
088e4bbe |
260 | _add_sigil_path( $c, $sigla, $lemma_sequence, \@readings ); |
fa954f4c |
261 | } # end processing of $app |
262 | } # end foreach my $app in line |
263 | } # end while <line> |
264 | |
265 | # Now reconcile all the paths in the collation, and delete our |
266 | # temporary anchor nodes. |
267 | expand_all_paths( $c ); |
268 | |
269 | # Finally, calculate the ranks we've got. |
270 | $c->calculate_ranks; |
271 | } |
272 | |
273 | sub _find_reading_on_line { |
088e4bbe |
274 | my( $c, $lemma, $baseline, $prior ) = @_; |
275 | |
276 | # We might want the whole line. |
277 | if( $lemma eq 'totum' ) { |
278 | return( $baseline->{'start'}, $baseline->{'end'} ); |
279 | } |
fa954f4c |
280 | |
281 | my $lemma_start = $baseline->{'start'}; |
282 | my $lemma_end; |
088e4bbe |
283 | my $too_far = $c->next_reading( $baseline->{'end'} ); |
fa954f4c |
284 | my @lemma_words = split( /\s+/, $lemma ); |
285 | |
286 | my %seen; |
287 | my $scrutinize = ''; # DEBUG variable |
088e4bbe |
288 | my ( $lw, $seq ) = _get_seq( $lemma_words[0] ); |
fa954f4c |
289 | while( $lemma_start ne $too_far ) { |
290 | # Loop detection |
291 | if( $seen{ $lemma_start->name() } ) { |
292 | warn "Detected loop at " . $lemma_start->name . " for lemma $lemma"; |
293 | last; |
294 | } |
295 | $seen{ $lemma_start->name() } = 1; |
296 | |
297 | # Try to match the lemma. |
298 | # TODO move next/prior reading methods into the reading classes, |
299 | # to make this more self-contained and not need to pass $c. |
300 | my $unmatch = 0; |
088e4bbe |
301 | print STDERR "Matching ".$lemma_start->text." against $lw...\n" |
fa954f4c |
302 | if $scrutinize; |
088e4bbe |
303 | if( _norm( $lemma_start->text ) eq _norm( $lw ) ) { |
fa954f4c |
304 | # Skip it if we need a match that is not the first. |
305 | if( --$seq < 1 ) { |
306 | # Now we have to compare the rest of the words here. |
307 | if( scalar( @lemma_words ) > 1 ) { |
088e4bbe |
308 | my $next_reading = next_real_reading( $c, $lemma_start ); |
fa954f4c |
309 | my $wildcard = 0; |
310 | foreach my $w ( @lemma_words[1..$#lemma_words] ) { |
311 | if( $w eq '---' ) { |
fa954f4c |
312 | $wildcard = 1; |
313 | next; |
fa954f4c |
314 | } |
088e4bbe |
315 | if( $wildcard ) { |
316 | # This should be the word after a --- now, and the |
317 | # last lemma word. |
318 | my( $wst, $wend ) = _find_reading_on_line( $c, $w, |
319 | $baseline, $lemma_start ); |
320 | warn "Something unexpected" unless $wst eq $wend; |
321 | $lemma_end = $wend; |
322 | next; |
323 | } |
324 | |
325 | # If we got this far, there is no wildcard. We must |
326 | # match each word in sequence. |
327 | my( $nlw, $nseq ) = _get_seq( $w ); |
fa954f4c |
328 | printf STDERR "Now matching %s against %s\n", |
088e4bbe |
329 | $next_reading->text, $nlw |
fa954f4c |
330 | if $scrutinize; |
088e4bbe |
331 | if( _norm( $nlw ) eq _norm( $next_reading->text ) ) { |
332 | $lemma_end = $next_reading; |
333 | $next_reading = $c->next_reading( $lemma_end ); |
334 | } else { |
fa954f4c |
335 | $unmatch = 1; |
336 | last; |
fa954f4c |
337 | } |
338 | } |
339 | } else { # single-word match, easy. |
340 | $lemma_end = $lemma_start; |
341 | } |
342 | } else { # we need the Nth match and aren't there yet |
343 | $unmatch = 1; |
344 | } |
088e4bbe |
345 | $unmatch = 1 if $prior && !$seen{$prior->name}; |
fa954f4c |
346 | } |
347 | last unless ( $unmatch || !defined( $lemma_end ) ); |
348 | $lemma_end = undef; |
349 | $lemma_start = $c->next_reading( $lemma_start ); |
350 | } |
351 | |
352 | unless( $lemma_end ) { |
353 | warn "No match found for @lemma_words"; |
354 | return undef; |
355 | } |
356 | return( $lemma_start, $lemma_end ); |
357 | } |
358 | |
359 | sub _add_reading_placeholders { |
360 | my( $collation, $lemma_start, $lemma_end ) = @_; |
361 | # We will splice in a 'begin' and 'end' marker on either side of the |
362 | # lemma, as sort of a double-endpoint attachment in the graph. |
088e4bbe |
363 | # Note that all of this assumes we have a linear base graph at this |
364 | # point, and no diverging readings on the lemmas. |
365 | |
366 | my $start_node = $collation->prior_reading( $lemma_start ); |
367 | unless( $start_node->name =~ /ATTACH/ ) { |
368 | my $sn_id = '#ATTACH_' . $lemma_start->name . '_START#'; |
369 | my $prior = $start_node; |
370 | $start_node = $collation->add_reading( $sn_id ); |
371 | $start_node->is_meta( 1 ); |
372 | $collation->graph->del_edge( $collation->graph->edge( $prior, $lemma_start ) ); |
373 | $collation->add_path( $prior, $start_node, $collation->baselabel ); |
374 | $collation->add_path( $start_node, $lemma_start, $collation->baselabel ); |
fa954f4c |
375 | } |
088e4bbe |
376 | |
fa954f4c |
377 | # Now the converse for the end. |
088e4bbe |
378 | my $end_node = $collation->next_reading( $lemma_end ); |
379 | unless( $end_node->name =~ /ATTACH/ ) { |
380 | my $en_id = '#ATTACH_' . $lemma_end->name . '_END#'; |
381 | my $next = $end_node; |
382 | $end_node = $collation->add_reading( $en_id ); |
383 | $end_node->is_meta( 1 ); |
384 | $collation->graph->del_edge( $collation->graph->edge( $lemma_end, $next ) ); |
385 | $collation->add_path( $lemma_end, $end_node, $collation->baselabel ); |
386 | $collation->add_path( $end_node, $next, $collation->baselabel ); |
fa954f4c |
387 | } |
088e4bbe |
388 | |
389 | return( $start_node, $end_node ); |
fa954f4c |
390 | } |
391 | |
392 | # Function to parse an apparatus reading string, with reference to no other |
393 | # data. Need to do this separately as readings can include readings (ugh). |
394 | # Try to give whatever information we might need, including recursive app |
395 | # entries that might need to be parsed. |
396 | |
397 | sub parse_app_entry { |
398 | my( $rdg, ) = @_; |
399 | $rdg =~ s/^\s+//; |
400 | $rdg =~ s/\s+$//; |
401 | next unless $rdg; # just in case |
402 | my @words = split( /\s+/, $rdg ); |
403 | # Zero or more sigils e.g. +, followed by Armenian, |
404 | # followed by (possibly modified) sigla, followed by |
405 | # optional : with note. |
406 | my $is_add; |
407 | my $is_omission; |
408 | my $is_transposition; |
088e4bbe |
409 | my $is_base; |
410 | my $skip; |
fa954f4c |
411 | my @reading; |
088e4bbe |
412 | my $reading_sigla = {}; |
fa954f4c |
413 | my $recursed; |
088e4bbe |
414 | my $sig_regex = join( '|', sort { length $b <=> length $a } keys %ALL_SIGLA ); |
fa954f4c |
415 | while( @words ) { |
416 | my $bit = shift @words; |
417 | if( $bit eq '+' ) { |
418 | $is_add = 1; |
419 | } elsif( $bit eq 'om' ) { |
420 | $is_omission = 1; |
421 | } elsif( $bit eq '~' ) { |
422 | $is_transposition = 1; |
423 | } elsif( $bit =~ /\p{Armenian}/ ) { |
424 | warn "Found text in omission?!" if $is_omission; |
425 | push( @reading, $bit ); |
426 | } elsif( $bit eq ':' ) { |
427 | # Stop processing. |
428 | last; |
088e4bbe |
429 | } elsif( $bit =~ /^\(/ ) { |
fa954f4c |
430 | # It's a recursive reading within a reading. Lemmatize what we |
431 | # have so far and grab the extra. |
088e4bbe |
432 | my @new = ( $bit ); |
fa954f4c |
433 | until( $new[-1] =~ /\)$/ ) { |
434 | push( @new, shift @words ); |
435 | } |
436 | my $recursed_reading = join( ' ', @new ); |
088e4bbe |
437 | $recursed_reading =~ s/^\((.*)\)/$1/; |
fa954f4c |
438 | # This recursive entry refers to the last reading word(s) we |
439 | # saw. Push its index+1. We will have to come back to parse |
440 | # it when we are dealing with the main reading. |
441 | # TODO handle () as first element |
442 | # TODO handle - as suffix to add, i.e. make new word |
443 | $recursed->{@reading} = $recursed_reading; |
088e4bbe |
444 | } elsif( $bit =~ /^($sig_regex)(.*)$/ ) { |
fa954f4c |
445 | # It must be a sigil. |
446 | my( $sigil, $mod ) = ( $1, $2 ); |
447 | if( $mod eq "\x{80}" ) { |
088e4bbe |
448 | $reading_sigla->{$sigil} = '_PC_'; |
449 | $ALL_SIGLA{$sigil} = 2; # a pre- and post-corr version exists |
fa954f4c |
450 | } elsif( $mod eq '*' ) { |
088e4bbe |
451 | $reading_sigla->{$sigil} = '_AC_'; |
452 | $ALL_SIGLA{$sigil} = 2; # a pre- and post-corr version exists |
fa954f4c |
453 | } else { |
088e4bbe |
454 | $reading_sigla->{$sigil} = 1 unless $mod; # skip secondhand corrections |
fa954f4c |
455 | } |
088e4bbe |
456 | } elsif( $bit eq 'rel' ) { |
457 | # The anti-reading. All sigla except those cited. |
458 | $reading_sigla->{'__REL__'} = 1; |
459 | } elsif( $bit eq 'ed' ) { |
460 | # An emendation. TODO make sure all other sigla appear in readings? |
461 | $skip = 1; |
462 | last; |
fa954f4c |
463 | } elsif( $bit =~ /transpos/ ) { |
464 | # There are some transpositions not coded rigorously; skip them. |
465 | warn "Found hard transposition in $rdg; fix manually"; |
466 | last; |
467 | } else { |
468 | warn "Not sure what to do with bit $bit in $rdg"; |
088e4bbe |
469 | $skip = 1; |
470 | last; |
fa954f4c |
471 | } |
472 | } |
088e4bbe |
473 | |
474 | return( [], {}, {} ) if $skip; |
fa954f4c |
475 | # Transmogrify the reading if necessary. |
476 | unshift( @reading, '__LEMMA__' ) if $is_add; |
477 | unshift( @reading, '__TRANSPOSE__' ) if $is_transposition; |
478 | @reading = () if $is_omission; |
088e4bbe |
479 | unless( @reading || $is_omission ) { |
480 | # It was just sigla on a line, meaning the base changed. Thus |
481 | # the reading is the lemma. |
482 | unshift( @reading, '__LEMMA__' ); |
483 | } |
fa954f4c |
484 | |
485 | return( \@reading, $reading_sigla, $recursed ); |
486 | } |
487 | |
488 | # Add a path for the specified sigla to connect the reading sequence. |
489 | # Add an a.c. path to the base sequence if we have an explicitly p.c. |
490 | # reading. |
491 | # Also handle the paths for sigla we have already added in recursive |
492 | # apparatus readings (i.e. don't add a path if one already exists.) |
493 | |
494 | sub _add_sigil_path { |
495 | my( $c, $sigla, $base_sequence, $reading_sequence ) = @_; |
496 | my %skip; |
497 | foreach my $sig ( keys %$sigla ) { |
498 | my $use_sig = $sigla->{$sig} eq '_AC_' ? $sig.$c->ac_label : $sig; |
088e4bbe |
499 | foreach my $i ( 0 .. $#{$reading_sequence}-1 ) { |
fa954f4c |
500 | if( $skip{$use_sig} ) { |
088e4bbe |
501 | next if !_has_prior_reading( $reading_sequence->[$i], $use_sig ); |
fa954f4c |
502 | $skip{$use_sig} = 0; |
088e4bbe |
503 | } |
504 | if( _has_next_reading( $reading_sequence->[$i], $use_sig ) ) { |
fa954f4c |
505 | $skip{$use_sig} = 1; |
506 | next; |
507 | } |
088e4bbe |
508 | $c->add_path( $reading_sequence->[$i], $reading_sequence->[$i+1], $use_sig ); |
fa954f4c |
509 | } |
510 | if( $sigla->{$sig} eq '_PC_') { |
088e4bbe |
511 | $use_sig = $sig.$c->ac_label; |
512 | foreach my $i ( 0 .. $#{$base_sequence}-1 ) { |
fa954f4c |
513 | if( $skip{$use_sig} ) { |
088e4bbe |
514 | next if !_has_prior_reading( $reading_sequence->[$i], $use_sig ); |
fa954f4c |
515 | $skip{$use_sig} = 0; |
088e4bbe |
516 | } |
517 | if( _has_next_reading( $reading_sequence->[$i], $use_sig ) ) { |
fa954f4c |
518 | $skip{$use_sig} = 1; |
519 | next; |
520 | } |
088e4bbe |
521 | $c->add_path( $base_sequence->[$i], $base_sequence->[$i+1], $use_sig ); |
fa954f4c |
522 | } |
523 | } |
524 | } |
525 | } |
526 | |
527 | # Remove all ATTACH* nodes, linking the readings on either side of them. |
528 | # Then walk the collation for all witness paths, and make sure those paths |
529 | # explicitly exist. Then delete all the 'base' paths. |
530 | |
531 | sub expand_all_paths { |
532 | my( $c ) = @_; |
533 | |
534 | # Delete the anchors |
535 | foreach my $anchor ( grep { $_->name =~ /ATTACH/ } $c->readings ) { |
536 | # Map each path to its incoming/outgoing node. |
537 | my %incoming; |
538 | map { $incoming{$_->label} = $_->from } $anchor->incoming(); |
539 | my %outgoing; |
540 | map { $outgoing{$_->label} = $_->to } $anchor->outgoing(); |
541 | $c->del_reading( $anchor ); |
542 | |
543 | # Connect in and out. |
544 | my $aclabel = $c->ac_label; |
545 | foreach my $edge ( keys %incoming ) { |
546 | my $from = $incoming{$edge}; |
547 | my $to = $outgoing{$edge}; |
548 | if( !$to && $edge =~ /^(.*)\Q$aclabel\E$/ ) { |
549 | $to = $outgoing{$1}; |
550 | } |
551 | $to = $outgoing{$c->baselabel} unless $to; |
088e4bbe |
552 | $DB::single = 1 unless $to; |
fa954f4c |
553 | warn "Have no outbound base link on " . $anchor->name . "!" |
554 | unless $to; |
555 | $c->add_path( $from, $to, $edge ); |
088e4bbe |
556 | delete $outgoing{$edge} unless $edge eq $c->baselabel; |
fa954f4c |
557 | } |
fa954f4c |
558 | foreach my $edge ( keys %outgoing ) { |
559 | my $to = $outgoing{$edge}; |
088e4bbe |
560 | my $from = $incoming{$edge}; |
fa954f4c |
561 | if( !$from && $edge =~ /^(.*)\Q$aclabel\E$/ ) { |
562 | $from = $incoming{$1}; |
563 | } |
088e4bbe |
564 | $from = $incoming{$c->baselabel} unless $from; |
fa954f4c |
565 | warn "Have no inbound base link on " . $anchor->name . "!" |
566 | unless $from; |
088e4bbe |
567 | $c->add_path( $from, $to, $edge ); |
fa954f4c |
568 | } |
569 | } |
570 | |
088e4bbe |
571 | $DB::single = 1; |
fa954f4c |
572 | # Walk the collation and add paths if necessary |
573 | foreach my $sig ( keys %ALL_SIGLA ) { |
574 | my $wit = $c->tradition->witness( $sig ); |
575 | my @path = $c->reading_sequence( $c->start, $c->end, $sig ); |
576 | $wit->path( \@path ); |
577 | if( $ALL_SIGLA{$sig} > 1 ) { |
578 | my @ac_path = $c->reading_sequence( $c->start, $c->end, |
579 | $sig.$c->ac_label, $sig ); |
580 | $wit->uncorrected_path( \@path ); |
581 | # a.c. paths are already there by default. |
582 | } |
088e4bbe |
583 | foreach my $i ( 1 .. $#path ) { |
fa954f4c |
584 | # If there is no explicit path for this sigil between n-1 and n, |
585 | # add it. |
088e4bbe |
586 | my @sigedges = grep { $_->label eq $sig } $path[$i]->incoming; |
587 | if( @sigedges ) { |
588 | warn "Found more than one path already for $sig" if @sigedges > 1; |
589 | warn "Would add second path for $sig" |
590 | unless $sigedges[0]->from eq $path[$i-1]; |
591 | next; |
fa954f4c |
592 | } |
088e4bbe |
593 | $c->add_path( $path[$i-1], $path[$i], $sig ); |
fa954f4c |
594 | } |
595 | } |
596 | |
597 | # Delete all baselabel edges |
598 | foreach my $edge ( grep { $_->label eq $c->baselabel } $c->paths ) { |
599 | $c->del_edge( $edge ); |
600 | } |
601 | |
602 | # Calculate ranks on graph nodes |
603 | $c->calculate_ranks(); |
604 | } |
605 | |
606 | sub _get_seq { |
607 | my( $str ) = @_; |
608 | my $seq = 1; |
609 | my $lw = $str; |
088e4bbe |
610 | if( $str =~ /^(.*)(\d)\x{b0}$/ ) { |
fa954f4c |
611 | ( $lw, $seq) = ( $1, $2 ); |
612 | } |
613 | return( $lw, $seq ); |
614 | } |
615 | |
088e4bbe |
616 | # Normalize to lowercase, no punct |
617 | sub _norm { |
618 | my( $str ) = @_; |
619 | $str =~ s/[^[:alnum:]]//g; |
620 | return lc( $str ); |
621 | } |
622 | |
fa954f4c |
623 | sub _has_next_reading { |
624 | my( $rdg, $sigil ) = @_; |
625 | return grep { $_->label eq $sigil } $rdg->outgoing(); |
626 | } |
627 | sub _has_prior_reading { |
628 | my( $rdg, $sigil ) = @_; |
629 | return grep { $_->label eq $sigil } $rdg->incoming(); |
088e4bbe |
630 | } |
631 | sub next_real_reading { |
632 | my( $c, $rdg ) = @_; |
633 | while( my $r = $c->next_reading( $rdg ) ) { |
634 | return $r unless $r->is_meta; |
635 | return $r if $r eq $c->end; |
636 | $rdg = $r; |
637 | } |
638 | } |
639 | # For debugging |
640 | sub rstr { |
641 | my @l = @_; |
642 | if( ref( $_[0] ) eq 'ARRAY' ) { |
643 | @l = @$_[0]; |
644 | } |
645 | my $str = join( ' ', map { $_->text } @l ); |
646 | return $str; |
647 | } |
648 | |
649 | 1; |