progress, though still not 100%
[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / CollateText.pm
CommitLineData
fa954f4c 1package Text::Tradition::Parser::CollateText;
2
3use strict;
4use warnings;
5
6=head1 NAME
7
8Text::Tradition::Parser::CollateText
9
10=head1 DESCRIPTION
11
12For an overview of the package, see the documentation for the
13Text::Tradition module.
14
15This module is meant for use with a set of text files saved from Word docs,
16which originated with the COLLATE collation program.
17
18=head1 SUBROUTINES
19
20=over
21
22=item B<parse>
23
24parse( $graph, $opts );
25
26Takes an initialized graph and a hashref of options, which must include:
27- 'base' - the base text referenced by the variants
28- 'format' - the format of the variant list
29- 'data' - the variants, in the given format.
30
31=cut
32
33my %ALL_SIGLA;
34
35sub parse {
36 my( $tradition, $opts ) = @_;
37 # String together the base text.
38 my $lineref_hash = read_stone_base( $opts->{'base'}, $tradition->collation );
39 # Note the sigla.
40 foreach my $sigil ( @{$opts->{'sigla'}} ) {
41 $ALL_SIGLA{$sigil} = 1;
42 $tradition->add_witness( 'sigil' => $sigil );
43 }
44 # Now merge on the apparatus entries.
088e4bbe 45 merge_stone_apparatus( $tradition->collation, $lineref_hash, $opts->{'file'} );
fa954f4c 46}
47
48=item B<read_stone_base>
49
50my $text_list = read_base( 'reference.txt', $collation );
51
52Takes a text file and a (presumed empty) collation object, adds the words
53as simple linear readings to the collation, and returns a hash of texts
54with line keys. This collation is now the starting point for application of
55apparatus entries in merge_base, e.g. from a CSV file or a Classical Text
56Editor file.
57
58The hash is of the form
59
60 { chapter_name => { line_ref => { start => node, end => node } } }
61
62=cut
63
64sub read_stone_base {
65 my( $base_file, $collation ) = @_;
66
67 # This array gives the first reading for each line. We put the
68 # common starting point in line zero.
69 my $last_reading = $collation->start();
70 my $lineref_hash = {};
71 my $last_lineref;
72
73 my $curr_text;
74 open( BASE, $base_file ) or die "Could not open file $base_file: $!";
088e4bbe 75 binmode BASE, ':utf8';
fa954f4c 76 my $i = 1;
77 while(<BASE>) {
78 # Make the readings, and connect them up for the base, but
79 # also save the first reading of each line in a hash for the
80 # purpose.
81 chomp;
82 next if /^\s+$/; # skip blank lines
83 s/^(\d)\x{589}/$1:/; # turn Armenian full stops into colons
84 if( /^TESTAMENT/ ) {
85 # Initialize the base hash for this section.
86 $lineref_hash->{$_} = {};
87 $curr_text = $lineref_hash->{$_};
88 next;
89 }
90 my @words = split;
91 my $lineref;
92 if( /^\d/ ) {
93 # The first "word" is a line reference; keep it.
94 $lineref = shift @words;
95 } else {
96 # Assume we are dealing with the title.
97 $lineref = 'Title:';
98 }
99
100 # Now turn the remaining words into readings.
101 my $wordref = 0;
102 foreach my $w ( @words ) {
103 my $readingref = join( ',', $lineref, ++$wordref );
104 my $reading = $collation->add_reading( $readingref );
105 $reading->text( $w );
106 unless( exists $curr_text->{$lineref}->{'start'} ) {
107 $curr_text->{$lineref}->{'start'} = $reading;
108 }
109 # Add edge paths in the graph, for easier tracking when
110 # we start applying corrections. These paths will be
111 # removed when we're done.
112 my $path = $collation->add_path( $last_reading, $reading,
113 $collation->baselabel );
114 $last_reading = $reading;
115 }
116 $curr_text->{$lineref}->{'end'} = $last_reading;
117 }
118
119 close BASE;
120 # Ending point for all texts
121 $collation->add_path( $last_reading, $collation->end, $collation->baselabel );
122 return( $lineref_hash );
123}
124
125=item B<merge_stone_apparatus>
126
127Read an apparatus as output (presumably) by Collate. It should be reasonably
128regular in form, I hope. Merge the apparatus variants onto the appropriate
129lemma readings.
130
131=cut
132
133sub merge_stone_apparatus {
134 my( $c, $lineref_hash, $file ) = @_;
135
136 my $text_apps = {};
137 my $current_text;
138 open( APP, $file ) or die "Could not read apparatus file $file";
088e4bbe 139 binmode APP, ':utf8';
fa954f4c 140 while( <APP> ) {
141 chomp;
088e4bbe 142 next if /^\s*$/;
fa954f4c 143 if( /^TESTAMENT/ ) {
144 $current_text = $lineref_hash->{$_};
145 next;
146 }
147
148 # Otherwise, the first word of the line is the base text line reference.
149 my $i = 0;
150 my $lineref;
088e4bbe 151 if( s/^(\S+)\s+// ) {
fa954f4c 152 $lineref = $1;
153 } else {
154 warn "Unrecognized line $_";
155 }
156 my $baseline = $current_text->{$lineref};
157 # The start and end readings for this line are now in $baseline->{start}
158 # and $baseline->{end}.
159
160 # Now look at the apparatus entries for this line. They are
161 # split with |.
088e4bbe 162 my @apps = split( /\s+\|\s+/ );
163 my $rdg_ctr = 0;
164 foreach my $app ( @apps ) {
165 my( $lemma, $rest ) = split( /\s+\]\s+/, $app );
166 next unless $rest; # Skip lines e.g. 'Chapter 2'
fa954f4c 167 # Find the lemma reading.
168 my( $lemma_start, $lemma_end ) =
169 _find_reading_on_line( $c, $lemma, $baseline );
088e4bbe 170 unless( $lemma_start && $lemma_end ) {
171 print STDERR "Lemma $lemma not found; skipping readings $rest\n";
172 next;
173 }
fa954f4c 174 my @lemma_chain = $c->reading_sequence( $lemma_start, $lemma_end );
175
176 # Splice in "start" and "end" placeholders on either
177 # side of the lemma.
178 my ( $rdg_start, $rdg_end ) =
179 _add_reading_placeholders( $c, $lemma_start, $lemma_end );
180
181 # For each reading, attach it to the lemma.
088e4bbe 182 my @indiv = split( / /, $rest );
183 my $has_rel = 0;
184 my %seen_sigla;
185 map { $seen_sigla{$_} = 0 } keys %ALL_SIGLA;
fa954f4c 186 foreach my $rdg ( @indiv ) {
187 # Parse the string.
188 my( $words, $sigla, $recurse ) = parse_app_entry( $rdg );
088e4bbe 189
190 # Do something really very dodgy indeed.
191 if( exists $sigla->{'__REL__'} && !$has_rel ) {
192 # Handling this has to be deferred until the end, so push it
193 # back onto @indiv and note that we've done so.
194 $has_rel = 1;
195 push( @indiv, $rdg );
196 next;
197 }
198
fa954f4c 199 my @readings;
088e4bbe 200 foreach my $rdg_word ( @$words ) {
201 next if $rdg_word =~ /^__/;
202 my $reading_id = $lemma_start->name . '_' . $lemma_end->name
203 . '/' . $rdg_ctr++;
fa954f4c 204 my $reading = $c->add_reading( $reading_id );
088e4bbe 205 $reading->text( $rdg_word );
fa954f4c 206 push( @readings, $reading );
207 }
208
209 # Deal with any specials.
210 my $lemma_sequence;
088e4bbe 211 if( @$words && $words->[0] eq '__LEMMA__' ) {
fa954f4c 212 $lemma_sequence = [ $lemma_end, $rdg_end ];
088e4bbe 213 } elsif ( @$words && $words->[0] eq '__TRANSPOSE__' ) {
fa954f4c 214 # Hope it is only two or three words in the lemma.
215 # TODO figure out how we really want to handle this
216 @readings = reverse @lemma_chain;
217 }
218 $lemma_sequence = [ $rdg_start, @lemma_chain, $rdg_end ]
219 unless $lemma_sequence;
220
088e4bbe 221 # Note which sigla we are actually dealing with.
222 if( $sigla->{'__REL__'} ) {
223 delete $sigla->{'__REL__'};
224 map { $sigla->{$_} = 1 }
225 grep { $seen_sigla{$_} == 0 } keys %seen_sigla;
226 } else {
227 map { $seen_sigla{$_} = 1 } keys %$sigla;
228 }
229
fa954f4c 230 # Now hook up the paths.
088e4bbe 231 unshift( @readings, $lemma_sequence->[0] );
232 push( @readings, $lemma_sequence->[-1] );
fa954f4c 233 foreach my $i ( 1 .. $#readings ) {
234 if( $recurse->{$i} ) {
235 my( $rwords, $rsig ) = parse_app_entry( $recurse->{$i} );
236 # Get the local "lemma" sequence
237 my $llseq = [ $readings[$i], $readings[$i+1] ];
238 if( $rwords->[0] ne '__LEMMA__' ) {
fa954f4c 239 unshift( @$llseq, $readings[$i-1] );
088e4bbe 240 } # Otherwise treat it as an addition to the last word
fa954f4c 241 # Create the reading nodes in $rwords
242 # TODO Hope we don't meet ~ in a recursion
243 my $local_rdg = [];
088e4bbe 244 $DB::single = 1;
fa954f4c 245 foreach my $i ( 0 .. $#$rwords ) {
246 next if $i == 0 && $rwords->[$i] =~ /^__/;
247 my $reading_id = $llseq->[0]->text . '_' .
248 $llseq->[-1]->text . '/' . $i;
088e4bbe 249 $DB::single = 1 if $reading_id =~ /ATTACH/;
fa954f4c 250 my $reading = $c->add_reading( $reading_id );
088e4bbe 251 $reading->text( $rwords->[$i] );
fa954f4c 252 push( @$local_rdg, $reading );
253 }
088e4bbe 254 unshift( @$local_rdg, $llseq->[0] );
255 push( @$local_rdg, $llseq->[-1] );
fa954f4c 256 # Add the path(s) necessary
088e4bbe 257 _add_sigil_path( $c, $rsig, $llseq, $local_rdg );
fa954f4c 258 }
259 }
088e4bbe 260 _add_sigil_path( $c, $sigla, $lemma_sequence, \@readings );
fa954f4c 261 } # end processing of $app
262 } # end foreach my $app in line
263 } # end while <line>
264
265 # Now reconcile all the paths in the collation, and delete our
266 # temporary anchor nodes.
267 expand_all_paths( $c );
268
269 # Finally, calculate the ranks we've got.
270 $c->calculate_ranks;
271}
272
273sub _find_reading_on_line {
088e4bbe 274 my( $c, $lemma, $baseline, $prior ) = @_;
275
276 # We might want the whole line.
277 if( $lemma eq 'totum' ) {
278 return( $baseline->{'start'}, $baseline->{'end'} );
279 }
fa954f4c 280
281 my $lemma_start = $baseline->{'start'};
282 my $lemma_end;
088e4bbe 283 my $too_far = $c->next_reading( $baseline->{'end'} );
fa954f4c 284 my @lemma_words = split( /\s+/, $lemma );
285
286 my %seen;
287 my $scrutinize = ''; # DEBUG variable
088e4bbe 288 my ( $lw, $seq ) = _get_seq( $lemma_words[0] );
fa954f4c 289 while( $lemma_start ne $too_far ) {
290 # Loop detection
291 if( $seen{ $lemma_start->name() } ) {
292 warn "Detected loop at " . $lemma_start->name . " for lemma $lemma";
293 last;
294 }
295 $seen{ $lemma_start->name() } = 1;
296
297 # Try to match the lemma.
298 # TODO move next/prior reading methods into the reading classes,
299 # to make this more self-contained and not need to pass $c.
300 my $unmatch = 0;
088e4bbe 301 print STDERR "Matching ".$lemma_start->text." against $lw...\n"
fa954f4c 302 if $scrutinize;
088e4bbe 303 if( _norm( $lemma_start->text ) eq _norm( $lw ) ) {
fa954f4c 304 # Skip it if we need a match that is not the first.
305 if( --$seq < 1 ) {
306 # Now we have to compare the rest of the words here.
307 if( scalar( @lemma_words ) > 1 ) {
088e4bbe 308 my $next_reading = next_real_reading( $c, $lemma_start );
fa954f4c 309 my $wildcard = 0;
310 foreach my $w ( @lemma_words[1..$#lemma_words] ) {
311 if( $w eq '---' ) {
fa954f4c 312 $wildcard = 1;
313 next;
fa954f4c 314 }
088e4bbe 315 if( $wildcard ) {
316 # This should be the word after a --- now, and the
317 # last lemma word.
318 my( $wst, $wend ) = _find_reading_on_line( $c, $w,
319 $baseline, $lemma_start );
320 warn "Something unexpected" unless $wst eq $wend;
321 $lemma_end = $wend;
322 next;
323 }
324
325 # If we got this far, there is no wildcard. We must
326 # match each word in sequence.
327 my( $nlw, $nseq ) = _get_seq( $w );
fa954f4c 328 printf STDERR "Now matching %s against %s\n",
088e4bbe 329 $next_reading->text, $nlw
fa954f4c 330 if $scrutinize;
088e4bbe 331 if( _norm( $nlw ) eq _norm( $next_reading->text ) ) {
332 $lemma_end = $next_reading;
333 $next_reading = $c->next_reading( $lemma_end );
334 } else {
fa954f4c 335 $unmatch = 1;
336 last;
fa954f4c 337 }
338 }
339 } else { # single-word match, easy.
340 $lemma_end = $lemma_start;
341 }
342 } else { # we need the Nth match and aren't there yet
343 $unmatch = 1;
344 }
088e4bbe 345 $unmatch = 1 if $prior && !$seen{$prior->name};
fa954f4c 346 }
347 last unless ( $unmatch || !defined( $lemma_end ) );
348 $lemma_end = undef;
349 $lemma_start = $c->next_reading( $lemma_start );
350 }
351
352 unless( $lemma_end ) {
353 warn "No match found for @lemma_words";
354 return undef;
355 }
356 return( $lemma_start, $lemma_end );
357}
358
359sub _add_reading_placeholders {
360 my( $collation, $lemma_start, $lemma_end ) = @_;
361 # We will splice in a 'begin' and 'end' marker on either side of the
362 # lemma, as sort of a double-endpoint attachment in the graph.
088e4bbe 363 # Note that all of this assumes we have a linear base graph at this
364 # point, and no diverging readings on the lemmas.
365
366 my $start_node = $collation->prior_reading( $lemma_start );
367 unless( $start_node->name =~ /ATTACH/ ) {
368 my $sn_id = '#ATTACH_' . $lemma_start->name . '_START#';
369 my $prior = $start_node;
370 $start_node = $collation->add_reading( $sn_id );
371 $start_node->is_meta( 1 );
372 $collation->graph->del_edge( $collation->graph->edge( $prior, $lemma_start ) );
373 $collation->add_path( $prior, $start_node, $collation->baselabel );
374 $collation->add_path( $start_node, $lemma_start, $collation->baselabel );
fa954f4c 375 }
088e4bbe 376
fa954f4c 377 # Now the converse for the end.
088e4bbe 378 my $end_node = $collation->next_reading( $lemma_end );
379 unless( $end_node->name =~ /ATTACH/ ) {
380 my $en_id = '#ATTACH_' . $lemma_end->name . '_END#';
381 my $next = $end_node;
382 $end_node = $collation->add_reading( $en_id );
383 $end_node->is_meta( 1 );
384 $collation->graph->del_edge( $collation->graph->edge( $lemma_end, $next ) );
385 $collation->add_path( $lemma_end, $end_node, $collation->baselabel );
386 $collation->add_path( $end_node, $next, $collation->baselabel );
fa954f4c 387 }
088e4bbe 388
389 return( $start_node, $end_node );
fa954f4c 390}
391
392# Function to parse an apparatus reading string, with reference to no other
393# data. Need to do this separately as readings can include readings (ugh).
394# Try to give whatever information we might need, including recursive app
395# entries that might need to be parsed.
396
397sub parse_app_entry {
398 my( $rdg, ) = @_;
399 $rdg =~ s/^\s+//;
400 $rdg =~ s/\s+$//;
401 next unless $rdg; # just in case
402 my @words = split( /\s+/, $rdg );
403 # Zero or more sigils e.g. +, followed by Armenian,
404 # followed by (possibly modified) sigla, followed by
405 # optional : with note.
406 my $is_add;
407 my $is_omission;
408 my $is_transposition;
088e4bbe 409 my $is_base;
410 my $skip;
fa954f4c 411 my @reading;
088e4bbe 412 my $reading_sigla = {};
fa954f4c 413 my $recursed;
088e4bbe 414 my $sig_regex = join( '|', sort { length $b <=> length $a } keys %ALL_SIGLA );
fa954f4c 415 while( @words ) {
416 my $bit = shift @words;
417 if( $bit eq '+' ) {
418 $is_add = 1;
419 } elsif( $bit eq 'om' ) {
420 $is_omission = 1;
421 } elsif( $bit eq '~' ) {
422 $is_transposition = 1;
423 } elsif( $bit =~ /\p{Armenian}/ ) {
424 warn "Found text in omission?!" if $is_omission;
425 push( @reading, $bit );
426 } elsif( $bit eq ':' ) {
427 # Stop processing.
428 last;
088e4bbe 429 } elsif( $bit =~ /^\(/ ) {
fa954f4c 430 # It's a recursive reading within a reading. Lemmatize what we
431 # have so far and grab the extra.
088e4bbe 432 my @new = ( $bit );
fa954f4c 433 until( $new[-1] =~ /\)$/ ) {
434 push( @new, shift @words );
435 }
436 my $recursed_reading = join( ' ', @new );
088e4bbe 437 $recursed_reading =~ s/^\((.*)\)/$1/;
fa954f4c 438 # This recursive entry refers to the last reading word(s) we
439 # saw. Push its index+1. We will have to come back to parse
440 # it when we are dealing with the main reading.
441 # TODO handle () as first element
442 # TODO handle - as suffix to add, i.e. make new word
443 $recursed->{@reading} = $recursed_reading;
088e4bbe 444 } elsif( $bit =~ /^($sig_regex)(.*)$/ ) {
fa954f4c 445 # It must be a sigil.
446 my( $sigil, $mod ) = ( $1, $2 );
447 if( $mod eq "\x{80}" ) {
088e4bbe 448 $reading_sigla->{$sigil} = '_PC_';
449 $ALL_SIGLA{$sigil} = 2; # a pre- and post-corr version exists
fa954f4c 450 } elsif( $mod eq '*' ) {
088e4bbe 451 $reading_sigla->{$sigil} = '_AC_';
452 $ALL_SIGLA{$sigil} = 2; # a pre- and post-corr version exists
fa954f4c 453 } else {
088e4bbe 454 $reading_sigla->{$sigil} = 1 unless $mod; # skip secondhand corrections
fa954f4c 455 }
088e4bbe 456 } elsif( $bit eq 'rel' ) {
457 # The anti-reading. All sigla except those cited.
458 $reading_sigla->{'__REL__'} = 1;
459 } elsif( $bit eq 'ed' ) {
460 # An emendation. TODO make sure all other sigla appear in readings?
461 $skip = 1;
462 last;
fa954f4c 463 } elsif( $bit =~ /transpos/ ) {
464 # There are some transpositions not coded rigorously; skip them.
465 warn "Found hard transposition in $rdg; fix manually";
466 last;
467 } else {
468 warn "Not sure what to do with bit $bit in $rdg";
088e4bbe 469 $skip = 1;
470 last;
fa954f4c 471 }
472 }
088e4bbe 473
474 return( [], {}, {} ) if $skip;
fa954f4c 475 # Transmogrify the reading if necessary.
476 unshift( @reading, '__LEMMA__' ) if $is_add;
477 unshift( @reading, '__TRANSPOSE__' ) if $is_transposition;
478 @reading = () if $is_omission;
088e4bbe 479 unless( @reading || $is_omission ) {
480 # It was just sigla on a line, meaning the base changed. Thus
481 # the reading is the lemma.
482 unshift( @reading, '__LEMMA__' );
483 }
fa954f4c 484
485 return( \@reading, $reading_sigla, $recursed );
486}
487
488# Add a path for the specified sigla to connect the reading sequence.
489# Add an a.c. path to the base sequence if we have an explicitly p.c.
490# reading.
491# Also handle the paths for sigla we have already added in recursive
492# apparatus readings (i.e. don't add a path if one already exists.)
493
494sub _add_sigil_path {
495 my( $c, $sigla, $base_sequence, $reading_sequence ) = @_;
496 my %skip;
497 foreach my $sig ( keys %$sigla ) {
498 my $use_sig = $sigla->{$sig} eq '_AC_' ? $sig.$c->ac_label : $sig;
088e4bbe 499 foreach my $i ( 0 .. $#{$reading_sequence}-1 ) {
fa954f4c 500 if( $skip{$use_sig} ) {
088e4bbe 501 next if !_has_prior_reading( $reading_sequence->[$i], $use_sig );
fa954f4c 502 $skip{$use_sig} = 0;
088e4bbe 503 }
504 if( _has_next_reading( $reading_sequence->[$i], $use_sig ) ) {
fa954f4c 505 $skip{$use_sig} = 1;
506 next;
507 }
088e4bbe 508 $c->add_path( $reading_sequence->[$i], $reading_sequence->[$i+1], $use_sig );
fa954f4c 509 }
510 if( $sigla->{$sig} eq '_PC_') {
088e4bbe 511 $use_sig = $sig.$c->ac_label;
512 foreach my $i ( 0 .. $#{$base_sequence}-1 ) {
fa954f4c 513 if( $skip{$use_sig} ) {
088e4bbe 514 next if !_has_prior_reading( $reading_sequence->[$i], $use_sig );
fa954f4c 515 $skip{$use_sig} = 0;
088e4bbe 516 }
517 if( _has_next_reading( $reading_sequence->[$i], $use_sig ) ) {
fa954f4c 518 $skip{$use_sig} = 1;
519 next;
520 }
088e4bbe 521 $c->add_path( $base_sequence->[$i], $base_sequence->[$i+1], $use_sig );
fa954f4c 522 }
523 }
524 }
525}
526
527# Remove all ATTACH* nodes, linking the readings on either side of them.
528# Then walk the collation for all witness paths, and make sure those paths
529# explicitly exist. Then delete all the 'base' paths.
530
531sub expand_all_paths {
532 my( $c ) = @_;
533
534 # Delete the anchors
535 foreach my $anchor ( grep { $_->name =~ /ATTACH/ } $c->readings ) {
536 # Map each path to its incoming/outgoing node.
537 my %incoming;
538 map { $incoming{$_->label} = $_->from } $anchor->incoming();
539 my %outgoing;
540 map { $outgoing{$_->label} = $_->to } $anchor->outgoing();
541 $c->del_reading( $anchor );
542
543 # Connect in and out.
544 my $aclabel = $c->ac_label;
545 foreach my $edge ( keys %incoming ) {
546 my $from = $incoming{$edge};
547 my $to = $outgoing{$edge};
548 if( !$to && $edge =~ /^(.*)\Q$aclabel\E$/ ) {
549 $to = $outgoing{$1};
550 }
551 $to = $outgoing{$c->baselabel} unless $to;
088e4bbe 552 $DB::single = 1 unless $to;
fa954f4c 553 warn "Have no outbound base link on " . $anchor->name . "!"
554 unless $to;
555 $c->add_path( $from, $to, $edge );
088e4bbe 556 delete $outgoing{$edge} unless $edge eq $c->baselabel;
fa954f4c 557 }
fa954f4c 558 foreach my $edge ( keys %outgoing ) {
559 my $to = $outgoing{$edge};
088e4bbe 560 my $from = $incoming{$edge};
fa954f4c 561 if( !$from && $edge =~ /^(.*)\Q$aclabel\E$/ ) {
562 $from = $incoming{$1};
563 }
088e4bbe 564 $from = $incoming{$c->baselabel} unless $from;
fa954f4c 565 warn "Have no inbound base link on " . $anchor->name . "!"
566 unless $from;
088e4bbe 567 $c->add_path( $from, $to, $edge );
fa954f4c 568 }
569 }
570
088e4bbe 571 $DB::single = 1;
fa954f4c 572 # Walk the collation and add paths if necessary
573 foreach my $sig ( keys %ALL_SIGLA ) {
574 my $wit = $c->tradition->witness( $sig );
575 my @path = $c->reading_sequence( $c->start, $c->end, $sig );
576 $wit->path( \@path );
577 if( $ALL_SIGLA{$sig} > 1 ) {
578 my @ac_path = $c->reading_sequence( $c->start, $c->end,
579 $sig.$c->ac_label, $sig );
580 $wit->uncorrected_path( \@path );
581 # a.c. paths are already there by default.
582 }
088e4bbe 583 foreach my $i ( 1 .. $#path ) {
fa954f4c 584 # If there is no explicit path for this sigil between n-1 and n,
585 # add it.
088e4bbe 586 my @sigedges = grep { $_->label eq $sig } $path[$i]->incoming;
587 if( @sigedges ) {
588 warn "Found more than one path already for $sig" if @sigedges > 1;
589 warn "Would add second path for $sig"
590 unless $sigedges[0]->from eq $path[$i-1];
591 next;
fa954f4c 592 }
088e4bbe 593 $c->add_path( $path[$i-1], $path[$i], $sig );
fa954f4c 594 }
595 }
596
597 # Delete all baselabel edges
598 foreach my $edge ( grep { $_->label eq $c->baselabel } $c->paths ) {
599 $c->del_edge( $edge );
600 }
601
602 # Calculate ranks on graph nodes
603 $c->calculate_ranks();
604}
605
606sub _get_seq {
607 my( $str ) = @_;
608 my $seq = 1;
609 my $lw = $str;
088e4bbe 610 if( $str =~ /^(.*)(\d)\x{b0}$/ ) {
fa954f4c 611 ( $lw, $seq) = ( $1, $2 );
612 }
613 return( $lw, $seq );
614}
615
088e4bbe 616# Normalize to lowercase, no punct
617sub _norm {
618 my( $str ) = @_;
619 $str =~ s/[^[:alnum:]]//g;
620 return lc( $str );
621}
622
fa954f4c 623sub _has_next_reading {
624 my( $rdg, $sigil ) = @_;
625 return grep { $_->label eq $sigil } $rdg->outgoing();
626}
627sub _has_prior_reading {
628 my( $rdg, $sigil ) = @_;
629 return grep { $_->label eq $sigil } $rdg->incoming();
088e4bbe 630}
631sub next_real_reading {
632 my( $c, $rdg ) = @_;
633 while( my $r = $c->next_reading( $rdg ) ) {
634 return $r unless $r->is_meta;
635 return $r if $r eq $c->end;
636 $rdg = $r;
637 }
638}
639# For debugging
640sub rstr {
641 my @l = @_;
642 if( ref( $_[0] ) eq 'ARRAY' ) {
643 @l = @$_[0];
644 }
645 my $str = join( ' ', map { $_->text } @l );
646 return $str;
647}
648
6491;