lib/Text/Tradition/Parser/CollateText.pm

   1 package Text::Tradition::Parser::CollateText;
   2
   3 use strict;
   4 use warnings;
   5
   6 =head1 NAME
   7
   8 Text::Tradition::Parser::CollateText
   9
  10 =head1 DESCRIPTION
  11
  12 For an overview of the package, see the documentation for the
  13 Text::Tradition module.
  14
  15 This module is meant for use with a set of text files saved from Word docs,
  16 which originated with the COLLATE collation program.
  17
  18 =head1 SUBROUTINES
  19
  20 =over
  21
  22 =item B<parse>
  23
  24 parse( $graph, $opts );
  25
  26 Takes an initialized graph and a hashref of options, which must include:
  27 - 'base' - the base text referenced by the variants
  28 - 'format' - the format of the variant list
  29 - 'data' - the variants, in the given format.
  30
  31 =cut
  32
  33 my %ALL_SIGLA;
  34
  35 sub parse {
  36     my( $tradition, $opts ) = @_;
  37     # String together the base text.
  38     my $lineref_hash = read_stone_base( $opts->{'base'}, $tradition->collation );
  39     # Note the sigla.
  40     foreach my $sigil ( @{$opts->{'sigla'}} ) {
  41         $ALL_SIGLA{$sigil} = 1;
  42         $tradition->add_witness( 'sigil' => $sigil );
  43     }
  44     # Now merge on the apparatus entries.
  45     merge_stone_apparatus( $tradition->collation, $lineref_hash, $opts->{'file'} );
  46 }
  47
  48 =item B<read_stone_base>
  49
  50 my $text_list = read_base( 'reference.txt', $collation );
  51
  52 Takes a text file and a (presumed empty) collation object, adds the words
  53 as simple linear readings to the collation, and returns a hash of texts
  54 with line keys. This collation is now the starting point for application of
  55 apparatus entries in merge_base, e.g. from a CSV file or a Classical Text
  56 Editor file.
  57
  58 The hash is of the form
  59
  60  { chapter_name => { line_ref => { start => node, end => node } } }
  61
  62 =cut
  63
  64 sub read_stone_base {
  65     my( $base_file, $collation ) = @_;
  66
  67     # This array gives the first reading for each line.  We put the
  68     # common starting point in line zero.
  69     my $last_reading = $collation->start();
  70     my $lineref_hash = {};
  71     my $last_lineref;
  72
  73     my $curr_text;
  74     open( BASE, $base_file ) or die "Could not open file $base_file: $!";
  75     binmode BASE, ':utf8';
  76     my $i = 1;
  77     while(<BASE>) {
  78         # Make the readings, and connect them up for the base, but
  79         # also save the first reading of each line in a hash for the
  80         # purpose.
  81         chomp;
  82         next if /^\s+$/; # skip blank lines
  83         s/^(\d)\x{589}/$1:/; # turn Armenian full stops into colons
  84         if( /^TESTAMENT/ ) {
  85             # Initialize the base hash for this section.
  86             $lineref_hash->{$_} = {};
  87             $curr_text = $lineref_hash->{$_};
  88             next;
  89         }
  90         my @words = split;
  91         my $lineref;
  92         if( /^\d/ ) {
  93             # The first "word" is a line reference; keep it.
  94             $lineref = shift @words;
  95         } else {
  96             # Assume we are dealing with the title.
  97             $lineref = 'Title:';
  98         }
  99
 100         # Now turn the remaining words into readings.
 101         my $wordref = 0;
 102         foreach my $w ( @words ) {
 103             my $readingref = join( ',', $lineref, ++$wordref );
 104             my $reading = $collation->add_reading( $readingref );
 105             $reading->text( $w );
 106             unless( exists $curr_text->{$lineref}->{'start'} ) {
 107                 $curr_text->{$lineref}->{'start'} = $reading;
 108             }
 109             # Add edge paths in the graph, for easier tracking when
 110             # we start applying corrections.  These paths will be
 111             # removed when we're done.
 112             my $path = $collation->add_path( $last_reading, $reading,
 113                                              $collation->baselabel );
 114             $last_reading = $reading;
 115         }
 116         $curr_text->{$lineref}->{'end'} = $last_reading;
 117     }
 118
 119     close BASE;
 120     # Ending point for all texts
 121     $collation->add_path( $last_reading, $collation->end, $collation->baselabel );
 122     return( $lineref_hash );
 123 }
 124
 125 =item B<merge_stone_apparatus>
 126
 127 Read an apparatus as output (presumably) by Collate.  It should be reasonably
 128 regular in form, I hope.  Merge the apparatus variants onto the appropriate
 129 lemma readings.
 130
 131 =cut
 132
 133 sub merge_stone_apparatus {
 134     my( $c, $lineref_hash, $file ) = @_;
 135
 136     my $text_apps = {};
 137     my $current_text;
 138     open( APP, $file ) or die "Could not read apparatus file $file";
 139     binmode APP, ':utf8';
 140     while( <APP> ) {
 141         chomp;
 142         next if /^\s*$/;
 143         if( /^TESTAMENT/ ) {
 144             $current_text = $lineref_hash->{$_};
 145             next;
 146         }
 147
 148         # Otherwise, the first word of the line is the base text line reference.
 149         my $i = 0;
 150         my $lineref;
 151         if( s/^(\S+)\s+// ) {
 152             $lineref = $1;
 153         } else {
 154             warn "Unrecognized line $_";
 155         }
 156         my $baseline = $current_text->{$lineref};
 157         # The start and end readings for this line are now in $baseline->{start}
 158         # and $baseline->{end}.
 159
 160         # Now look at the apparatus entries for this line. They are
 161         # split with |.
 162         my @apps = split( /\s+\|\s+/ );
 163         my $rdg_ctr = 0;
 164         foreach my $app ( @apps ) {
 165             my( $lemma, $rest ) = split( /\s+\]\s+/, $app );
 166             next unless $rest; # Skip lines e.g. 'Chapter 2'
 167             # Find the lemma reading.
 168             my( $lemma_start, $lemma_end ) =
 169                 _find_reading_on_line( $c, $lemma, $baseline );
 170             unless( $lemma_start && $lemma_end ) {
 171                 print STDERR "Lemma $lemma not found; skipping readings $rest\n";
 172                 next;
 173             }
 174             my @lemma_chain = $c->reading_sequence( $lemma_start, $lemma_end );
 175
 176             # Splice in "start" and "end" placeholders on either
 177             # side of the lemma.
 178             my ( $rdg_start, $rdg_end ) =
 179                 _add_reading_placeholders( $c, $lemma_start, $lemma_end );
 180
 181             # For each reading, attach it to the lemma.
 182             my @indiv = split( /   /, $rest );
 183             my $has_rel = 0;
 184             my %seen_sigla;
 185             map { $seen_sigla{$_} = 0 } keys %ALL_SIGLA;
 186             foreach my $rdg ( @indiv ) {
 187                 # Parse the string.
 188                 my( $words, $sigla, $recurse ) = parse_app_entry( $rdg );
 189
 190                 # Do something really very dodgy indeed.
 191                 if( exists $sigla->{'__REL__'} && !$has_rel ) {
 192                     # Handling this has to be deferred until the end, so push it
 193                     # back onto @indiv and note that we've done so.
 194                     $has_rel = 1;
 195                     push( @indiv, $rdg );
 196                     next;
 197                 }
 198
 199                 my @readings;
 200                 foreach my $rdg_word ( @$words ) {
 201                     next if $rdg_word =~ /^__/;
 202                     my $reading_id = $lemma_start->name . '_' . $lemma_end->name
 203                         . '/' . $rdg_ctr++;
 204                     my $reading = $c->add_reading( $reading_id );
 205                     $reading->text( $rdg_word );
 206                     push( @readings, $reading );
 207                 }
 208
 209                 # Deal with any specials.
 210                 my $lemma_sequence;
 211                 if( @$words && $words->[0] eq '__LEMMA__' ) {
 212                     $lemma_sequence = [ $lemma_end, $rdg_end ];
 213                 } elsif ( @$words && $words->[0] eq '__TRANSPOSE__' ) {
 214                     # Hope it is only two or three words in the lemma.
 215                     # TODO figure out how we really want to handle this
 216                     @readings = reverse @lemma_chain;
 217                 }
 218                 $lemma_sequence = [ $rdg_start, @lemma_chain, $rdg_end ]
 219                     unless $lemma_sequence;
 220
 221                 # Note which sigla we are actually dealing with.
 222                 if( $sigla->{'__REL__'} ) {
 223                     delete $sigla->{'__REL__'};
 224                     map { $sigla->{$_} = 1 }
 225                         grep { $seen_sigla{$_} == 0 } keys %seen_sigla;
 226                 } else {
 227                     map { $seen_sigla{$_} = 1 } keys %$sigla;
 228                 }
 229
 230                 # Now hook up the paths.
 231                 unshift( @readings, $lemma_sequence->[0] );
 232                 push( @readings, $lemma_sequence->[-1] );
 233                 foreach my $i ( 1 .. $#readings ) {
 234                     if( $recurse->{$i} ) {
 235                         my( $rwords, $rsig ) = parse_app_entry( $recurse->{$i} );
 236                         # Get the local "lemma" sequence
 237                         my $llseq = [ $readings[$i], $readings[$i+1] ];
 238                         if( $rwords->[0] ne '__LEMMA__' ) {
 239                             unshift( @$llseq, $readings[$i-1] );
 240                         } # Otherwise treat it as an addition to the last word
 241                         # Create the reading nodes in $rwords
 242                         # TODO Hope we don't meet ~ in a recursion
 243                         my $local_rdg = [];
 244                         $DB::single = 1;
 245                         foreach my $i ( 0 .. $#$rwords ) {
 246                             next if $i == 0 && $rwords->[$i] =~ /^__/;
 247                             my $reading_id = $llseq->[0]->text . '_' .
 248                                 $llseq->[-1]->text . '/' . $i;
 249                             $DB::single = 1 if $reading_id =~ /ATTACH/;
 250                             my $reading = $c->add_reading( $reading_id );
 251                             $reading->text( $rwords->[$i] );
 252                             push( @$local_rdg, $reading );
 253                         }
 254                         unshift( @$local_rdg, $llseq->[0] );
 255                         push( @$local_rdg, $llseq->[-1] );
 256                         # Add the path(s) necessary
 257                         _add_sigil_path( $c, $rsig, $llseq, $local_rdg );
 258                     }
 259                 }
 260                 _add_sigil_path( $c, $sigla, $lemma_sequence, \@readings );
 261             } # end processing of $app
 262         } # end foreach my $app in line
 263     } # end while <line>
 264
 265     # Now reconcile all the paths in the collation, and delete our
 266     # temporary anchor nodes.
 267     expand_all_paths( $c );
 268
 269     # Finally, calculate the ranks we've got.
 270     $c->calculate_ranks;
 271 }
 272
 273 sub _find_reading_on_line {
 274     my( $c, $lemma, $baseline, $prior ) = @_;
 275
 276     # We might want the whole line.
 277     if( $lemma eq 'totum' ) {
 278         return( $baseline->{'start'}, $baseline->{'end'} );
 279     }
 280
 281     my $lemma_start = $baseline->{'start'};
 282     my $lemma_end;
 283     my $too_far = $c->next_reading( $baseline->{'end'} );
 284     my @lemma_words = split( /\s+/, $lemma );
 285
 286     my %seen;
 287     my $scrutinize = '';   # DEBUG variable
 288     my ( $lw, $seq ) = _get_seq( $lemma_words[0] );
 289     while( $lemma_start ne $too_far ) {
 290         # Loop detection
 291         if( $seen{ $lemma_start->name() } ) {
 292             warn "Detected loop at " . $lemma_start->name . " for lemma $lemma";
 293             last;
 294         }
 295         $seen{ $lemma_start->name() } = 1;
 296
 297         # Try to match the lemma.
 298         # TODO move next/prior reading methods into the reading classes,
 299         # to make this more self-contained and not need to pass $c.
 300         my $unmatch = 0;
 301         print STDERR "Matching ".$lemma_start->text." against $lw...\n"
 302             if $scrutinize;
 303         if( _norm( $lemma_start->text ) eq _norm( $lw ) ) {
 304             # Skip it if we need a match that is not the first.
 305             if( --$seq < 1 ) {
 306                 # Now we have to compare the rest of the words here.
 307                 if( scalar( @lemma_words ) > 1 ) {
 308                     my $next_reading = next_real_reading( $c, $lemma_start );
 309                     my $wildcard = 0;
 310                     foreach my $w ( @lemma_words[1..$#lemma_words] ) {
 311                         if( $w eq '---' ) {
 312                             $wildcard = 1;
 313                             next;
 314                         }
 315                         if( $wildcard ) {
 316                             # This should be the word after a --- now, and the
 317                             # last lemma word.
 318                             my( $wst, $wend ) = _find_reading_on_line( $c, $w,
 319                                 $baseline, $lemma_start );
 320                             warn "Something unexpected" unless $wst eq $wend;
 321                             $lemma_end = $wend;
 322                             next;
 323                         }
 324
 325                         # If we got this far, there is no wildcard.  We must
 326                         # match each word in sequence.
 327                         my( $nlw, $nseq ) = _get_seq( $w );
 328                         printf STDERR "Now matching %s against %s\n",
 329                                 $next_reading->text, $nlw
 330                             if $scrutinize;
 331                         if( _norm( $nlw ) eq _norm( $next_reading->text ) ) {
 332                             $lemma_end = $next_reading;
 333                             $next_reading = $c->next_reading( $lemma_end );
 334                         } else {
 335                             $unmatch = 1;
 336                             last;
 337                         }
 338                     }
 339                 } else { # single-word match, easy.
 340                     $lemma_end = $lemma_start;
 341                 }
 342             } else { # we need the Nth match and aren't there yet
 343                 $unmatch = 1;
 344             }
 345             $unmatch = 1 if $prior && !$seen{$prior->name};
 346         }
 347         last unless ( $unmatch || !defined( $lemma_end ) );
 348         $lemma_end = undef;
 349         $lemma_start = $c->next_reading( $lemma_start );
 350     }
 351
 352     unless( $lemma_end ) {
 353         warn "No match found for @lemma_words";
 354         return undef;
 355     }
 356     return( $lemma_start, $lemma_end );
 357 }
 358
 359 sub _add_reading_placeholders {
 360     my( $collation, $lemma_start, $lemma_end ) = @_;
 361     # We will splice in a 'begin' and 'end' marker on either side of the
 362     # lemma, as sort of a double-endpoint attachment in the graph.
 363     # Note that all of this assumes we have a linear base graph at this
 364     # point, and no diverging readings on the lemmas.
 365
 366     my $start_node = $collation->prior_reading( $lemma_start );
 367     unless( $start_node->name =~ /ATTACH/ ) {
 368         my $sn_id = '#ATTACH_' . $lemma_start->name . '_START#';
 369         my $prior = $start_node;
 370         $start_node = $collation->add_reading( $sn_id );
 371         $start_node->is_meta( 1 );
 372         $collation->graph->del_edge( $collation->graph->edge( $prior, $lemma_start ) );
 373         $collation->add_path( $prior, $start_node, $collation->baselabel );
 374         $collation->add_path( $start_node, $lemma_start, $collation->baselabel );
 375     }
 376
 377     # Now the converse for the end.
 378     my $end_node = $collation->next_reading( $lemma_end );
 379     unless( $end_node->name =~ /ATTACH/ ) {
 380         my $en_id = '#ATTACH_' . $lemma_end->name . '_END#';
 381         my $next = $end_node;
 382         $end_node = $collation->add_reading( $en_id );
 383         $end_node->is_meta( 1 );
 384         $collation->graph->del_edge( $collation->graph->edge( $lemma_end, $next ) );
 385         $collation->add_path( $lemma_end, $end_node, $collation->baselabel );
 386         $collation->add_path( $end_node, $next, $collation->baselabel );
 387     }
 388
 389     return( $start_node, $end_node );
 390 }
 391
 392 # Function to parse an apparatus reading string, with reference to no other
 393 # data.  Need to do this separately as readings can include readings (ugh).
 394 # Try to give whatever information we might need, including recursive app
 395 # entries that might need to be parsed.
 396
 397 sub parse_app_entry {
 398     my( $rdg, ) = @_;
 399     $rdg =~ s/^\s+//;
 400     $rdg =~ s/\s+$//;
 401     next unless $rdg;  # just in case
 402     my @words = split( /\s+/, $rdg );
 403     # Zero or more sigils e.g. +, followed by Armenian,
 404     # followed by (possibly modified) sigla, followed by
 405     # optional : with note.
 406     my $is_add;
 407     my $is_omission;
 408     my $is_transposition;
 409     my $is_base;
 410     my $skip;
 411     my @reading;
 412     my $reading_sigla = {};
 413     my $recursed;
 414     my $sig_regex = join( '|', sort { length $b <=> length $a } keys %ALL_SIGLA );
 415     while( @words ) {
 416         my $bit = shift @words;
 417         if( $bit eq '+' ) {
 418             $is_add = 1;
 419         } elsif( $bit eq 'om' ) {
 420             $is_omission = 1;
 421         } elsif( $bit eq '~' ) {
 422             $is_transposition = 1;
 423         } elsif( $bit =~ /\p{Armenian}/ ) {
 424             warn "Found text in omission?!" if $is_omission;
 425             push( @reading, $bit );
 426         } elsif( $bit eq ':' ) {
 427             # Stop processing.
 428             last;
 429         } elsif( $bit =~ /^\(/ ) {
 430             # It's a recursive reading within a reading. Lemmatize what we
 431             # have so far and grab the extra.
 432             my @new = ( $bit );
 433             until( $new[-1] =~ /\)$/ ) {
 434                 push( @new, shift @words );
 435             }
 436             my $recursed_reading = join( ' ', @new );
 437             $recursed_reading =~ s/^\((.*)\)/$1/;
 438             # This recursive entry refers to the last reading word(s) we
 439             # saw.  Push its index+1.  We will have to come back to parse
 440             # it when we are dealing with the main reading.
 441             # TODO handle () as first element
 442             # TODO handle - as suffix to add, i.e. make new word
 443             $recursed->{@reading} = $recursed_reading;
 444         } elsif( $bit =~ /^($sig_regex)(.*)$/ ) {
 445             # It must be a sigil.
 446             my( $sigil, $mod ) = ( $1, $2 );
 447             if( $mod eq "\x{80}" ) {
 448                 $reading_sigla->{$sigil} = '_PC_';
 449                 $ALL_SIGLA{$sigil} = 2;  # a pre- and post-corr version exists
 450             } elsif( $mod eq '*' ) {
 451                 $reading_sigla->{$sigil} = '_AC_';
 452                 $ALL_SIGLA{$sigil} = 2;  # a pre- and post-corr version exists
 453             } else {
 454                 $reading_sigla->{$sigil} = 1 unless $mod; # skip secondhand corrections
 455             }
 456         } elsif( $bit eq 'rel' ) {
 457             # The anti-reading. All sigla except those cited.
 458             $reading_sigla->{'__REL__'} = 1;
 459         } elsif( $bit eq 'ed' ) {
 460             # An emendation. TODO make sure all other sigla appear in readings?
 461             $skip = 1;
 462             last;
 463         } elsif( $bit =~ /transpos/ ) {
 464             # There are some transpositions not coded rigorously; skip them.
 465             warn "Found hard transposition in $rdg; fix manually";
 466             last;
 467         } else {
 468             warn "Not sure what to do with bit $bit in $rdg";
 469             $skip = 1;
 470             last;
 471         }
 472     }
 473
 474     return( [], {}, {} ) if $skip;
 475     # Transmogrify the reading if necessary.
 476     unshift( @reading, '__LEMMA__' ) if $is_add;
 477     unshift( @reading, '__TRANSPOSE__' ) if $is_transposition;
 478     @reading = () if $is_omission;
 479     unless( @reading || $is_omission ) {
 480         # It was just sigla on a line, meaning the base changed. Thus
 481         # the reading is the lemma.
 482         unshift( @reading, '__LEMMA__' );
 483     }
 484
 485     return( \@reading, $reading_sigla, $recursed );
 486 }
 487
 488 # Add a path for the specified sigla to connect the reading sequence.
 489 # Add an a.c. path to the base sequence if we have an explicitly p.c.
 490 # reading.
 491 # Also handle the paths for sigla we have already added in recursive
 492 # apparatus readings (i.e. don't add a path if one already exists.)
 493
 494 sub _add_sigil_path {
 495     my( $c, $sigla, $base_sequence, $reading_sequence ) = @_;
 496     my %skip;
 497     foreach my $sig ( keys %$sigla ) {
 498         my $use_sig = $sigla->{$sig} eq '_AC_' ? $sig.$c->ac_label : $sig;
 499         foreach my $i ( 0 .. $#{$reading_sequence}-1 ) {
 500             if( $skip{$use_sig} ) {
 501                 next if !_has_prior_reading( $reading_sequence->[$i], $use_sig );
 502                 $skip{$use_sig} = 0;
 503             }
 504             if( _has_next_reading( $reading_sequence->[$i], $use_sig ) ) {
 505                 $skip{$use_sig} = 1;
 506                 next;
 507             }
 508             $c->add_path( $reading_sequence->[$i], $reading_sequence->[$i+1], $use_sig );
 509         }
 510         if( $sigla->{$sig} eq '_PC_') {
 511             $use_sig = $sig.$c->ac_label;
 512             foreach my $i ( 0 .. $#{$base_sequence}-1 ) {
 513                 if( $skip{$use_sig} ) {
 514                     next if !_has_prior_reading( $reading_sequence->[$i], $use_sig );
 515                     $skip{$use_sig} = 0;
 516                 }
 517                 if( _has_next_reading( $reading_sequence->[$i], $use_sig ) ) {
 518                     $skip{$use_sig} = 1;
 519                     next;
 520                 }
 521                 $c->add_path( $base_sequence->[$i], $base_sequence->[$i+1], $use_sig );
 522             }
 523         }
 524     }
 525 }
 526
 527 # Remove all ATTACH* nodes, linking the readings on either side of them.
 528 # Then walk the collation for all witness paths, and make sure those paths
 529 # explicitly exist.  Then delete all the 'base' paths.
 530
 531 sub expand_all_paths {
 532     my( $c ) = @_;
 533
 534     # Delete the anchors
 535     foreach my $anchor ( grep { $_->name =~ /ATTACH/ } $c->readings ) {
 536         # Map each path to its incoming/outgoing node.
 537         my %incoming;
 538         map { $incoming{$_->label} = $_->from } $anchor->incoming();
 539         my %outgoing;
 540         map { $outgoing{$_->label} = $_->to } $anchor->outgoing();
 541         $c->del_reading( $anchor );
 542
 543         # Connect in and out.
 544         my $aclabel = $c->ac_label;
 545         foreach my $edge ( keys %incoming ) {
 546             my $from = $incoming{$edge};
 547             my $to = $outgoing{$edge};
 548             if( !$to && $edge =~ /^(.*)\Q$aclabel\E$/ ) {
 549                 $to = $outgoing{$1};
 550             }
 551             $to = $outgoing{$c->baselabel} unless $to;
 552             $DB::single = 1 unless $to;
 553             warn "Have no outbound base link on " . $anchor->name . "!"
 554                 unless $to;
 555             $c->add_path( $from, $to, $edge );
 556             delete $outgoing{$edge} unless $edge eq $c->baselabel;
 557         }
 558         foreach my $edge ( keys %outgoing ) {
 559             my $to = $outgoing{$edge};
 560             my $from = $incoming{$edge};
 561             if( !$from && $edge =~ /^(.*)\Q$aclabel\E$/ ) {
 562                 $from = $incoming{$1};
 563             }
 564             $from = $incoming{$c->baselabel} unless $from;
 565             warn "Have no inbound base link on " . $anchor->name . "!"
 566                 unless $from;
 567             $c->add_path( $from, $to, $edge );
 568         }
 569     }
 570
 571     $DB::single = 1;
 572     # Walk the collation and add paths if necessary
 573     foreach my $sig ( keys %ALL_SIGLA ) {
 574         my $wit = $c->tradition->witness( $sig );
 575         my @path = $c->reading_sequence( $c->start, $c->end, $sig );
 576         $wit->path( \@path );
 577         if( $ALL_SIGLA{$sig} > 1 ) {
 578             my @ac_path = $c->reading_sequence( $c->start, $c->end,
 579                                                 $sig.$c->ac_label, $sig );
 580             $wit->uncorrected_path( \@path );
 581             # a.c. paths are already there by default.
 582         }
 583         foreach my $i ( 1 .. $#path ) {
 584             # If there is no explicit path for this sigil between n-1 and n,
 585             # add it.
 586             my @sigedges = grep { $_->label eq $sig } $path[$i]->incoming;
 587             if( @sigedges ) {
 588                 warn "Found more than one path already for $sig" if @sigedges > 1;
 589                 warn "Would add second path for $sig"
 590                     unless $sigedges[0]->from eq $path[$i-1];
 591                 next;
 592             }
 593             $c->add_path( $path[$i-1], $path[$i], $sig );
 594         }
 595     }
 596
 597     # Delete all baselabel edges
 598     foreach my $edge ( grep { $_->label eq $c->baselabel } $c->paths ) {
 599         $c->del_edge( $edge );
 600     }
 601
 602     # Calculate ranks on graph nodes
 603     $c->calculate_ranks();
 604 }
 605
 606 sub _get_seq {
 607     my( $str ) = @_;
 608     my $seq = 1;
 609     my $lw = $str;
 610     if( $str =~ /^(.*)(\d)\x{b0}$/ ) {
 611         ( $lw, $seq) = ( $1, $2 );
 612     }
 613     return( $lw, $seq );
 614 }
 615
 616 # Normalize to lowercase, no punct
 617 sub _norm {
 618     my( $str ) = @_;
 619     $str =~ s/[^[:alnum:]]//g;
 620     return lc( $str );
 621 }
 622
 623 sub _has_next_reading {
 624     my( $rdg, $sigil ) = @_;
 625     return grep { $_->label eq $sigil } $rdg->outgoing();
 626 }
 627 sub _has_prior_reading {
 628     my( $rdg, $sigil ) = @_;
 629     return grep { $_->label eq $sigil } $rdg->incoming();
 630 }
 631 sub next_real_reading {
 632     my( $c, $rdg ) = @_;
 633     while( my $r = $c->next_reading( $rdg ) ) {
 634         return $r unless $r->is_meta;
 635         return $r if $r eq $c->end;
 636         $rdg = $r;
 637     }
 638 }
 639 # For debugging
 640 sub rstr {
 641     my @l = @_;
 642     if( ref( $_[0] ) eq 'ARRAY' ) {
 643         @l = @$_[0];
 644     }
 645     my $str = join( ' ', map { $_->text } @l );
 646     return $str;
 647 }
 648
 649 1;