lib/Text/Tradition/Parser/CollateText.pm

   1 package Text::Tradition::Parser::CollateText;
   2
   3 use strict;
   4 use warnings;
   5
   6 =head1 NAME
   7
   8 Text::Tradition::Parser::CollateText
   9
  10 =head1 DESCRIPTION
  11
  12 For an overview of the package, see the documentation for the
  13 Text::Tradition module.
  14
  15 This module is meant for use with a set of text files saved from Word docs,
  16 which originated with the COLLATE collation program.
  17
  18 =head1 SUBROUTINES
  19
  20 =over
  21
  22 =item B<parse>
  23
  24 parse( $graph, $opts );
  25
  26 Takes an initialized graph and a hashref of options, which must include:
  27 - 'base' - the base text referenced by the variants
  28 - 'format' - the format of the variant list
  29 - 'data' - the variants, in the given format.
  30
  31 =cut
  32
  33 my %ALL_SIGLA;
  34
  35 sub parse {
  36     my( $tradition, $opts ) = @_;
  37     # String together the base text.
  38     my $lineref_hash = read_stone_base( $opts->{'base'}, $tradition->collation );
  39     # Note the sigla.
  40     foreach my $sigil ( @{$opts->{'sigla'}} ) {
  41         $ALL_SIGLA{$sigil} = 1;
  42         $tradition->add_witness( 'sigil' => $sigil );
  43     }
  44     # Now merge on the apparatus entries.
  45     merge_stone_apparatus( $tradition->collation, $lineref_hash, $opts->{'input'} );
  46 }
  47
  48 =item B<read_stone_base>
  49
  50 my $text_list = read_base( 'reference.txt', $collation );
  51
  52 Takes a text file and a (presumed empty) collation object, adds the words
  53 as simple linear readings to the collation, and returns a hash of texts
  54 with line keys. This collation is now the starting point for application of
  55 apparatus entries in merge_base, e.g. from a CSV file or a Classical Text
  56 Editor file.
  57
  58 The hash is of the form
  59
  60  { chapter_name => { line_ref => { start => node, end => node } } }
  61
  62 =cut
  63
  64 sub read_stone_base {
  65     my( $base_file, $collation ) = @_;
  66
  67     # This array gives the first reading for each line.  We put the
  68     # common starting point in line zero.
  69     my $last_reading = $collation->start();
  70     my $lineref_hash = {};
  71     my $last_lineref;
  72
  73     my $curr_text;
  74     open( BASE, $base_file ) or die "Could not open file $base_file: $!";
  75     my $i = 1;
  76     while(<BASE>) {
  77         # Make the readings, and connect them up for the base, but
  78         # also save the first reading of each line in a hash for the
  79         # purpose.
  80         chomp;
  81         next if /^\s+$/; # skip blank lines
  82         s/^(\d)\x{589}/$1:/; # turn Armenian full stops into colons
  83         if( /^TESTAMENT/ ) {
  84             # Initialize the base hash for this section.
  85             $lineref_hash->{$_} = {};
  86             $curr_text = $lineref_hash->{$_};
  87             next;
  88         }
  89         my @words = split;
  90         my $lineref;
  91         if( /^\d/ ) {
  92             # The first "word" is a line reference; keep it.
  93             $lineref = shift @words;
  94         } else {
  95             # Assume we are dealing with the title.
  96             $lineref = 'Title:';
  97         }
  98
  99         # Now turn the remaining words into readings.
 100         my $wordref = 0;
 101         foreach my $w ( @words ) {
 102             my $readingref = join( ',', $lineref, ++$wordref );
 103             my $reading = $collation->add_reading( $readingref );
 104             $reading->text( $w );
 105             unless( exists $curr_text->{$lineref}->{'start'} ) {
 106                 $curr_text->{$lineref}->{'start'} = $reading;
 107             }
 108             # Add edge paths in the graph, for easier tracking when
 109             # we start applying corrections.  These paths will be
 110             # removed when we're done.
 111             my $path = $collation->add_path( $last_reading, $reading,
 112                                              $collation->baselabel );
 113             $last_reading = $reading;
 114         }
 115         $curr_text->{$lineref}->{'end'} = $last_reading;
 116     }
 117
 118     close BASE;
 119     # Ending point for all texts
 120     $collation->add_path( $last_reading, $collation->end, $collation->baselabel );
 121     return( $lineref_hash );
 122 }
 123
 124 =item B<merge_stone_apparatus>
 125
 126 Read an apparatus as output (presumably) by Collate.  It should be reasonably
 127 regular in form, I hope.  Merge the apparatus variants onto the appropriate
 128 lemma readings.
 129
 130 =cut
 131
 132 sub merge_stone_apparatus {
 133     my( $c, $lineref_hash, $file ) = @_;
 134
 135     my $text_apps = {};
 136     my $current_text;
 137     open( APP, $file ) or die "Could not read apparatus file $file";
 138     while( <APP> ) {
 139         chomp;
 140         next if /^\s+$/;
 141         if( /^TESTAMENT/ ) {
 142             $current_text = $lineref_hash->{$_};
 143             next;
 144         }
 145
 146         # Otherwise, the first word of the line is the base text line reference.
 147         my $i = 0;
 148         my $lineref;
 149         if( s/^(\S+)// ) {
 150             $lineref = $1;
 151         } else {
 152             warn "Unrecognized line $_";
 153         }
 154         my $baseline = $current_text->{$lineref};
 155         # The start and end readings for this line are now in $baseline->{start}
 156         # and $baseline->{end}.
 157
 158         # Now look at the apparatus entries for this line. They are
 159         # split with |.
 160         my @apps = split( '|' );
 161         foreach my $app ( @apps ) {
 162             my( $lemma, $rest ) = split( ']', $app );
 163
 164             # Find the lemma reading.
 165             my( $lemma_start, $lemma_end ) =
 166                 _find_reading_on_line( $c, $lemma, $baseline );
 167             my @lemma_chain = $c->reading_sequence( $lemma_start, $lemma_end );
 168
 169             # Splice in "start" and "end" placeholders on either
 170             # side of the lemma.
 171             my ( $rdg_start, $rdg_end ) =
 172                 _add_reading_placeholders( $c, $lemma_start, $lemma_end );
 173
 174             # For each reading, attach it to the lemma.
 175             my @indiv = split( '  ', $rest );
 176             foreach my $rdg ( @indiv ) {
 177                 # Parse the string.
 178                 my( $words, $sigla, $recurse ) = parse_app_entry( $rdg );
 179                 my @readings;
 180                 foreach my $i ( 0 .. $#$words ) {
 181                     next if $i == 0 && $words->[$i] =~ /^__/;
 182                     my $reading_id = $rdg_start->text . '_' . $rdg_end->text . '/' . $i;
 183                     my $reading = $c->add_reading( $reading_id );
 184                     $reading->text( $words->[$i] );
 185                     push( @readings, $reading );
 186                 }
 187
 188                 # Deal with any specials.
 189                 my $lemma_sequence;
 190                 if( $words->[0] eq '__LEMMA__' ) {
 191                     $lemma_sequence = [ $lemma_end, $rdg_end ];
 192                 } elsif ( $rdg->[0] eq '__TRANSPOSE__' ) {
 193                     # Hope it is only two or three words in the lemma.
 194                     # TODO figure out how we really want to handle this
 195                     @readings = reverse @lemma_chain;
 196                 }
 197                 $lemma_sequence = [ $rdg_start, @lemma_chain, $rdg_end ]
 198                     unless $lemma_sequence;
 199
 200                 # Now hook up the paths.
 201                 unshift( @readings, $rdg_start );
 202                 push( @readings, $rdg_end );
 203                 foreach my $i ( 1 .. $#readings ) {
 204                     if( $recurse->{$i} ) {
 205                         my( $rwords, $rsig ) = parse_app_entry( $recurse->{$i} );
 206                         # Get the local "lemma" sequence
 207                         my $llseq = [ $readings[$i], $readings[$i+1] ];
 208                         if( $rwords->[0] ne '__LEMMA__' ) {
 209                             # Treat it as an addition to the last word
 210                             unshift( @$llseq, $readings[$i-1] );
 211                         }
 212                         # Create the reading nodes in $rwords
 213                         # TODO Hope we don't meet ~ in a recursion
 214                         my $local_rdg = [];
 215                         foreach my $i ( 0 .. $#$rwords ) {
 216                             next if $i == 0 && $rwords->[$i] =~ /^__/;
 217                             my $reading_id = $llseq->[0]->text . '_' .
 218                                 $llseq->[-1]->text . '/' . $i;
 219                             my $reading = $c->add_reading( $reading_id );
 220                             $reading->text( $words->[$i] );
 221                             push( @$local_rdg, $reading );
 222                         }
 223                         # Add the path(s) necessary
 224                         _add_sigil_path( $c, $rsig, $local_rdg, $llseq );
 225                     }
 226                 }
 227                 _add_sigil_path( $c, $sigla, \@readings, $lemma_sequence );
 228             } # end processing of $app
 229         } # end foreach my $app in line
 230     } # end while <line>
 231
 232     # Now reconcile all the paths in the collation, and delete our
 233     # temporary anchor nodes.
 234     expand_all_paths( $c );
 235
 236     # Finally, calculate the ranks we've got.
 237     $c->calculate_ranks;
 238 }
 239
 240 sub _find_reading_on_line {
 241     my( $c, $lemma, $baseline ) = @_;
 242
 243     my $lemma_start = $baseline->{'start'};
 244     my $lemma_end;
 245     my $too_far = $baseline->{'end'}->next_reading;
 246     my @lemma_words = split( /\s+/, $lemma );
 247
 248     my %seen;
 249     my $scrutinize = '';   # DEBUG variable
 250     my $seq = 1;
 251     while( $lemma_start ne $too_far ) {
 252         # Loop detection
 253         if( $seen{ $lemma_start->name() } ) {
 254             warn "Detected loop at " . $lemma_start->name . " for lemma $lemma";
 255             last;
 256         }
 257         $seen{ $lemma_start->name() } = 1;
 258
 259         # Try to match the lemma.
 260         # TODO move next/prior reading methods into the reading classes,
 261         # to make this more self-contained and not need to pass $c.
 262         my $unmatch = 0;
 263         my ( $lw, $seq ) = _get_seq( $lemma_words[0] );
 264         print STDERR "Matching $lemma_start against $lw...\n"
 265             if $scrutinize;
 266         if( $lemma_start->text eq $lw ) {
 267             # Skip it if we need a match that is not the first.
 268             if( --$seq < 1 ) {
 269                 # Now we have to compare the rest of the words here.
 270                 if( scalar( @lemma_words ) > 1 ) {
 271                     my $next_reading =
 272                         $c->next_reading( $lemma_start );
 273                     my $wildcard = 0;
 274                     foreach my $w ( @lemma_words[1..$#lemma_words] ) {
 275                         if( $w eq '---' ) {
 276                             # We match everything to the next word.
 277                             $wildcard = 1;
 278                             next;
 279                         } else {
 280                             $wildcard = 0;
 281                         }
 282                         ( $lw, $seq ) = _get_seq( $w );
 283                         printf STDERR "Now matching %s against %s\n",
 284                                 $next_reading->text, $lw
 285                             if $scrutinize;
 286                         if( !$wildcard && $w ne $next_reading->text) {
 287                             $unmatch = 1;
 288                             last;
 289                         } else {
 290                             $lemma_end = $next_reading;
 291                             $next_reading =
 292                                 $c->next_reading( $lemma_end );
 293                         }
 294                     }
 295                 } else { # single-word match, easy.
 296                     $lemma_end = $lemma_start;
 297                 }
 298             } else { # we need the Nth match and aren't there yet
 299                 $unmatch = 1;
 300             }
 301         }
 302         last unless ( $unmatch || !defined( $lemma_end ) );
 303         $lemma_end = undef;
 304         $lemma_start = $c->next_reading( $lemma_start );
 305     }
 306
 307     unless( $lemma_end ) {
 308         warn "No match found for @lemma_words";
 309         return undef;
 310     }
 311     return( $lemma_start, $lemma_end );
 312 }
 313
 314 sub _add_reading_placeholders {
 315     my( $collation, $lemma_start, $lemma_end ) = @_;
 316     # We will splice in a 'begin' and 'end' marker on either side of the
 317     # lemma, as sort of a double-endpoint attachment in the graph.
 318
 319     my $attachlabel = "ATTACH";
 320     my( $start_node, $end_node );
 321     my @start_id = grep { $_->label eq $attachlabel } $lemma_start->incoming;
 322     if( @start_id ) {
 323         # There already exists an app-begin node. Use that.
 324         $start_node = $start_id[0]->from;
 325     } else {
 326         $start_node = $collation->add_reading( $app_info->{_id} );
 327         $collation->add_path(
 328             $collation->prior_reading( $lemma_start, $collation->baselabel ),
 329             $start_node, $attachlabel );
 330         $collation->add_path( $start_node, $lemma_start, $attachlabel );
 331     }
 332     # Now the converse for the end.
 333     my @end_id = grep { $_->label eq $attachlabel } $lemma_end->outgoing;
 334     if( @end_id ) {
 335         # There already exists an app-begin node. Use that.
 336         $end_node = $end_id[0]->to;
 337     } else {
 338         $end_node = $collation->add_reading( $app_info->{_id} . "E" );
 339         $collation->add_path( $lemma_end, $end_node, $attachlabel );
 340         $collation->add_path( $end_node,
 341             $collation->next_reading( $lemma_end, $collation->baselabel ),
 342             $attachlabel );
 343     }
 344     return( $start_node, $end_node );
 345 }
 346
 347 # Function to parse an apparatus reading string, with reference to no other
 348 # data.  Need to do this separately as readings can include readings (ugh).
 349 # Try to give whatever information we might need, including recursive app
 350 # entries that might need to be parsed.
 351
 352 sub parse_app_entry {
 353     my( $rdg, ) = @_;
 354     $rdg =~ s/^\s+//;
 355     $rdg =~ s/\s+$//;
 356     next unless $rdg;  # just in case
 357     my @words = split( /\s+/, $rdg );
 358     # Zero or more sigils e.g. +, followed by Armenian,
 359     # followed by (possibly modified) sigla, followed by
 360     # optional : with note.
 361     my $is_add;
 362     my $is_omission;
 363     my $is_transposition;
 364     my @reading;
 365     my %reading_sigla;
 366     my $recursed;
 367     my $sig_regex = join( '|', keys %ALL_SIGLA );
 368     while( @words ) {
 369         my $bit = shift @words;
 370         if( $bit eq '+' ) {
 371             $is_add = 1;
 372         } elsif( $bit eq 'om' ) {
 373             $is_omission = 1;
 374         } elsif( $bit eq '~' ) {
 375             $is_transposition = 1;
 376         } elsif( $bit =~ /\p{Armenian}/ ) {
 377             warn "Found text in omission?!" if $is_omission;
 378             push( @reading, $bit );
 379         } elsif( $bit eq ':' ) {
 380             # Stop processing.
 381             last;
 382         } elsif( $bit =~ /^\($/ ) {
 383             # It's a recursive reading within a reading. Lemmatize what we
 384             # have so far and grab the extra.
 385             my @new = ( $1 );
 386             until( $new[-1] =~ /\)$/ ) {
 387                 push( @new, shift @words );
 388             }
 389             my $recursed_reading = join( ' ', @new );
 390             $recursed_reading =~ s/^\((.*)\)//;
 391             # This recursive entry refers to the last reading word(s) we
 392             # saw.  Push its index+1.  We will have to come back to parse
 393             # it when we are dealing with the main reading.
 394             # TODO handle () as first element
 395             # TODO handle - as suffix to add, i.e. make new word
 396             $recursed->{@reading} = $recursed_reading;
 397         } elsif( $bit =~ /^(\Q$sig_regex\E)(.*)$/ {
 398             # It must be a sigil.
 399             my( $sigil, $mod ) = ( $1, $2 );
 400             if( $mod eq "\x{80}" ) {
 401                 $reading_sigla->{$sig} = '_PC_';
 402                 $ALL_SIGLA{$sig} = 2;  # a pre- and post-corr version exists
 403             } elsif( $mod eq '*' ) {
 404                 $reading_sigla->{$sig} = '_AC_';
 405                 $ALL_SIGLA{$sig} = 2;  # a pre- and post-corr version exists
 406             } else {
 407                 $reading_sigla->{$sig} = 1 unless $mod; # skip secondhand corrections
 408             }
 409         } elsif( $bit =~ /transpos/ ) {
 410             # There are some transpositions not coded rigorously; skip them.
 411             warn "Found hard transposition in $rdg; fix manually";
 412             last;
 413         } else {
 414             warn "Not sure what to do with bit $bit in $rdg";
 415         }
 416     }
 417
 418     # Transmogrify the reading if necessary.
 419     unshift( @reading, '__LEMMA__' ) if $is_add;
 420     unshift( @reading, '__TRANSPOSE__' ) if $is_transposition;
 421     @reading = () if $is_omission;
 422
 423     return( \@reading, $reading_sigla, $recursed );
 424 }
 425
 426 # Add a path for the specified sigla to connect the reading sequence.
 427 # Add an a.c. path to the base sequence if we have an explicitly p.c.
 428 # reading.
 429 # Also handle the paths for sigla we have already added in recursive
 430 # apparatus readings (i.e. don't add a path if one already exists.)
 431
 432 sub _add_sigil_path {
 433     my( $c, $sigla, $base_sequence, $reading_sequence ) = @_;
 434     my %skip;
 435     foreach my $sig ( keys %$sigla ) {
 436         my $use_sig = $sigla->{$sig} eq '_AC_' ? $sig.$c->ac_label : $sig;
 437         foreach my $i ( 0 .. $#$reading_sequence-1 ) {
 438             if( $skip{$use_sig} ) {
 439                 next if !_has_prior_reading( $reading_sequence[$i], $use_sig );
 440                 $skip{$use_sig} = 0;
 441             if( _has_next_reading( $reading_sequence[$i], $use_sig ) ) {
 442                 $skip{$use_sig} = 1;
 443                 next;
 444             }
 445             $c->add_path( $reading_sequence[$i], $reading_sequence[$i+1], $use_sig);
 446         }
 447         if( $sigla->{$sig} eq '_PC_') {
 448             $use_sig = $sig.$c->ac_label
 449             foreach my $i ( 0 .. @$base_sequence ) {
 450                 if( $skip{$use_sig} ) {
 451                     next if !_has_prior_reading( $reading_sequence[$i], $use_sig );
 452                     $skip{$use_sig} = 0;
 453                 if( _has_next_reading( $reading_sequence[$i], $use_sig ) ) {
 454                     $skip{$use_sig} = 1;
 455                     next;
 456                 }
 457                 $c->add_path( $base_sequence[$i], $base_sequence[$i+1], $use_sig );
 458             }
 459         }
 460     }
 461 }
 462
 463 # Remove all ATTACH* nodes, linking the readings on either side of them.
 464 # Then walk the collation for all witness paths, and make sure those paths
 465 # explicitly exist.  Then delete all the 'base' paths.
 466
 467 sub expand_all_paths {
 468     my( $c ) = @_;
 469
 470     # Delete the anchors
 471     foreach my $anchor ( grep { $_->name =~ /ATTACH/ } $c->readings ) {
 472         # Map each path to its incoming/outgoing node.
 473         my %incoming;
 474         map { $incoming{$_->label} = $_->from } $anchor->incoming();
 475         my %outgoing;
 476         map { $outgoing{$_->label} = $_->to } $anchor->outgoing();
 477         $c->del_reading( $anchor );
 478
 479         # Connect in and out.
 480         my $aclabel = $c->ac_label;
 481         foreach my $edge ( keys %incoming ) {
 482             my $from = $incoming{$edge};
 483             my $to = $outgoing{$edge};
 484             if( !$to && $edge =~ /^(.*)\Q$aclabel\E$/ ) {
 485                 $to = $outgoing{$1};
 486             }
 487             $to = $outgoing{$c->baselabel} unless $to;
 488             warn "Have no outbound base link on " . $anchor->name . "!"
 489                 unless $to;
 490             $c->add_path( $from, $to, $edge );
 491         }
 492         # TODO Think about deleting outgoing/edge as we use them to make this faster.
 493         foreach my $edge ( keys %outgoing ) {
 494             my $to = $outgoing{$edge};
 495             my $from = incoming{$edge};
 496             if( !$from && $edge =~ /^(.*)\Q$aclabel\E$/ ) {
 497                 $from = $incoming{$1};
 498             }
 499             $from = $incoming{$c->baselabel} unless $to;
 500             warn "Have no inbound base link on " . $anchor->name . "!"
 501                 unless $from;
 502             $c->add_path( $from, $to, $edge )
 503                 unless _has_prior_reading( $to, $edge );
 504             }
 505         }
 506     }
 507
 508     # Walk the collation and add paths if necessary
 509     foreach my $sig ( keys %ALL_SIGLA ) {
 510         my $wit = $c->tradition->witness( $sig );
 511         my @path = $c->reading_sequence( $c->start, $c->end, $sig );
 512         $wit->path( \@path );
 513         if( $ALL_SIGLA{$sig} > 1 ) {
 514             my @ac_path = $c->reading_sequence( $c->start, $c->end,
 515                                                 $sig.$c->ac_label, $sig );
 516             $wit->uncorrected_path( \@path );
 517             # a.c. paths are already there by default.
 518         }
 519         foreach my $i ( 1 .. $#$path ) {
 520             # If there is no explicit path for this sigil between n-1 and n,
 521             # add it.
 522             unless( grep { $_->label eq $sig } $path[$i]->edges_from( $path[$i-1] ) ) {
 523                 $c->add_path( $path[$i-1], $path[$i], $sig );
 524             }
 525         }
 526     }
 527
 528     # Delete all baselabel edges
 529     foreach my $edge ( grep { $_->label eq $c->baselabel } $c->paths ) {
 530         $c->del_edge( $edge );
 531     }
 532
 533     # Calculate ranks on graph nodes
 534     $c->calculate_ranks();
 535 }
 536
 537 sub _get_seq {
 538     my( $str ) = @_;
 539     my $seq = 1;
 540     my $lw = $str;
 541     if( $str =~ /^(.*)(\d)\x{80}$/ ) {
 542         ( $lw, $seq) = ( $1, $2 );
 543     }
 544     return( $lw, $seq );
 545 }
 546
 547 sub _has_next_reading {
 548     my( $rdg, $sigil ) = @_;
 549     return grep { $_->label eq $sigil } $rdg->outgoing();
 550 }
 551 sub _has_prior_reading {
 552     my( $rdg, $sigil ) = @_;
 553     return grep { $_->label eq $sigil } $rdg->incoming();
 554 }