1 package Text::Tradition::Parser::CollateText;
8 Text::Tradition::Parser::CollateText
12 For an overview of the package, see the documentation for the
13 Text::Tradition module.
15 This module is meant for use with a set of text files saved from Word docs,
16 which originated with the COLLATE collation program.
24 parse( $graph, $opts );
26 Takes an initialized graph and a hashref of options, which must include:
27 - 'base' - the base text referenced by the variants
28 - 'format' - the format of the variant list
29 - 'data' - the variants, in the given format.
36 my( $tradition, $opts ) = @_;
37 # String together the base text.
38 my $lineref_hash = read_stone_base( $opts->{'base'}, $tradition->collation );
40 foreach my $sigil ( @{$opts->{'sigla'}} ) {
41 $ALL_SIGLA{$sigil} = 1;
42 $tradition->add_witness( sigil => $sigil, sourcetype => 'collation' );
44 # Now merge on the apparatus entries.
45 merge_stone_apparatus( $tradition->collation, $lineref_hash, $opts->{'file'} );
48 =item B<read_stone_base>
50 my $text_list = read_base( 'reference.txt', $collation );
52 Takes a text file and a (presumed empty) collation object, adds the words
53 as simple linear readings to the collation, and returns a hash of texts
54 with line keys. This collation is now the starting point for application of
55 apparatus entries in merge_base, e.g. from a CSV file or a Classical Text
58 The hash is of the form
60 { chapter_name => { line_ref => { start => node, end => node } } }
65 my( $base_file, $collation ) = @_;
67 # This array gives the first reading for each line. We put the
68 # common starting point in line zero.
69 my $last_reading = $collation->start();
70 my $lineref_hash = {};
74 open( BASE, $base_file ) or die "Could not open file $base_file: $!";
75 binmode BASE, ':utf8';
78 # Make the readings, and connect them up for the base, but
79 # also save the first reading of each line in a hash for the
82 next if /^\s+$/; # skip blank lines
83 s/^(\d)\x{589}/$1:/; # turn Armenian full stops into colons
85 # Initialize the base hash for this section.
86 $lineref_hash->{$_} = {};
87 $curr_text = $lineref_hash->{$_};
93 # The first "word" is a line reference; keep it.
94 $lineref = shift @words;
96 # Assume we are dealing with the title.
100 # Now turn the remaining words into readings.
102 foreach my $w ( @words ) {
103 my $readingref = join( ',', $lineref, ++$wordref );
104 my $reading = $collation->add_reading( $readingref );
105 $reading->text( $w );
106 unless( exists $curr_text->{$lineref}->{'start'} ) {
107 $curr_text->{$lineref}->{'start'} = $reading;
109 # Add edge paths in the graph, for easier tracking when
110 # we start applying corrections. These paths will be
111 # removed when we're done.
112 my $path = $collation->add_path( $last_reading, $reading,
113 $collation->baselabel );
114 $last_reading = $reading;
116 $curr_text->{$lineref}->{'end'} = $last_reading;
120 # Ending point for all texts
121 $collation->add_path( $last_reading, $collation->end, $collation->baselabel );
122 return( $lineref_hash );
125 =item B<merge_stone_apparatus>
127 Read an apparatus as output (presumably) by Collate. It should be reasonably
128 regular in form, I hope. Merge the apparatus variants onto the appropriate
133 sub merge_stone_apparatus {
134 my( $c, $lineref_hash, $file ) = @_;
138 open( APP, $file ) or die "Could not read apparatus file $file";
139 binmode APP, ':utf8';
144 $current_text = $lineref_hash->{$_};
148 # Otherwise, the first word of the line is the base text line reference.
151 if( s/^(\S+)\s+// ) {
154 warn "Unrecognized line $_";
156 my $baseline = $current_text->{$lineref};
157 # The start and end readings for this line are now in $baseline->{start}
158 # and $baseline->{end}.
160 # Now look at the apparatus entries for this line. They are
162 my @apps = split( /\s+\|\s+/ );
164 foreach my $app ( @apps ) {
165 my( $lemma, $rest ) = split( /\s+\]\s+/, $app );
166 next unless $rest; # Skip lines e.g. 'Chapter 2'
167 # Find the lemma reading.
168 my( $lemma_start, $lemma_end ) =
169 _find_reading_on_line( $c, $lemma, $baseline );
170 unless( $lemma_start && $lemma_end ) {
171 print STDERR "Lemma $lemma not found; skipping readings $rest\n";
174 my( $rdg_start, $rdg_end, @lemma_chain );
175 if( $lemma_start eq '__PRIOR__' ) {
176 # Deal with 'inc' readings: lemma chain should be empty, rdg_start
177 # is a placeholder, rdg_end is $lemma_end.
178 $rdg_start = _add_reading_placeholders( $c, $lemma_end );
179 $rdg_end = $lemma_end;
181 @lemma_chain = $c->reading_sequence( $lemma_start, $lemma_end );
182 # Splice in "start" and "end" placeholders on either
184 ( $rdg_start, $rdg_end ) =
185 _add_reading_placeholders( $c, $lemma_start, $lemma_end );
187 # For each reading, attach it to the lemma.
188 my @indiv = split( / /, $rest );
191 map { $seen_sigla{$_} = 0 } keys %ALL_SIGLA;
192 foreach my $rdg ( @indiv ) {
194 my( $words, $sigla, $recurse ) = parse_app_entry( $rdg );
196 # Do something really very dodgy indeed.
197 if( exists $sigla->{'__REL__'} && !$has_rel ) {
198 # Handling this has to be deferred until the end, so push it
199 # back onto @indiv and note that we've done so.
201 push( @indiv, $rdg );
206 foreach my $rdg_word ( @$words ) {
207 next if $rdg_word =~ /^__/;
208 my $reading_id = ref( $lemma_start )
209 ? $lemma_start->name : $lemma_start;
210 $reading_id .= '_' . $lemma_end->name . '/' . $rdg_ctr++;
211 my $reading = $c->add_reading( $reading_id );
212 $reading->text( $rdg_word );
213 push( @readings, $reading );
216 # Deal with any specials.
218 if( @$words && $words->[0] eq '__LEMMA__'
219 && $lemma_end ne $rdg_end ) {
220 # It's an addition (unless lemma_end eq rdg_end, in which case
221 # it's an 'inc'.) Start from lemma rather than from placeholder.
222 $lemma_sequence = [ $lemma_end, $rdg_end ];
223 } elsif ( @$words && $words->[0] eq '__TRANSPOSE__' ) {
224 # Hope it is only two or three words in the lemma.
225 # TODO figure out how we really want to handle this
226 @readings = reverse @lemma_chain;
228 $lemma_sequence = [ $rdg_start, @lemma_chain, $rdg_end ]
229 unless $lemma_sequence;
231 # Note which sigla we are actually dealing with.
232 if( $sigla->{'__REL__'} ) {
233 delete $sigla->{'__REL__'};
234 map { $sigla->{$_} = 1 }
235 grep { $seen_sigla{$_} == 0 } keys %seen_sigla;
237 map { $seen_sigla{$_} = 1 } keys %$sigla;
240 # Now hook up the paths.
241 unshift( @readings, $lemma_sequence->[0] );
242 push( @readings, $lemma_sequence->[-1] );
243 foreach my $i ( 1 .. $#readings ) {
244 if( $recurse->{$i} ) {
245 my( $rwords, $rsig ) = parse_app_entry( $recurse->{$i} );
246 # Get the local "lemma" sequence
247 my $llseq = [ $readings[$i], $readings[$i+1] ];
248 if( $rwords->[0] ne '__LEMMA__' ) {
249 unshift( @$llseq, $readings[$i-1] );
250 } # Otherwise treat it as an addition to the last word
251 # Create the reading nodes in $rwords
252 # TODO Hope we don't meet ~ in a recursion
254 foreach my $i ( 0 .. $#$rwords ) {
255 next if $i == 0 && $rwords->[$i] =~ /^__/;
256 my $reading_id = $llseq->[0]->name . '_' .
257 $llseq->[-1]->name . '/' . $i;
258 $reading_id =~ s/ATTACH//g;
259 my $reading = $c->add_reading( $reading_id );
260 $reading->text( $rwords->[$i] );
261 push( @$local_rdg, $reading );
263 unshift( @$local_rdg, $llseq->[0] );
264 push( @$local_rdg, $llseq->[-1] );
265 # Add the path(s) necessary
266 _add_sigil_path( $c, $rsig, $llseq, $local_rdg );
269 _add_sigil_path( $c, $sigla, $lemma_sequence, \@readings );
270 } # end processing of $app
271 } # end foreach my $app in line
274 # Now reconcile all the paths in the collation, and delete our
275 # temporary anchor nodes.
276 expand_all_paths( $c );
278 # Finally, calculate the ranks we've got.
279 # $c->calculate_ranks;
281 # Save the text for each witness so that we can ensure consistency
283 $tradition->collation->text_from_paths();
286 sub _find_reading_on_line {
287 my( $c, $lemma, $baseline, $prior ) = @_;
289 if( $lemma eq 'totum' ) {
290 # We want the whole line.
291 return( $baseline->{'start'}, $baseline->{'end'} );
292 } elsif( $lemma eq 'inc' ) {
293 # We want to shove things in before the line begins.
294 return( '__PRIOR__', $baseline->{'start'} );
297 my $lemma_start = $baseline->{'start'};
299 my $too_far = $c->next_reading( $baseline->{'end'} );
300 my @lemma_words = split( /\s+/, $lemma );
303 my $scrutinize = ''; # DEBUG variable
304 my ( $lw, $seq ) = _get_seq( $lemma_words[0] );
305 while( $lemma_start ne $too_far ) {
307 if( $seen{ $lemma_start->name() } ) {
308 warn "Detected loop at " . $lemma_start->name . " for lemma $lemma";
311 $seen{ $lemma_start->name() } = 1;
313 # Try to match the lemma.
314 # TODO move next/prior reading methods into the reading classes,
315 # to make this more self-contained and not need to pass $c.
317 print STDERR "Matching ".$lemma_start->text." against $lw...\n"
319 if( _norm( $lemma_start->text ) eq _norm( $lw ) ) {
320 # Skip it if we need a match that is not the first.
322 # Now we have to compare the rest of the words here.
323 if( scalar( @lemma_words ) > 1 ) {
324 my $next_reading = next_real_reading( $c, $lemma_start );
326 foreach my $w ( @lemma_words[1..$#lemma_words] ) {
332 # This should be the word after a --- now, and the
334 my( $wst, $wend ) = _find_reading_on_line( $c, $w,
335 $baseline, $lemma_start );
336 warn "Something unexpected" unless $wst eq $wend;
341 # If we got this far, there is no wildcard. We must
342 # match each word in sequence.
343 my( $nlw, $nseq ) = _get_seq( $w );
344 printf STDERR "Now matching %s against %s\n",
345 $next_reading->text, $nlw
347 if( _norm( $nlw ) eq _norm( $next_reading->text ) ) {
348 $lemma_end = $next_reading;
349 $next_reading = $c->next_reading( $lemma_end );
355 } else { # single-word match, easy.
356 $lemma_end = $lemma_start;
358 } else { # we need the Nth match and aren't there yet
361 $unmatch = 1 if $prior && !$seen{$prior->name};
363 last unless ( $unmatch || !defined( $lemma_end ) );
365 $lemma_start = $c->next_reading( $lemma_start );
368 unless( $lemma_end ) {
369 warn "No match found for @lemma_words";
372 return( $lemma_start, $lemma_end );
375 sub _add_reading_placeholders {
376 my( $collation, $lemma_start, $lemma_end ) = @_;
377 # We will splice in a 'begin' and 'end' marker on either side of the
378 # lemma, as sort of a double-endpoint attachment in the graph.
379 # Note that all of this assumes we have a linear base graph at this
380 # point, and no diverging readings on the lemmas.
382 my $start_node = $collation->prior_reading( $lemma_start );
383 unless( $start_node->name =~ /ATTACH/ ) {
384 my $sn_id = '#ATTACH_' . $lemma_start->name . '_START#';
385 my $prior = $start_node;
386 $start_node = $collation->add_reading( $sn_id );
387 $start_node->is_meta( 1 );
388 $collation->graph->del_edge( $collation->graph->edge( $prior, $lemma_start ) );
389 $collation->add_path( $prior, $start_node, $collation->baselabel );
390 $collation->add_path( $start_node, $lemma_start, $collation->baselabel );
392 return $start_node unless $lemma_end;
394 # Now the converse for the end.
395 my $end_node = $collation->next_reading( $lemma_end );
396 unless( $end_node->name =~ /ATTACH/ ) {
397 my $en_id = '#ATTACH_' . $lemma_end->name . '_END#';
398 my $next = $end_node;
399 $end_node = $collation->add_reading( $en_id );
400 $end_node->is_meta( 1 );
401 $collation->graph->del_edge( $collation->graph->edge( $lemma_end, $next ) );
402 $collation->add_path( $lemma_end, $end_node, $collation->baselabel );
403 $collation->add_path( $end_node, $next, $collation->baselabel );
406 return( $start_node, $end_node );
409 # Function to parse an apparatus reading string, with reference to no other
410 # data. Need to do this separately as readings can include readings (ugh).
411 # Try to give whatever information we might need, including recursive app
412 # entries that might need to be parsed.
414 sub parse_app_entry {
418 next unless $rdg; # just in case
419 my @words = split( /\s+/, $rdg );
420 # Zero or more sigils e.g. +, followed by Armenian,
421 # followed by (possibly modified) sigla, followed by
422 # optional : with note.
425 my $is_transposition;
429 my $reading_sigla = {};
431 my $sig_regex = join( '|', sort { length $b <=> length $a } keys %ALL_SIGLA );
433 my $bit = shift @words;
436 } elsif( $bit eq 'om' ) {
438 } elsif( $bit eq '~' ) {
439 $is_transposition = 1;
440 } elsif( $bit =~ /\p{Armenian}/ ) {
441 warn "Found text in omission?!" if $is_omission;
442 push( @reading, $bit );
443 } elsif( $bit eq ':' ) {
446 } elsif( $bit =~ /^\(/ ) {
447 # It's a recursive reading within a reading. Lemmatize what we
448 # have so far and grab the extra.
450 until( $new[-1] =~ /\)$/ ) {
451 push( @new, shift @words );
453 my $recursed_reading = join( ' ', @new );
454 $recursed_reading =~ s/^\((.*)\)/$1/;
455 # This recursive entry refers to the last reading word(s) we
456 # saw. Push its index+1. We will have to come back to parse
457 # it when we are dealing with the main reading.
458 # TODO handle () as first element
459 # TODO handle - as suffix to add, i.e. make new word
460 $recursed->{@reading} = $recursed_reading;
461 } elsif( $bit =~ /^($sig_regex)(.*)$/ ) {
462 # It must be a sigil.
463 my( $sigil, $mod ) = ( $1, $2 );
464 if( $mod eq "\x{80}" ) {
465 $reading_sigla->{$sigil} = '_PC_';
466 $ALL_SIGLA{$sigil} = 2; # a pre- and post-corr version exists
467 } elsif( $mod eq '*' ) {
468 $reading_sigla->{$sigil} = '_AC_';
469 $ALL_SIGLA{$sigil} = 2; # a pre- and post-corr version exists
471 $reading_sigla->{$sigil} = 1 unless $mod; # skip secondhand corrections
473 } elsif( $bit eq 'rel' ) {
474 # The anti-reading. All sigla except those cited.
475 $reading_sigla->{'__REL__'} = 1;
476 } elsif( $bit eq 'ed' ) {
477 # An emendation. TODO make sure all other sigla appear in readings?
480 } elsif( $bit =~ /transpos/ ) {
481 # There are some transpositions not coded rigorously; skip them.
482 warn "Found hard transposition in $rdg; fix manually";
485 warn "Not sure what to do with bit $bit in $rdg";
491 return( [], {}, {} ) if $skip;
492 # Transmogrify the reading if necessary.
493 unshift( @reading, '__LEMMA__' ) if $is_add;
494 unshift( @reading, '__TRANSPOSE__' ) if $is_transposition;
495 @reading = () if $is_omission;
496 unless( @reading || $is_omission ) {
497 # It was just sigla on a line, meaning the base changed. Thus
498 # the reading is the lemma.
499 unshift( @reading, '__LEMMA__' );
502 return( \@reading, $reading_sigla, $recursed );
505 # Add a path for the specified sigla to connect the reading sequence.
506 # Add an a.c. path to the base sequence if we have an explicitly p.c.
508 # Also handle the paths for sigla we have already added in recursive
509 # apparatus readings (i.e. don't add a path if one already exists.)
511 sub _add_sigil_path {
512 my( $c, $sigla, $base_sequence, $reading_sequence ) = @_;
514 foreach my $sig ( keys %$sigla ) {
515 my $use_sig = $sigla->{$sig} eq '_AC_' ? $sig.$c->ac_label : $sig;
516 foreach my $i ( 0 .. $#{$reading_sequence}-1 ) {
517 if( $skip{$use_sig} ) {
518 next if !_has_prior_reading( $reading_sequence->[$i], $use_sig );
521 if( _has_next_reading( $reading_sequence->[$i], $use_sig ) ) {
525 $c->add_path( $reading_sequence->[$i], $reading_sequence->[$i+1], $use_sig );
527 if( $sigla->{$sig} eq '_PC_') {
528 $use_sig = $sig.$c->ac_label;
529 foreach my $i ( 0 .. $#{$base_sequence}-1 ) {
530 if( $skip{$use_sig} ) {
531 next if !_has_prior_reading( $reading_sequence->[$i], $use_sig );
534 if( _has_next_reading( $reading_sequence->[$i], $use_sig ) ) {
538 $c->add_path( $base_sequence->[$i], $base_sequence->[$i+1], $use_sig );
544 # Walk the collation for all witness paths, delete the ATTACH anchor nodes,
545 # and then nuke and re-draw all edges (thus getting rid of the base).
547 sub expand_all_paths {
550 # Walk the collation and fish out the paths for each witness
551 foreach my $sig ( keys %ALL_SIGLA ) {
552 my $wit = $c->tradition->witness( $sig );
553 my @path = grep { $_->name !~ /ATTACH/ }
554 $c->reading_sequence( $c->start, $c->end, $sig );
555 $wit->path( \@path );
556 if( $ALL_SIGLA{$sig} > 1 ) {
557 my @ac_path = grep { $_->name !~ /ATTACH/ }
558 $c->reading_sequence( $c->start, $c->end, $sig.$c->ac_label );
559 $wit->uncorrected_path( \@ac_path );
564 foreach my $anchor ( grep { $_->name =~ /ATTACH/ } $c->readings ) {
565 $c->del_reading( $anchor );
568 map { $c->del_path( $_ ) } $c->paths;
570 # Make the path edges
571 $c->make_witness_paths();
578 if( $str =~ /^(.*)(\d)\x{b0}$/ ) {
579 ( $lw, $seq) = ( $1, $2 );
584 # Normalize to lowercase, no punct
587 $str =~ s/[^[:alnum:]]//g;
591 sub _has_next_reading {
592 my( $rdg, $sigil ) = @_;
593 return grep { $_->label eq $sigil } $rdg->outgoing();
595 sub _has_prior_reading {
596 my( $rdg, $sigil ) = @_;
597 return grep { $_->label eq $sigil } $rdg->incoming();
599 sub next_real_reading {
601 while( my $r = $c->next_reading( $rdg ) ) {
602 return $r unless $r->is_meta;
603 return $r if $r eq $c->end;
610 if( ref( $_[0] ) eq 'ARRAY' ) {
613 my $str = join( ' ', map { $_->text } @l );