stop tracking bbedit stuff; first pass at Collate! parsing
[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / CollateText.pm
CommitLineData
fa954f4c 1package Text::Tradition::Parser::CollateText;
2
3use strict;
4use warnings;
5
6=head1 NAME
7
8Text::Tradition::Parser::CollateText
9
10=head1 DESCRIPTION
11
12For an overview of the package, see the documentation for the
13Text::Tradition module.
14
15This module is meant for use with a set of text files saved from Word docs,
16which originated with the COLLATE collation program.
17
18=head1 SUBROUTINES
19
20=over
21
22=item B<parse>
23
24parse( $graph, $opts );
25
26Takes an initialized graph and a hashref of options, which must include:
27- 'base' - the base text referenced by the variants
28- 'format' - the format of the variant list
29- 'data' - the variants, in the given format.
30
31=cut
32
33my %ALL_SIGLA;
34
35sub parse {
36 my( $tradition, $opts ) = @_;
37 # String together the base text.
38 my $lineref_hash = read_stone_base( $opts->{'base'}, $tradition->collation );
39 # Note the sigla.
40 foreach my $sigil ( @{$opts->{'sigla'}} ) {
41 $ALL_SIGLA{$sigil} = 1;
42 $tradition->add_witness( 'sigil' => $sigil );
43 }
44 # Now merge on the apparatus entries.
45 merge_stone_apparatus( $tradition->collation, $lineref_hash, $opts->{'input'} );
46}
47
48=item B<read_stone_base>
49
50my $text_list = read_base( 'reference.txt', $collation );
51
52Takes a text file and a (presumed empty) collation object, adds the words
53as simple linear readings to the collation, and returns a hash of texts
54with line keys. This collation is now the starting point for application of
55apparatus entries in merge_base, e.g. from a CSV file or a Classical Text
56Editor file.
57
58The hash is of the form
59
60 { chapter_name => { line_ref => { start => node, end => node } } }
61
62=cut
63
64sub read_stone_base {
65 my( $base_file, $collation ) = @_;
66
67 # This array gives the first reading for each line. We put the
68 # common starting point in line zero.
69 my $last_reading = $collation->start();
70 my $lineref_hash = {};
71 my $last_lineref;
72
73 my $curr_text;
74 open( BASE, $base_file ) or die "Could not open file $base_file: $!";
75 my $i = 1;
76 while(<BASE>) {
77 # Make the readings, and connect them up for the base, but
78 # also save the first reading of each line in a hash for the
79 # purpose.
80 chomp;
81 next if /^\s+$/; # skip blank lines
82 s/^(\d)\x{589}/$1:/; # turn Armenian full stops into colons
83 if( /^TESTAMENT/ ) {
84 # Initialize the base hash for this section.
85 $lineref_hash->{$_} = {};
86 $curr_text = $lineref_hash->{$_};
87 next;
88 }
89 my @words = split;
90 my $lineref;
91 if( /^\d/ ) {
92 # The first "word" is a line reference; keep it.
93 $lineref = shift @words;
94 } else {
95 # Assume we are dealing with the title.
96 $lineref = 'Title:';
97 }
98
99 # Now turn the remaining words into readings.
100 my $wordref = 0;
101 foreach my $w ( @words ) {
102 my $readingref = join( ',', $lineref, ++$wordref );
103 my $reading = $collation->add_reading( $readingref );
104 $reading->text( $w );
105 unless( exists $curr_text->{$lineref}->{'start'} ) {
106 $curr_text->{$lineref}->{'start'} = $reading;
107 }
108 # Add edge paths in the graph, for easier tracking when
109 # we start applying corrections. These paths will be
110 # removed when we're done.
111 my $path = $collation->add_path( $last_reading, $reading,
112 $collation->baselabel );
113 $last_reading = $reading;
114 }
115 $curr_text->{$lineref}->{'end'} = $last_reading;
116 }
117
118 close BASE;
119 # Ending point for all texts
120 $collation->add_path( $last_reading, $collation->end, $collation->baselabel );
121 return( $lineref_hash );
122}
123
124=item B<merge_stone_apparatus>
125
126Read an apparatus as output (presumably) by Collate. It should be reasonably
127regular in form, I hope. Merge the apparatus variants onto the appropriate
128lemma readings.
129
130=cut
131
132sub merge_stone_apparatus {
133 my( $c, $lineref_hash, $file ) = @_;
134
135 my $text_apps = {};
136 my $current_text;
137 open( APP, $file ) or die "Could not read apparatus file $file";
138 while( <APP> ) {
139 chomp;
140 next if /^\s+$/;
141 if( /^TESTAMENT/ ) {
142 $current_text = $lineref_hash->{$_};
143 next;
144 }
145
146 # Otherwise, the first word of the line is the base text line reference.
147 my $i = 0;
148 my $lineref;
149 if( s/^(\S+)// ) {
150 $lineref = $1;
151 } else {
152 warn "Unrecognized line $_";
153 }
154 my $baseline = $current_text->{$lineref};
155 # The start and end readings for this line are now in $baseline->{start}
156 # and $baseline->{end}.
157
158 # Now look at the apparatus entries for this line. They are
159 # split with |.
160 my @apps = split( '|' );
161 foreach my $app ( @apps ) {
162 my( $lemma, $rest ) = split( ']', $app );
163
164 # Find the lemma reading.
165 my( $lemma_start, $lemma_end ) =
166 _find_reading_on_line( $c, $lemma, $baseline );
167 my @lemma_chain = $c->reading_sequence( $lemma_start, $lemma_end );
168
169 # Splice in "start" and "end" placeholders on either
170 # side of the lemma.
171 my ( $rdg_start, $rdg_end ) =
172 _add_reading_placeholders( $c, $lemma_start, $lemma_end );
173
174 # For each reading, attach it to the lemma.
175 my @indiv = split( ' ', $rest );
176 foreach my $rdg ( @indiv ) {
177 # Parse the string.
178 my( $words, $sigla, $recurse ) = parse_app_entry( $rdg );
179 my @readings;
180 foreach my $i ( 0 .. $#$words ) {
181 next if $i == 0 && $words->[$i] =~ /^__/;
182 my $reading_id = $rdg_start->text . '_' . $rdg_end->text . '/' . $i;
183 my $reading = $c->add_reading( $reading_id );
184 $reading->text( $words->[$i] );
185 push( @readings, $reading );
186 }
187
188 # Deal with any specials.
189 my $lemma_sequence;
190 if( $words->[0] eq '__LEMMA__' ) {
191 $lemma_sequence = [ $lemma_end, $rdg_end ];
192 } elsif ( $rdg->[0] eq '__TRANSPOSE__' ) {
193 # Hope it is only two or three words in the lemma.
194 # TODO figure out how we really want to handle this
195 @readings = reverse @lemma_chain;
196 }
197 $lemma_sequence = [ $rdg_start, @lemma_chain, $rdg_end ]
198 unless $lemma_sequence;
199
200 # Now hook up the paths.
201 unshift( @readings, $rdg_start );
202 push( @readings, $rdg_end );
203 foreach my $i ( 1 .. $#readings ) {
204 if( $recurse->{$i} ) {
205 my( $rwords, $rsig ) = parse_app_entry( $recurse->{$i} );
206 # Get the local "lemma" sequence
207 my $llseq = [ $readings[$i], $readings[$i+1] ];
208 if( $rwords->[0] ne '__LEMMA__' ) {
209 # Treat it as an addition to the last word
210 unshift( @$llseq, $readings[$i-1] );
211 }
212 # Create the reading nodes in $rwords
213 # TODO Hope we don't meet ~ in a recursion
214 my $local_rdg = [];
215 foreach my $i ( 0 .. $#$rwords ) {
216 next if $i == 0 && $rwords->[$i] =~ /^__/;
217 my $reading_id = $llseq->[0]->text . '_' .
218 $llseq->[-1]->text . '/' . $i;
219 my $reading = $c->add_reading( $reading_id );
220 $reading->text( $words->[$i] );
221 push( @$local_rdg, $reading );
222 }
223 # Add the path(s) necessary
224 _add_sigil_path( $c, $rsig, $local_rdg, $llseq );
225 }
226 }
227 _add_sigil_path( $c, $sigla, \@readings, $lemma_sequence );
228 } # end processing of $app
229 } # end foreach my $app in line
230 } # end while <line>
231
232 # Now reconcile all the paths in the collation, and delete our
233 # temporary anchor nodes.
234 expand_all_paths( $c );
235
236 # Finally, calculate the ranks we've got.
237 $c->calculate_ranks;
238}
239
240sub _find_reading_on_line {
241 my( $c, $lemma, $baseline ) = @_;
242
243 my $lemma_start = $baseline->{'start'};
244 my $lemma_end;
245 my $too_far = $baseline->{'end'}->next_reading;
246 my @lemma_words = split( /\s+/, $lemma );
247
248 my %seen;
249 my $scrutinize = ''; # DEBUG variable
250 my $seq = 1;
251 while( $lemma_start ne $too_far ) {
252 # Loop detection
253 if( $seen{ $lemma_start->name() } ) {
254 warn "Detected loop at " . $lemma_start->name . " for lemma $lemma";
255 last;
256 }
257 $seen{ $lemma_start->name() } = 1;
258
259 # Try to match the lemma.
260 # TODO move next/prior reading methods into the reading classes,
261 # to make this more self-contained and not need to pass $c.
262 my $unmatch = 0;
263 my ( $lw, $seq ) = _get_seq( $lemma_words[0] );
264 print STDERR "Matching $lemma_start against $lw...\n"
265 if $scrutinize;
266 if( $lemma_start->text eq $lw ) {
267 # Skip it if we need a match that is not the first.
268 if( --$seq < 1 ) {
269 # Now we have to compare the rest of the words here.
270 if( scalar( @lemma_words ) > 1 ) {
271 my $next_reading =
272 $c->next_reading( $lemma_start );
273 my $wildcard = 0;
274 foreach my $w ( @lemma_words[1..$#lemma_words] ) {
275 if( $w eq '---' ) {
276 # We match everything to the next word.
277 $wildcard = 1;
278 next;
279 } else {
280 $wildcard = 0;
281 }
282 ( $lw, $seq ) = _get_seq( $w );
283 printf STDERR "Now matching %s against %s\n",
284 $next_reading->text, $lw
285 if $scrutinize;
286 if( !$wildcard && $w ne $next_reading->text) {
287 $unmatch = 1;
288 last;
289 } else {
290 $lemma_end = $next_reading;
291 $next_reading =
292 $c->next_reading( $lemma_end );
293 }
294 }
295 } else { # single-word match, easy.
296 $lemma_end = $lemma_start;
297 }
298 } else { # we need the Nth match and aren't there yet
299 $unmatch = 1;
300 }
301 }
302 last unless ( $unmatch || !defined( $lemma_end ) );
303 $lemma_end = undef;
304 $lemma_start = $c->next_reading( $lemma_start );
305 }
306
307 unless( $lemma_end ) {
308 warn "No match found for @lemma_words";
309 return undef;
310 }
311 return( $lemma_start, $lemma_end );
312}
313
314sub _add_reading_placeholders {
315 my( $collation, $lemma_start, $lemma_end ) = @_;
316 # We will splice in a 'begin' and 'end' marker on either side of the
317 # lemma, as sort of a double-endpoint attachment in the graph.
318
319 my $attachlabel = "ATTACH";
320 my( $start_node, $end_node );
321 my @start_id = grep { $_->label eq $attachlabel } $lemma_start->incoming;
322 if( @start_id ) {
323 # There already exists an app-begin node. Use that.
324 $start_node = $start_id[0]->from;
325 } else {
326 $start_node = $collation->add_reading( $app_info->{_id} );
327 $collation->add_path(
328 $collation->prior_reading( $lemma_start, $collation->baselabel ),
329 $start_node, $attachlabel );
330 $collation->add_path( $start_node, $lemma_start, $attachlabel );
331 }
332 # Now the converse for the end.
333 my @end_id = grep { $_->label eq $attachlabel } $lemma_end->outgoing;
334 if( @end_id ) {
335 # There already exists an app-begin node. Use that.
336 $end_node = $end_id[0]->to;
337 } else {
338 $end_node = $collation->add_reading( $app_info->{_id} . "E" );
339 $collation->add_path( $lemma_end, $end_node, $attachlabel );
340 $collation->add_path( $end_node,
341 $collation->next_reading( $lemma_end, $collation->baselabel ),
342 $attachlabel );
343 }
344 return( $start_node, $end_node );
345}
346
347# Function to parse an apparatus reading string, with reference to no other
348# data. Need to do this separately as readings can include readings (ugh).
349# Try to give whatever information we might need, including recursive app
350# entries that might need to be parsed.
351
352sub parse_app_entry {
353 my( $rdg, ) = @_;
354 $rdg =~ s/^\s+//;
355 $rdg =~ s/\s+$//;
356 next unless $rdg; # just in case
357 my @words = split( /\s+/, $rdg );
358 # Zero or more sigils e.g. +, followed by Armenian,
359 # followed by (possibly modified) sigla, followed by
360 # optional : with note.
361 my $is_add;
362 my $is_omission;
363 my $is_transposition;
364 my @reading;
365 my %reading_sigla;
366 my $recursed;
367 my $sig_regex = join( '|', keys %ALL_SIGLA );
368 while( @words ) {
369 my $bit = shift @words;
370 if( $bit eq '+' ) {
371 $is_add = 1;
372 } elsif( $bit eq 'om' ) {
373 $is_omission = 1;
374 } elsif( $bit eq '~' ) {
375 $is_transposition = 1;
376 } elsif( $bit =~ /\p{Armenian}/ ) {
377 warn "Found text in omission?!" if $is_omission;
378 push( @reading, $bit );
379 } elsif( $bit eq ':' ) {
380 # Stop processing.
381 last;
382 } elsif( $bit =~ /^\($/ ) {
383 # It's a recursive reading within a reading. Lemmatize what we
384 # have so far and grab the extra.
385 my @new = ( $1 );
386 until( $new[-1] =~ /\)$/ ) {
387 push( @new, shift @words );
388 }
389 my $recursed_reading = join( ' ', @new );
390 $recursed_reading =~ s/^\((.*)\)//;
391 # This recursive entry refers to the last reading word(s) we
392 # saw. Push its index+1. We will have to come back to parse
393 # it when we are dealing with the main reading.
394 # TODO handle () as first element
395 # TODO handle - as suffix to add, i.e. make new word
396 $recursed->{@reading} = $recursed_reading;
397 } elsif( $bit =~ /^(\Q$sig_regex\E)(.*)$/ {
398 # It must be a sigil.
399 my( $sigil, $mod ) = ( $1, $2 );
400 if( $mod eq "\x{80}" ) {
401 $reading_sigla->{$sig} = '_PC_';
402 $ALL_SIGLA{$sig} = 2; # a pre- and post-corr version exists
403 } elsif( $mod eq '*' ) {
404 $reading_sigla->{$sig} = '_AC_';
405 $ALL_SIGLA{$sig} = 2; # a pre- and post-corr version exists
406 } else {
407 $reading_sigla->{$sig} = 1 unless $mod; # skip secondhand corrections
408 }
409 } elsif( $bit =~ /transpos/ ) {
410 # There are some transpositions not coded rigorously; skip them.
411 warn "Found hard transposition in $rdg; fix manually";
412 last;
413 } else {
414 warn "Not sure what to do with bit $bit in $rdg";
415 }
416 }
417
418 # Transmogrify the reading if necessary.
419 unshift( @reading, '__LEMMA__' ) if $is_add;
420 unshift( @reading, '__TRANSPOSE__' ) if $is_transposition;
421 @reading = () if $is_omission;
422
423 return( \@reading, $reading_sigla, $recursed );
424}
425
426# Add a path for the specified sigla to connect the reading sequence.
427# Add an a.c. path to the base sequence if we have an explicitly p.c.
428# reading.
429# Also handle the paths for sigla we have already added in recursive
430# apparatus readings (i.e. don't add a path if one already exists.)
431
432sub _add_sigil_path {
433 my( $c, $sigla, $base_sequence, $reading_sequence ) = @_;
434 my %skip;
435 foreach my $sig ( keys %$sigla ) {
436 my $use_sig = $sigla->{$sig} eq '_AC_' ? $sig.$c->ac_label : $sig;
437 foreach my $i ( 0 .. $#$reading_sequence-1 ) {
438 if( $skip{$use_sig} ) {
439 next if !_has_prior_reading( $reading_sequence[$i], $use_sig );
440 $skip{$use_sig} = 0;
441 if( _has_next_reading( $reading_sequence[$i], $use_sig ) ) {
442 $skip{$use_sig} = 1;
443 next;
444 }
445 $c->add_path( $reading_sequence[$i], $reading_sequence[$i+1], $use_sig);
446 }
447 if( $sigla->{$sig} eq '_PC_') {
448 $use_sig = $sig.$c->ac_label
449 foreach my $i ( 0 .. @$base_sequence ) {
450 if( $skip{$use_sig} ) {
451 next if !_has_prior_reading( $reading_sequence[$i], $use_sig );
452 $skip{$use_sig} = 0;
453 if( _has_next_reading( $reading_sequence[$i], $use_sig ) ) {
454 $skip{$use_sig} = 1;
455 next;
456 }
457 $c->add_path( $base_sequence[$i], $base_sequence[$i+1], $use_sig );
458 }
459 }
460 }
461}
462
463# Remove all ATTACH* nodes, linking the readings on either side of them.
464# Then walk the collation for all witness paths, and make sure those paths
465# explicitly exist. Then delete all the 'base' paths.
466
467sub expand_all_paths {
468 my( $c ) = @_;
469
470 # Delete the anchors
471 foreach my $anchor ( grep { $_->name =~ /ATTACH/ } $c->readings ) {
472 # Map each path to its incoming/outgoing node.
473 my %incoming;
474 map { $incoming{$_->label} = $_->from } $anchor->incoming();
475 my %outgoing;
476 map { $outgoing{$_->label} = $_->to } $anchor->outgoing();
477 $c->del_reading( $anchor );
478
479 # Connect in and out.
480 my $aclabel = $c->ac_label;
481 foreach my $edge ( keys %incoming ) {
482 my $from = $incoming{$edge};
483 my $to = $outgoing{$edge};
484 if( !$to && $edge =~ /^(.*)\Q$aclabel\E$/ ) {
485 $to = $outgoing{$1};
486 }
487 $to = $outgoing{$c->baselabel} unless $to;
488 warn "Have no outbound base link on " . $anchor->name . "!"
489 unless $to;
490 $c->add_path( $from, $to, $edge );
491 }
492 # TODO Think about deleting outgoing/edge as we use them to make this faster.
493 foreach my $edge ( keys %outgoing ) {
494 my $to = $outgoing{$edge};
495 my $from = incoming{$edge};
496 if( !$from && $edge =~ /^(.*)\Q$aclabel\E$/ ) {
497 $from = $incoming{$1};
498 }
499 $from = $incoming{$c->baselabel} unless $to;
500 warn "Have no inbound base link on " . $anchor->name . "!"
501 unless $from;
502 $c->add_path( $from, $to, $edge )
503 unless _has_prior_reading( $to, $edge );
504 }
505 }
506 }
507
508 # Walk the collation and add paths if necessary
509 foreach my $sig ( keys %ALL_SIGLA ) {
510 my $wit = $c->tradition->witness( $sig );
511 my @path = $c->reading_sequence( $c->start, $c->end, $sig );
512 $wit->path( \@path );
513 if( $ALL_SIGLA{$sig} > 1 ) {
514 my @ac_path = $c->reading_sequence( $c->start, $c->end,
515 $sig.$c->ac_label, $sig );
516 $wit->uncorrected_path( \@path );
517 # a.c. paths are already there by default.
518 }
519 foreach my $i ( 1 .. $#$path ) {
520 # If there is no explicit path for this sigil between n-1 and n,
521 # add it.
522 unless( grep { $_->label eq $sig } $path[$i]->edges_from( $path[$i-1] ) ) {
523 $c->add_path( $path[$i-1], $path[$i], $sig );
524 }
525 }
526 }
527
528 # Delete all baselabel edges
529 foreach my $edge ( grep { $_->label eq $c->baselabel } $c->paths ) {
530 $c->del_edge( $edge );
531 }
532
533 # Calculate ranks on graph nodes
534 $c->calculate_ranks();
535}
536
537sub _get_seq {
538 my( $str ) = @_;
539 my $seq = 1;
540 my $lw = $str;
541 if( $str =~ /^(.*)(\d)\x{80}$/ ) {
542 ( $lw, $seq) = ( $1, $2 );
543 }
544 return( $lw, $seq );
545}
546
547sub _has_next_reading {
548 my( $rdg, $sigil ) = @_;
549 return grep { $_->label eq $sigil } $rdg->outgoing();
550}
551sub _has_prior_reading {
552 my( $rdg, $sigil ) = @_;
553 return grep { $_->label eq $sigil } $rdg->incoming();
554}