change analysis graph calculation - closer but not correct yet.
[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / BaseText.pm
CommitLineData
e58153d6 1package Text::Tradition::Parser::BaseText;
b49c4318 2
3use strict;
4use warnings;
52ce987f 5use Module::Load;
910a0a6d 6use Text::Tradition::Parser::Util qw( collate_variants cmp_str check_for_repeated add_hash_entry );
b49c4318 7
2ceca8c3 8=head1 NAME
9
10Text::Tradition::Parser::BaseText
11
12=head1 SYNOPSIS
13
14use Text::Tradition::Parser::BaseText qw( merge_base );
15merge_base( $graph, 'reference.txt', @apparatus_entries )
16
17=head1 DESCRIPTION
18
19For an overview of the package, see the documentation for the
408449b7 20Text::Tradition module.
2ceca8c3 21
22This module is meant for use with certain of the other Parser classes
23- whenever a list of variants is given with reference to a base text,
24these must be joined into a single collation. The parser should
25therefore make a list of variants and their locations, and BaseText
26will join those listed variants onto the reference text.
27
28=head1 SUBROUTINES
29
30=over
31
52ce987f 32=item B<parse>
33
408449b7 34parse( $graph, $opts );
52ce987f 35
408449b7 36Takes an initialized graph and a hashref of options, which must include:
52ce987f 37- 'base' - the base text referenced by the variants
38- 'format' - the format of the variant list
39- 'data' - the variants, in the given format.
40
41=cut
42
43sub parse {
dfc37e38 44 my( $tradition, $opts ) = @_;
52ce987f 45
dfc37e38 46 my $format_mod = 'Text::Tradition::Parser::' . $opts->{'input'};
52ce987f 47 load( $format_mod );
408449b7 48 # TODO Handle a string someday if we ever have a format other than KUL
49 my @apparatus_entries = $format_mod->can('read')->( $opts );
dfc37e38 50 merge_base( $tradition->collation, $opts->{'base'}, @apparatus_entries );
52ce987f 51}
52
2ceca8c3 53=item B<merge_base>
54
55merge_base( $graph, 'reference.txt', @apparatus_entries )
56
57Takes three arguments: a newly-initialized Text::Tradition::Graph
58object, a text file containing the reference text, and a list of
59variants (apparatus entries). Adds the base text to the graph, and
60joins the variants to that.
61
62The list of variants is an array of hash references; each hash takes
63the form
64 { '_id' => line reference,
65 'rdg_0' => lemma reading,
66 'rdg_1' => first variant,
67 ... # and so on until all distinct readings are listed
68 'WitnessA' => 'rdg_0',
69 'WitnessB' => 'rdg_1',
70 ... # and so on until all witnesses are listed with their readings
71 }
72
73Any hash key that is not of the form /^rdg_\d+$/ and that does not
74begin with an underscore is assumed to be a witness name. Any 'meta'
75information to be passed must be passed in a key with a leading
76underscore in its name.
77
78=cut
79
b15511bf 80my $SHORTEND = ''; # Debug var - set this to limit the number of lines parsed
4ca00eca 81
82my %base_text_index;
6a222840 83my $edits_required = {};
4ca00eca 84
85# edits_required -> wit -> [ { start_idx, end_idx, items } ]
930ff666 86
b49c4318 87sub merge_base {
e2902068 88 my( $collation, $base_file, @app_entries ) = @_;
89 my @base_line_starts = read_base( $base_file, $collation );
b49c4318 90
52ce987f 91 my %all_witnesses;
6a222840 92 my @unwitnessed_lemma_nodes;
b49c4318 93 foreach my $app ( @app_entries ) {
910a0a6d 94 my( $line, $num ) = split( /\./, $app->{_id} );
95 # DEBUG with a short graph
96 last if $SHORTEND && $line > $SHORTEND;
97 # DEBUG for problematic entries
98 my $scrutinize = '';
99 my $first_line_reading = $base_line_starts[ $line ];
100 my $too_far = $base_line_starts[ $line+1 ];
101
102 my $lemma = $app->{rdg_0};
103 my $seq = 1;
104 # Is this the Nth occurrence of this reading in the line?
105 if( $lemma =~ s/(_)?(\d)$// ) {
106 $seq = $2;
107 }
108 my @lemma_words = split( /\s+/, $lemma );
109
110 # Now search for the lemma words within this line.
111 my $lemma_start = $first_line_reading;
112 my $lemma_end;
113 my %seen;
114 while( $lemma_start ne $too_far ) {
115 # Loop detection
116 if( $seen{ $lemma_start->name() } ) {
117 warn "Detected loop at " . $lemma_start->name() .
118 ", ref $line,$num";
119 last;
120 }
121 $seen{ $lemma_start->name() } = 1;
122
123 # Try to match the lemma.
124 my $unmatch = 0;
125 print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
126 $lemma_words[0] . "...\n"
127 if "$line.$num" eq $scrutinize;
128 if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
129 # Skip it if we need a match that is not the first.
130 if( --$seq < 1 ) {
131 # Now we have to compare the rest of the words here.
132 if( scalar( @lemma_words ) > 1 ) {
133 my $next_reading =
134 $collation->next_reading( $lemma_start );
135 foreach my $w ( @lemma_words[1..$#lemma_words] ) {
136 printf STDERR "Now matching %s against %s\n",
137 cmp_str($next_reading), $w
138 if "$line.$num" eq $scrutinize;
139 if( $w ne cmp_str($next_reading) ) {
140 $unmatch = 1;
141 last;
142 } else {
143 $lemma_end = $next_reading;
144 $next_reading =
145 $collation->next_reading( $lemma_end );
146 }
147 }
148 } else {
149 $lemma_end = $lemma_start;
150 }
151 } else {
152 $unmatch = 1;
153 }
154 }
155 last unless ( $unmatch || !defined( $lemma_end ) );
156 $lemma_end = undef;
157 $lemma_start = $collation->next_reading( $lemma_start );
158 }
159
160 unless( $lemma_end ) {
161 warn "No match found for @lemma_words at $line.$num";
162 next;
163 }
164
165 # Now we have found the lemma; we will record an 'edit', in
166 # terms of a splice operation, for each subsequent reading.
167 # We also note which witnesses take the given edit.
168
169 my @lemma_set = $collation->reading_sequence( $lemma_start,
170 $lemma_end );
171 my @reading_sets = [ @lemma_set ];
172
173 # For each reading that is not rdg_0, we create the variant
174 # reading nodes, and store the range as an edit operation on
175 # the base text.
176 my $variant_objects;
177 my %pc_seen; # Keep track of mss with explicit post-corr data
178 foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
179 my @mss = grep { $app->{$_} eq $k } keys( %$app );
180
181 # Keep track of lemma nodes that don't actually appear in
182 # any MSS; we will want to remove them from the collation.
183 push( @unwitnessed_lemma_nodes, @lemma_set )
184 if !@mss && $k eq 'rdg_0';
185
186 # Keep track of what witnesses we have seen.
187 @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
188 # Keep track of which witnesses bear corrected readings here.
189 foreach my $m ( @mss ) {
190 my $base = _is_post_corr( $m );
191 next unless $base;
192 $pc_seen{$base} = 1;
193 }
194 next if $k eq 'rdg_0';
195
196 # Parse the variant into reading tokens.
197 # TODO don't hardcode the reading split operation
198 my @variant = split( /\s+/, $app->{$k} );
199 @variant = () if $app->{$k} eq '/'; # This is an omission.
200
201 my @variant_readings;
202 my $ctr = 0;
203 foreach my $vw ( @variant ) {
204 my $vwname = "$k/$line.$num.$ctr"; $ctr++;
205 my $vwreading = $collation->add_reading( $vwname );
206 $vwreading->text( $vw );
207 push( @variant_readings, $vwreading );
208 }
209
210 $variant_objects->{$k} = { 'mss' => \@mss,
211 'reading' => \@variant_readings,
212 };
213 push( @reading_sets, \@variant_readings );
214 }
215
216 # Now collate and collapse the identical readings within the
217 # collated sets. Modifies the reading sets that were passed.
218 collate_variants( $collation, @reading_sets );
219
220 # Record any stated relationships between the nodes and the lemma.
221 set_relationships( $collation, $app, \@lemma_set, $variant_objects );
222
223 # Now create the splice-edit objects that will be used
224 # to reconstruct each witness.
225
226 foreach my $rkey ( keys %$variant_objects ) {
227 # Object is argument list for splice, so:
228 # offset, length, replacements
229 my $edit_object = [ $lemma_start->name,
230 scalar( @lemma_set ),
231 $variant_objects->{$rkey}->{reading} ];
232 foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
233 # Is this a p.c. entry?
234 my $base = _is_post_corr( $ms );
235 if( $base ) { # this is a post-corr witness
236 my $pc_key = $base . "_post";
237 add_hash_entry( $edits_required, $pc_key, $edit_object );
238 } else { # this is an ante-corr witness
239 my $pc_key = $ms . "_post";
240 add_hash_entry( $edits_required, $ms, $edit_object );
241 unless( $pc_seen{$ms} ) {
242 # If this witness carries no correction, add this
243 # same object to its post-corrected state.
244 add_hash_entry( $edits_required, $pc_key,
245 $edit_object );
246 }
247 }
248 }
249 }
4ca00eca 250 } # Finished going through the apparatus entries
251
252 # Now make the witness objects, and create their text sequences
6a222840 253 foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) {
910a0a6d 254 print STDERR "Creating witness $w\n";
255 my $witness_obj = $collation->tradition->add_witness( sigil => $w );
256 my $debug; # = $w eq 'Vb11';
257 my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w}, $debug );
258 my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"}, $debug )
259 if exists( $edits_required->{$w."_post"} );
260
261 my @repeated = check_for_repeated( @ante_corr_seq );
262 warn "Repeated elements @repeated in $w a.c."
263 if @repeated;
264 @repeated = check_for_repeated( @post_corr_seq );
265 warn "Repeated elements @repeated in $w p.c."
266 if @repeated;
267
268 # Now save these paths in my witness object
269 if( @post_corr_seq ) {
270 $witness_obj->path( \@post_corr_seq );
271 $witness_obj->uncorrected_path( \@ante_corr_seq );
272 } else {
273 $witness_obj->path( \@ante_corr_seq );
274 }
b49c4318 275 }
e2902068 276
6a222840 277 # Now remove our 'base text' edges, which is to say, the only
1ed3973e 278 # ones we have created so far. Also remove any unwitnessed
279 # lemma nodes (TODO unless we are treating base as witness)
6a222840 280 foreach ( $collation->paths() ) {
910a0a6d 281 $collation->del_path( $_ );
6a222840 282 }
283 foreach( @unwitnessed_lemma_nodes ) {
910a0a6d 284 $collation->del_reading( $_ );
285 # TODO do we need to delete any relationship paths here?
6a222840 286 }
4ca00eca 287
b15511bf 288 ### HACKY HACKY Do some one-off path corrections here.
408449b7 289 require( 'data/boodts/s158.HACK' );
290 KUL::HACK::pre_path_hack( $collation );
b15511bf 291
910a0a6d 292 # Now walk paths and calculate positional rank.
7e450e44 293 $collation->make_witness_paths();
408449b7 294 KUL::HACK::post_path_hack( $collation );
910a0a6d 295 # Have to check relationship validity at this point, because before that
296 # we had no paths.
297# foreach my $rel ( $collation->relationships ) {
298# next unless $rel->equal_rank;
299# unless( Text::Tradition::Collation::relationship_valid( $rel->from, $rel->to ) ) {
300# warn sprintf( "Relationship type %s between %s and %s is invalid, deleting",
301# $rel->type, $rel->from->name, $rel->to->name );
302# }
303# }
304 $collation->calculate_ranks();
15d2d3df 305}
306
2ceca8c3 307=item B<read_base>
308
e2902068 309my @line_beginnings = read_base( 'reference.txt', $collation );
2ceca8c3 310
e2902068 311Takes a text file and a (presumed empty) collation object, adds the
312words as simple linear readings to the collation, and returns a
313list of readings that represent the beginning of lines. This collation
314is now the starting point for application of apparatus entries in
315merge_base, e.g. from a CSV file or a Classical Text Editor file.
2ceca8c3 316
317=cut
b49c4318 318
319sub read_base {
e2902068 320 my( $base_file, $collation ) = @_;
b49c4318 321
e2902068 322 # This array gives the first reading for each line. We put the
b49c4318 323 # common starting point in line zero.
e2902068 324 my $last_reading = $collation->start();
6a222840 325 $base_text_index{$last_reading->name} = 0;
e2902068 326 my $lineref_array = [ $last_reading ]; # There is no line zero.
b49c4318 327
328 open( BASE, $base_file ) or die "Could not open file $base_file: $!";
6a222840 329 my $i = 1;
b49c4318 330 while(<BASE>) {
910a0a6d 331 # Make the readings, and connect them up for the base, but
332 # also save the first reading of each line in an array for the
333 # purpose.
334 # TODO use configurable reading separator
335 chomp;
336 my @words = split;
337 my $started = 0;
338 my $wordref = 0;
339 my $lineref = scalar @$lineref_array;
340 last if $SHORTEND && $lineref > $SHORTEND;
341 foreach my $w ( @words ) {
342 my $readingref = join( ',', $lineref, ++$wordref );
343 my $reading = $collation->add_reading( $readingref );
344 $reading->text( $w );
345 unless( $started ) {
346 push( @$lineref_array, $reading );
347 $started = 1;
348 }
349 # Add edge paths in the graph, for easier tracking when
350 # we start applying corrections. These paths will be
351 # removed when we're done.
352 my $path = $collation->add_path( $last_reading, $reading,
353 $collation->baselabel );
354 $last_reading = $reading;
355
356 # Note an array index for the reading, for later correction splices.
357 $base_text_index{$readingref} = $i++;
358 }
b49c4318 359 }
360 close BASE;
361 # Ending point for all texts
910a0a6d 362 $collation->add_path( $last_reading, $collation->end, $collation->baselabel );
363 push( @$lineref_array, $collation->end );
364 $base_text_index{$collation->end->name} = $i;
b49c4318 365
366 return( @$lineref_array );
367}
368
15d2d3df 369sub set_relationships {
3265b0ce 370 my( $collation, $app, $lemma, $variants ) = @_;
15d2d3df 371 foreach my $rkey ( keys %$variants ) {
910a0a6d 372 my $var = $variants->{$rkey}->{'reading'};
373 my $type = $app->{sprintf( "_%s_type", $rkey )};
374 my $noncorr = $app->{sprintf( "_%s_non_corr", $rkey )};
375 my $nonindep = $app->{sprintf( "_%s_non_indep", $rkey )};
376
377 my %rel_options = ();
378 $rel_options{'non_correctable'} = $noncorr if $noncorr && $noncorr =~ /^\d$/;
379 $rel_options{'non_indep'} = $nonindep if $nonindep && $nonindep =~ /^\d$/;
380
381 if( $type =~ /^(inv|tr|rep)$/i ) {
382 # Transposition or repetition: look for nodes with the
383 # same label but different IDs and mark them.
384 $type = 'repetition' if $type =~ /^rep/i;
385 $rel_options{'type'} = $type;
386 $rel_options{'equal_rank'} = undef;
387 my %labels;
388 foreach my $r ( @$lemma ) {
389 $labels{cmp_str( $r )} = $r;
390 }
391 foreach my $r( @$var ) {
392 if( exists $labels{$r->label} &&
393 $r->name ne $labels{$r->label}->name ) {
394 if( $type eq 'repetition' ) {
395 # Repetition
396 $collation->add_relationship( $r, $labels{$r->label}, \%rel_options );
397 } else {
398 # Transposition
399 $r->set_identical( $labels{$r->label} );
400 }
401 }
402 }
403 } elsif( $type =~ /^(gr|sp(el)?)$/i ) {
404
405 # Grammar/spelling/lexical: this can be a one-to-one or
406 # one-to-many mapping. We should think about merging
407 # readings if it is one-to-many.
408
409 $type = 'grammatical' if $type =~ /gr/i;
410 $type = 'spelling' if $type =~ /sp/i;
411 $type = 'repetition' if $type =~ /rep/i;
412 # $type = 'lexical' if $type =~ /lex/i;
413 $rel_options{'type'} = $type;
414 $rel_options{'equal_rank'} = 1;
415 if( @$lemma == @$var ) {
416 foreach my $i ( 0 .. $#{$lemma} ) {
417 $collation->add_relationship( $var->[$i], $lemma->[$i],
418 \%rel_options );
419 }
420 } else {
421 # An uneven many-to-many mapping. Skip for now.
422 # We really want to make a segment out of whatever we have.
423 # my $lemseg = @$lemma > 1 ? $collation->add_segment( @$lemma ) : $lemma->[0];
424 # my $varseg = @$var > 1 ? $collation->add_segment( @$var ) : $var->[0];
425 # $collation->add_relationship( $varseg, $lemseg, \%rel_options );
426 if( @$lemma == 1 && @$var == 1 ) {
427 $collation->add_relationship( $lemma->[0], $var->[0], \%rel_options );
428 }
429 }
430 } elsif( $type !~ /^(add|om|lex)$/i ) {
431 warn "Unrecognized type $type";
432 }
15d2d3df 433 }
434}
910a0a6d 435
15d2d3df 436
437
4ca00eca 438sub apply_edits {
b15511bf 439 my( $collation, $edit_sequence, $debug ) = @_;
c78feb69 440 my @lemma_text = $collation->reading_sequence( $collation->start,
910a0a6d 441 $collation->reading( '#END#' ) );
4ca00eca 442 my $drift = 0;
b15511bf 443 foreach my $correction ( @$edit_sequence ) {
910a0a6d 444 my( $lemma_start, $length, $items ) = @$correction;
445 my $offset = $base_text_index{$lemma_start};
446 my $realoffset = $offset + $drift;
447 if( $debug ||
448 $lemma_text[$realoffset]->name ne $lemma_start ) {
449 my @this_phrase = @lemma_text[$realoffset..$realoffset+$length-1];
450 my @base_phrase;
451 my $i = $realoffset;
452 my $l = $collation->reading( $lemma_start );
453 while( $i < $realoffset+$length ) {
454 push( @base_phrase, $l );
455 $l = $collation->next_reading( $l );
456 $i++;
457 }
458
459 print STDERR sprintf( "Trying to replace %s (%s) starting at %d " .
460 "with %s (%s) with drift %d\n",
461 join( ' ', map {$_->label} @base_phrase ),
462 join( ' ', map {$_->name} @base_phrase ),
463 $realoffset,
464 join( ' ', map {$_->label} @$items ),
465 join( ' ', map {$_->name} @$items ),
466 $drift,
467 ) if $debug;
468
469 if( $lemma_text[$realoffset]->name ne $lemma_start ) {
470 warn( sprintf( "Should be replacing %s (%s) with %s (%s) " .
471 "but %s (%s) is there instead",
472 join( ' ', map {$_->label} @base_phrase ),
473 join( ' ', map {$_->name} @base_phrase ),
474 join( ' ', map {$_->label} @$items ),
475 join( ' ', map {$_->name} @$items ),
476 join( ' ', map {$_->label} @this_phrase ),
477 join( ' ', map {$_->name} @this_phrase ),
478 ) );
479 # next;
480 }
481 }
482 splice( @lemma_text, $realoffset, $length, @$items );
483 $drift += @$items - $length;
b49c4318 484 }
b15511bf 485 return @lemma_text;
b49c4318 486}
910a0a6d 487
4ca00eca 488
e2902068 489# Helper function. Given a witness sigil, if it is a post-correctione
490# sigil,return the base witness. If not, return a false value.
491sub _is_post_corr {
492 my( $sigil ) = @_;
930ff666 493 if( $sigil =~ /^(.*?)(\s*\(?p\.\s*c\.\)?)$/ ) {
910a0a6d 494 return $1;
e2902068 495 }
496 return undef;
497}
498
b49c4318 499
2ceca8c3 500=back
501
502=head1 LICENSE
503
504This package is free software and is provided "as is" without express
505or implied warranty. You can redistribute it and/or modify it under
506the same terms as Perl itself.
507
508=head1 AUTHOR
509
510Tara L Andrews, aurum@cpan.org
511
512=cut
513
b49c4318 5141;