XML parsers should accept already-parsed XML object too
[scpubgit/stemmatology.git] / base / lib / Text / Tradition / Parser / BaseText.pm
CommitLineData
e58153d6 1package Text::Tradition::Parser::BaseText;
b49c4318 2
3use strict;
4use warnings;
52ce987f 5use Module::Load;
63778331 6use TryCatch;
7use Text::Tradition::Parser::Util qw( collate_variants cmp_str
8 check_for_repeated add_hash_entry );
b49c4318 9
2ceca8c3 10=head1 NAME
11
12Text::Tradition::Parser::BaseText
13
14=head1 SYNOPSIS
15
16use Text::Tradition::Parser::BaseText qw( merge_base );
17merge_base( $graph, 'reference.txt', @apparatus_entries )
18
19=head1 DESCRIPTION
20
21For an overview of the package, see the documentation for the
408449b7 22Text::Tradition module.
2ceca8c3 23
24This module is meant for use with certain of the other Parser classes
25- whenever a list of variants is given with reference to a base text,
26these must be joined into a single collation. The parser should
27therefore make a list of variants and their locations, and BaseText
28will join those listed variants onto the reference text.
29
30=head1 SUBROUTINES
31
32=over
33
52ce987f 34=item B<parse>
35
408449b7 36parse( $graph, $opts );
52ce987f 37
408449b7 38Takes an initialized graph and a hashref of options, which must include:
52ce987f 39- 'base' - the base text referenced by the variants
40- 'format' - the format of the variant list
41- 'data' - the variants, in the given format.
42
43=cut
44
45sub parse {
dfc37e38 46 my( $tradition, $opts ) = @_;
52ce987f 47
dfc37e38 48 my $format_mod = 'Text::Tradition::Parser::' . $opts->{'input'};
52ce987f 49 load( $format_mod );
408449b7 50 # TODO Handle a string someday if we ever have a format other than KUL
51 my @apparatus_entries = $format_mod->can('read')->( $opts );
b0b4421a 52 merge_base( $tradition->collation, $opts, @apparatus_entries );
52ce987f 53}
54
2ceca8c3 55=item B<merge_base>
56
57merge_base( $graph, 'reference.txt', @apparatus_entries )
58
59Takes three arguments: a newly-initialized Text::Tradition::Graph
60object, a text file containing the reference text, and a list of
61variants (apparatus entries). Adds the base text to the graph, and
62joins the variants to that.
63
64The list of variants is an array of hash references; each hash takes
65the form
66 { '_id' => line reference,
67 'rdg_0' => lemma reading,
68 'rdg_1' => first variant,
69 ... # and so on until all distinct readings are listed
70 'WitnessA' => 'rdg_0',
71 'WitnessB' => 'rdg_1',
72 ... # and so on until all witnesses are listed with their readings
73 }
74
75Any hash key that is not of the form /^rdg_\d+$/ and that does not
76begin with an underscore is assumed to be a witness name. Any 'meta'
77information to be passed must be passed in a key with a leading
78underscore in its name.
79
80=cut
81
b15511bf 82my $SHORTEND = ''; # Debug var - set this to limit the number of lines parsed
4ca00eca 83
84my %base_text_index;
6a222840 85my $edits_required = {};
4ca00eca 86
87# edits_required -> wit -> [ { start_idx, end_idx, items } ]
930ff666 88
b49c4318 89sub merge_base {
b0b4421a 90 my( $collation, $opts, @app_entries ) = @_;
91 my @base_line_starts = read_base( $opts->{'base'}, $collation );
b49c4318 92
52ce987f 93 my %all_witnesses;
b49c4318 94 foreach my $app ( @app_entries ) {
910a0a6d 95 my( $line, $num ) = split( /\./, $app->{_id} );
96 # DEBUG with a short graph
97 last if $SHORTEND && $line > $SHORTEND;
98 # DEBUG for problematic entries
99 my $scrutinize = '';
100 my $first_line_reading = $base_line_starts[ $line ];
101 my $too_far = $base_line_starts[ $line+1 ];
102
103 my $lemma = $app->{rdg_0};
104 my $seq = 1;
105 # Is this the Nth occurrence of this reading in the line?
106 if( $lemma =~ s/(_)?(\d)$// ) {
107 $seq = $2;
108 }
109 my @lemma_words = split( /\s+/, $lemma );
110
111 # Now search for the lemma words within this line.
112 my $lemma_start = $first_line_reading;
113 my $lemma_end;
114 my %seen;
115 while( $lemma_start ne $too_far ) {
116 # Loop detection
49d4f2ac 117 if( $seen{ $lemma_start->id() } ) {
118 warn "Detected loop at " . $lemma_start->id() .
910a0a6d 119 ", ref $line,$num";
120 last;
121 }
49d4f2ac 122 $seen{ $lemma_start->id() } = 1;
910a0a6d 123
124 # Try to match the lemma.
125 my $unmatch = 0;
126 print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
127 $lemma_words[0] . "...\n"
128 if "$line.$num" eq $scrutinize;
129 if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
130 # Skip it if we need a match that is not the first.
131 if( --$seq < 1 ) {
132 # Now we have to compare the rest of the words here.
133 if( scalar( @lemma_words ) > 1 ) {
134 my $next_reading =
135 $collation->next_reading( $lemma_start );
136 foreach my $w ( @lemma_words[1..$#lemma_words] ) {
137 printf STDERR "Now matching %s against %s\n",
138 cmp_str($next_reading), $w
139 if "$line.$num" eq $scrutinize;
140 if( $w ne cmp_str($next_reading) ) {
141 $unmatch = 1;
142 last;
143 } else {
144 $lemma_end = $next_reading;
145 $next_reading =
146 $collation->next_reading( $lemma_end );
147 }
148 }
149 } else {
150 $lemma_end = $lemma_start;
151 }
152 } else {
153 $unmatch = 1;
154 }
155 }
156 last unless ( $unmatch || !defined( $lemma_end ) );
157 $lemma_end = undef;
158 $lemma_start = $collation->next_reading( $lemma_start );
159 }
160
161 unless( $lemma_end ) {
162 warn "No match found for @lemma_words at $line.$num";
163 next;
164 }
165
166 # Now we have found the lemma; we will record an 'edit', in
167 # terms of a splice operation, for each subsequent reading.
168 # We also note which witnesses take the given edit.
169
170 my @lemma_set = $collation->reading_sequence( $lemma_start,
171 $lemma_end );
172 my @reading_sets = [ @lemma_set ];
b0b4421a 173
910a0a6d 174 # For each reading that is not rdg_0, we create the variant
175 # reading nodes, and store the range as an edit operation on
176 # the base text.
177 my $variant_objects;
178 my %pc_seen; # Keep track of mss with explicit post-corr data
179 foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
180 my @mss = grep { $app->{$_} eq $k } keys( %$app );
181
910a0a6d 182 # Keep track of what witnesses we have seen.
183 @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
184 # Keep track of which witnesses bear corrected readings here.
185 foreach my $m ( @mss ) {
186 my $base = _is_post_corr( $m );
187 next unless $base;
188 $pc_seen{$base} = 1;
189 }
190 next if $k eq 'rdg_0';
191
192 # Parse the variant into reading tokens.
193 # TODO don't hardcode the reading split operation
194 my @variant = split( /\s+/, $app->{$k} );
195 @variant = () if $app->{$k} eq '/'; # This is an omission.
196
197 my @variant_readings;
198 my $ctr = 0;
199 foreach my $vw ( @variant ) {
200 my $vwname = "$k/$line.$num.$ctr"; $ctr++;
49d4f2ac 201 my $vwreading = $collation->add_reading( {
202 'id' => $vwname,
203 'text' => $vw } );
910a0a6d 204 push( @variant_readings, $vwreading );
205 }
206
207 $variant_objects->{$k} = { 'mss' => \@mss,
208 'reading' => \@variant_readings,
209 };
210 push( @reading_sets, \@variant_readings );
211 }
212
213 # Now collate and collapse the identical readings within the
214 # collated sets. Modifies the reading sets that were passed.
215 collate_variants( $collation, @reading_sets );
216
217 # Record any stated relationships between the nodes and the lemma.
218 set_relationships( $collation, $app, \@lemma_set, $variant_objects );
219
220 # Now create the splice-edit objects that will be used
221 # to reconstruct each witness.
222
223 foreach my $rkey ( keys %$variant_objects ) {
224 # Object is argument list for splice, so:
225 # offset, length, replacements
49d4f2ac 226 my $edit_object = [ $lemma_start->id,
910a0a6d 227 scalar( @lemma_set ),
228 $variant_objects->{$rkey}->{reading} ];
229 foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
230 # Is this a p.c. entry?
231 my $base = _is_post_corr( $ms );
232 if( $base ) { # this is a post-corr witness
233 my $pc_key = $base . "_post";
234 add_hash_entry( $edits_required, $pc_key, $edit_object );
235 } else { # this is an ante-corr witness
236 my $pc_key = $ms . "_post";
237 add_hash_entry( $edits_required, $ms, $edit_object );
238 unless( $pc_seen{$ms} ) {
239 # If this witness carries no correction, add this
240 # same object to its post-corrected state.
241 add_hash_entry( $edits_required, $pc_key,
242 $edit_object );
243 }
244 }
245 }
246 }
4ca00eca 247 } # Finished going through the apparatus entries
248
249 # Now make the witness objects, and create their text sequences
6a222840 250 foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) {
910a0a6d 251 print STDERR "Creating witness $w\n";
82fa4d57 252 my $witness_obj = $collation->tradition->add_witness(
253 sigil => $w, sourcetype => 'collation' );
910a0a6d 254 my $debug; # = $w eq 'Vb11';
255 my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w}, $debug );
256 my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"}, $debug )
257 if exists( $edits_required->{$w."_post"} );
258
259 my @repeated = check_for_repeated( @ante_corr_seq );
260 warn "Repeated elements @repeated in $w a.c."
261 if @repeated;
262 @repeated = check_for_repeated( @post_corr_seq );
263 warn "Repeated elements @repeated in $w p.c."
264 if @repeated;
265
266 # Now save these paths in my witness object
267 if( @post_corr_seq ) {
268 $witness_obj->path( \@post_corr_seq );
269 $witness_obj->uncorrected_path( \@ante_corr_seq );
270 } else {
271 $witness_obj->path( \@ante_corr_seq );
272 }
b49c4318 273 }
e2902068 274
6a222840 275 # Now remove our 'base text' edges, which is to say, the only
1ed3973e 276 # ones we have created so far. Also remove any unwitnessed
277 # lemma nodes (TODO unless we are treating base as witness)
6a222840 278 foreach ( $collation->paths() ) {
49d4f2ac 279 $collation->del_path( $_, $collation->baselabel );
6a222840 280 }
4ca00eca 281
b15511bf 282 ### HACKY HACKY Do some one-off path corrections here.
b0b4421a 283 if( $opts->{'input'} eq 'KUL' ) {
284 require 'data/boodts/s158.HACK';
285 KUL::HACK::pre_path_hack( $collation );
286 }
287
910a0a6d 288 # Now walk paths and calculate positional rank.
7e450e44 289 $collation->make_witness_paths();
49d4f2ac 290 # Now delete any orphaned readings.
291 foreach my $r ( $collation->sequence->isolated_vertices ) {
292 print STDERR "Deleting unconnected reading $r / " .
293 $collation->reading( $r )->text . "\n";
294 $collation->del_reading( $r );
295 }
296
b0b4421a 297 KUL::HACK::post_path_hack( $collation ) if $opts->{'input'} eq 'KUL';
910a0a6d 298 # Have to check relationship validity at this point, because before that
299 # we had no paths.
300# foreach my $rel ( $collation->relationships ) {
301# next unless $rel->equal_rank;
302# unless( Text::Tradition::Collation::relationship_valid( $rel->from, $rel->to ) ) {
303# warn sprintf( "Relationship type %s between %s and %s is invalid, deleting",
49d4f2ac 304# $rel->type, $rel->from->id, $rel->to->id );
910a0a6d 305# }
306# }
82a45078 307 unless( $opts->{'nocalc'} ) {
308 $collation->calculate_common_readings(); # will implicitly rank
309 }
15d2d3df 310}
311
2ceca8c3 312=item B<read_base>
313
e2902068 314my @line_beginnings = read_base( 'reference.txt', $collation );
2ceca8c3 315
e2902068 316Takes a text file and a (presumed empty) collation object, adds the
317words as simple linear readings to the collation, and returns a
318list of readings that represent the beginning of lines. This collation
319is now the starting point for application of apparatus entries in
320merge_base, e.g. from a CSV file or a Classical Text Editor file.
2ceca8c3 321
322=cut
b49c4318 323
324sub read_base {
e2902068 325 my( $base_file, $collation ) = @_;
b49c4318 326
e2902068 327 # This array gives the first reading for each line. We put the
b49c4318 328 # common starting point in line zero.
49d4f2ac 329 my $last_reading = $collation->start;
330 $base_text_index{$last_reading->id} = 0;
e2902068 331 my $lineref_array = [ $last_reading ]; # There is no line zero.
b49c4318 332
333 open( BASE, $base_file ) or die "Could not open file $base_file: $!";
6a222840 334 my $i = 1;
b49c4318 335 while(<BASE>) {
910a0a6d 336 # Make the readings, and connect them up for the base, but
337 # also save the first reading of each line in an array for the
338 # purpose.
339 # TODO use configurable reading separator
340 chomp;
341 my @words = split;
342 my $started = 0;
343 my $wordref = 0;
344 my $lineref = scalar @$lineref_array;
345 last if $SHORTEND && $lineref > $SHORTEND;
346 foreach my $w ( @words ) {
347 my $readingref = join( ',', $lineref, ++$wordref );
49d4f2ac 348 my $reading = $collation->add_reading( { id => $readingref, text => $w } );
910a0a6d 349 unless( $started ) {
350 push( @$lineref_array, $reading );
351 $started = 1;
352 }
353 # Add edge paths in the graph, for easier tracking when
354 # we start applying corrections. These paths will be
355 # removed when we're done.
356 my $path = $collation->add_path( $last_reading, $reading,
357 $collation->baselabel );
358 $last_reading = $reading;
359
360 # Note an array index for the reading, for later correction splices.
361 $base_text_index{$readingref} = $i++;
362 }
b49c4318 363 }
364 close BASE;
365 # Ending point for all texts
910a0a6d 366 $collation->add_path( $last_reading, $collation->end, $collation->baselabel );
367 push( @$lineref_array, $collation->end );
49d4f2ac 368 $base_text_index{$collation->end->id} = $i;
b49c4318 369
370 return( @$lineref_array );
371}
372
15d2d3df 373sub set_relationships {
3265b0ce 374 my( $collation, $app, $lemma, $variants ) = @_;
15d2d3df 375 foreach my $rkey ( keys %$variants ) {
910a0a6d 376 my $var = $variants->{$rkey}->{'reading'};
377 my $type = $app->{sprintf( "_%s_type", $rkey )};
378 my $noncorr = $app->{sprintf( "_%s_non_corr", $rkey )};
379 my $nonindep = $app->{sprintf( "_%s_non_indep", $rkey )};
380
381 my %rel_options = ();
382 $rel_options{'non_correctable'} = $noncorr if $noncorr && $noncorr =~ /^\d$/;
383 $rel_options{'non_indep'} = $nonindep if $nonindep && $nonindep =~ /^\d$/;
384
385 if( $type =~ /^(inv|tr|rep)$/i ) {
386 # Transposition or repetition: look for nodes with the
387 # same label but different IDs and mark them.
388 $type = 'repetition' if $type =~ /^rep/i;
389 $rel_options{'type'} = $type;
390 $rel_options{'equal_rank'} = undef;
391 my %labels;
392 foreach my $r ( @$lemma ) {
393 $labels{cmp_str( $r )} = $r;
394 }
395 foreach my $r( @$var ) {
49d4f2ac 396 if( exists $labels{$r->text} &&
397 $r->id ne $labels{$r->text}->id ) {
910a0a6d 398 if( $type eq 'repetition' ) {
399 # Repetition
63778331 400 try {
401 $collation->add_relationship( $r, $labels{$r->text}, \%rel_options );
402 } catch( Text::Tradition::Error $e ) {
403 warn "Could not set repetition relationship $r -> "
404 . $labels{$r->text} . ": " . $e->message;
405 }
910a0a6d 406 } else {
407 # Transposition
63778331 408 try {
409 $r->set_identical( $labels{$r->text} );
410 } catch( Text::Tradition::Error $e ) {
411 warn "Could not set transposition relationship $r -> "
412 . $labels{$r->text} . ": " . $e->message;
413 }
910a0a6d 414 }
415 }
416 }
417 } elsif( $type =~ /^(gr|sp(el)?)$/i ) {
418
419 # Grammar/spelling/lexical: this can be a one-to-one or
420 # one-to-many mapping. We should think about merging
421 # readings if it is one-to-many.
422
423 $type = 'grammatical' if $type =~ /gr/i;
424 $type = 'spelling' if $type =~ /sp/i;
425 $type = 'repetition' if $type =~ /rep/i;
426 # $type = 'lexical' if $type =~ /lex/i;
427 $rel_options{'type'} = $type;
428 $rel_options{'equal_rank'} = 1;
429 if( @$lemma == @$var ) {
430 foreach my $i ( 0 .. $#{$lemma} ) {
63778331 431 try {
432 $collation->add_relationship( $var->[$i], $lemma->[$i],
433 \%rel_options );
434 } catch( Text::Tradition::Error $e ) {
435 warn "Could not set $type relationship " . $var->[$i] . " -> "
436 . $lemma->[$i] . ": " . $e->message;
437 }
910a0a6d 438 }
439 } else {
440 # An uneven many-to-many mapping. Skip for now.
441 # We really want to make a segment out of whatever we have.
442 # my $lemseg = @$lemma > 1 ? $collation->add_segment( @$lemma ) : $lemma->[0];
443 # my $varseg = @$var > 1 ? $collation->add_segment( @$var ) : $var->[0];
444 # $collation->add_relationship( $varseg, $lemseg, \%rel_options );
63778331 445 # if( @$lemma == 1 && @$var == 1 ) {
446 # $collation->add_relationship( $lemma->[0], $var->[0], \%rel_options );
447 # }
910a0a6d 448 }
449 } elsif( $type !~ /^(add|om|lex)$/i ) {
450 warn "Unrecognized type $type";
451 }
15d2d3df 452 }
453}
910a0a6d 454
15d2d3df 455
456
4ca00eca 457sub apply_edits {
b15511bf 458 my( $collation, $edit_sequence, $debug ) = @_;
49d4f2ac 459 my @lemma_text = $collation->reading_sequence(
460 $collation->start, $collation->end );
4ca00eca 461 my $drift = 0;
b15511bf 462 foreach my $correction ( @$edit_sequence ) {
910a0a6d 463 my( $lemma_start, $length, $items ) = @$correction;
464 my $offset = $base_text_index{$lemma_start};
465 my $realoffset = $offset + $drift;
466 if( $debug ||
49d4f2ac 467 $lemma_text[$realoffset]->id ne $lemma_start ) {
910a0a6d 468 my @this_phrase = @lemma_text[$realoffset..$realoffset+$length-1];
469 my @base_phrase;
470 my $i = $realoffset;
471 my $l = $collation->reading( $lemma_start );
472 while( $i < $realoffset+$length ) {
473 push( @base_phrase, $l );
474 $l = $collation->next_reading( $l );
475 $i++;
476 }
477
478 print STDERR sprintf( "Trying to replace %s (%s) starting at %d " .
479 "with %s (%s) with drift %d\n",
49d4f2ac 480 join( ' ', map {$_->text} @base_phrase ),
481 join( ' ', map {$_->id} @base_phrase ),
910a0a6d 482 $realoffset,
49d4f2ac 483 join( ' ', map {$_->text} @$items ),
484 join( ' ', map {$_->id} @$items ),
910a0a6d 485 $drift,
486 ) if $debug;
487
49d4f2ac 488 if( $lemma_text[$realoffset]->id ne $lemma_start ) {
910a0a6d 489 warn( sprintf( "Should be replacing %s (%s) with %s (%s) " .
490 "but %s (%s) is there instead",
49d4f2ac 491 join( ' ', map {$_->text} @base_phrase ),
492 join( ' ', map {$_->id} @base_phrase ),
493 join( ' ', map {$_->text} @$items ),
494 join( ' ', map {$_->id} @$items ),
495 join( ' ', map {$_->text} @this_phrase ),
496 join( ' ', map {$_->id} @this_phrase ),
910a0a6d 497 ) );
498 # next;
499 }
500 }
501 splice( @lemma_text, $realoffset, $length, @$items );
502 $drift += @$items - $length;
b49c4318 503 }
b15511bf 504 return @lemma_text;
b49c4318 505}
910a0a6d 506
4ca00eca 507
e2902068 508# Helper function. Given a witness sigil, if it is a post-correctione
509# sigil,return the base witness. If not, return a false value.
510sub _is_post_corr {
511 my( $sigil ) = @_;
930ff666 512 if( $sigil =~ /^(.*?)(\s*\(?p\.\s*c\.\)?)$/ ) {
910a0a6d 513 return $1;
e2902068 514 }
515 return undef;
516}
517
b49c4318 518
2ceca8c3 519=back
520
521=head1 LICENSE
522
523This package is free software and is provided "as is" without express
524or implied warranty. You can redistribute it and/or modify it under
525the same terms as Perl itself.
526
527=head1 AUTHOR
528
529Tara L Andrews, aurum@cpan.org
530
531=cut
532
b49c4318 5331;