UNTESTED saving work on base text parsing with new library
[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / BaseText.pm
CommitLineData
e58153d6 1package Text::Tradition::Parser::BaseText;
b49c4318 2
3use strict;
4use warnings;
52ce987f 5use Module::Load;
b49c4318 6
2ceca8c3 7=head1 NAME
8
9Text::Tradition::Parser::BaseText
10
11=head1 SYNOPSIS
12
13use Text::Tradition::Parser::BaseText qw( merge_base );
14merge_base( $graph, 'reference.txt', @apparatus_entries )
15
16=head1 DESCRIPTION
17
18For an overview of the package, see the documentation for the
19Text::Tradition::Graph module.
20
21This module is meant for use with certain of the other Parser classes
22- whenever a list of variants is given with reference to a base text,
23these must be joined into a single collation. The parser should
24therefore make a list of variants and their locations, and BaseText
25will join those listed variants onto the reference text.
26
27=head1 SUBROUTINES
28
29=over
30
52ce987f 31=item B<parse>
32
33parse( $graph, %opts );
34
35Takes an initialized graph and a set of options, which must include:
36- 'base' - the base text referenced by the variants
37- 'format' - the format of the variant list
38- 'data' - the variants, in the given format.
39
40=cut
41
42sub parse {
e2902068 43 my( $tradition, %opts ) = @_;
52ce987f 44
45 my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'};
46 load( $format_mod );
47 my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} );
e2902068 48 merge_base( $tradition->collation, $opts{'base'}, @apparatus_entries );
52ce987f 49}
50
2ceca8c3 51=item B<merge_base>
52
53merge_base( $graph, 'reference.txt', @apparatus_entries )
54
55Takes three arguments: a newly-initialized Text::Tradition::Graph
56object, a text file containing the reference text, and a list of
57variants (apparatus entries). Adds the base text to the graph, and
58joins the variants to that.
59
60The list of variants is an array of hash references; each hash takes
61the form
62 { '_id' => line reference,
63 'rdg_0' => lemma reading,
64 'rdg_1' => first variant,
65 ... # and so on until all distinct readings are listed
66 'WitnessA' => 'rdg_0',
67 'WitnessB' => 'rdg_1',
68 ... # and so on until all witnesses are listed with their readings
69 }
70
71Any hash key that is not of the form /^rdg_\d+$/ and that does not
72begin with an underscore is assumed to be a witness name. Any 'meta'
73information to be passed must be passed in a key with a leading
74underscore in its name.
75
76=cut
77
b49c4318 78sub merge_base {
e2902068 79 my( $collation, $base_file, @app_entries ) = @_;
80 my @base_line_starts = read_base( $base_file, $collation );
b49c4318 81
52ce987f 82 my %all_witnesses;
b49c4318 83 foreach my $app ( @app_entries ) {
84 my( $line, $num ) = split( /\./, $app->{_id} );
85 # DEBUG with a short graph
86 # last if $line > 2;
2ceca8c3 87 # DEBUG for problematic entries
e49731d7 88 my $scrutinize = "";
e2902068 89 my $first_line_reading = $base_line_starts[ $line ];
b49c4318 90 my $too_far = $base_line_starts[ $line+1 ];
91
92 my $lemma = $app->{rdg_0};
93 my $seq = 1;
94 # Is this the Nth occurrence of this reading in the line?
95 if( $lemma =~ s/(_)?(\d)$// ) {
96 $seq = $2;
97 }
98 my @lemma_words = split( /\s+/, $lemma );
99
100 # Now search for the lemma words within this line.
e2902068 101 my $lemma_start = $first_line_reading;
b49c4318 102 my $lemma_end;
103 my %seen;
104 while( $lemma_start ne $too_far ) {
105 # Loop detection
106 if( $seen{ $lemma_start->name() } ) {
107 warn "Detected loop at " . $lemma_start->name() .
108 ", ref $line,$num";
109 last;
110 }
111 $seen{ $lemma_start->name() } = 1;
112
113 # Try to match the lemma.
114 my $unmatch = 0;
115 print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
116 $lemma_words[0] . "...\n"
117 if "$line.$num" eq $scrutinize;
118 if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
119 # Skip it if we need a match that is not the first.
120 if( --$seq < 1 ) {
121 # Now we have to compare the rest of the words here.
122 if( scalar( @lemma_words ) > 1 ) {
e2902068 123 my $next_reading =
124 $collation->next_reading( $lemma_start );
b49c4318 125 foreach my $w ( @lemma_words[1..$#lemma_words] ) {
126 printf STDERR "Now matching %s against %s\n",
e2902068 127 cmp_str($next_reading), $w
b49c4318 128 if "$line.$num" eq $scrutinize;
e2902068 129 if( $w ne cmp_str($next_reading) ) {
b49c4318 130 $unmatch = 1;
131 last;
132 } else {
e2902068 133 $lemma_end = $next_reading;
134 $next_reading =
135 $collation->next_reading( $lemma_end );
b49c4318 136 }
137 }
138 } else {
139 $lemma_end = $lemma_start;
140 }
141 } else {
142 $unmatch = 1;
143 }
144 }
145 last unless ( $unmatch || !defined( $lemma_end ) );
146 $lemma_end = undef;
e2902068 147 $lemma_start = $collation->next_reading( $lemma_start );
b49c4318 148 }
149
150 unless( $lemma_end ) {
151 warn "No match found for @lemma_words at $line.$num";
152 next;
153 } else {
e2902068 154 # These are no longer common readings; unmark them as such.
155 my @lemma_readings = $collation->reading_sequence( $lemma_start,
b49c4318 156 $lemma_end );
e2902068 157 map { $_->set_attribute( 'class', 'lemma' ) } @lemma_readings;
b49c4318 158 }
159
e2902068 160 # Now we have our lemma readings; we add the variant readings
161 # to the collation.
b49c4318 162
e49731d7 163 # Keep track of the start and end point of each reading for later
e2902068 164 # reading collapse.
e49731d7 165 my @readings = ( $lemma_start, $lemma_end );
166
e2902068 167 # For each reading that is not rdg_0, we make a chain of readings
b49c4318 168 # and connect them to the anchor. Edges are named after the mss
169 # that are relevant.
170 foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
171 next if $k eq 'rdg_0'; # that's the lemma.
e2902068 172 # TODO look at the lemma for any p.c. readings, and add
173 # them explicitly!
b49c4318 174 my @variant = split( /\s+/, $app->{$k} );
175 @variant = () if $app->{$k} eq '/'; # This is an omission.
176 my @mss = grep { $app->{$_} eq $k } keys( %$app );
177
178 unless( @mss ) {
179 print STDERR "Skipping '@variant' at $line.$num: no mss\n";
180 next;
181 }
182
e2902068 183 # Keep track of what witnesses we have seen.
52ce987f 184 @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
b49c4318 185
e2902068 186 # Make the variant into a set of readings.
b49c4318 187 my $ctr = 0;
e2902068 188 my $last_reading = $collation->prior_reading( $lemma_start );
b49c4318 189 my $var_start;
190 foreach my $vw ( @variant ) {
191 my $vwname = "$k/$line.$num.$ctr"; $ctr++;
e2902068 192 my $vwreading = $collation->add_reading( $vwname );
193 $vwreading->text( $vw );
194 $vwreading->make_variant();
195 foreach ( @mss ) {
196 $collation->add_path( $last_reading, $vwreading, $_ );
197 }
198 $var_start = $vwreading unless $var_start;
199 $last_reading = $vwreading;
b49c4318 200 }
201 # Now hook it up at the end.
e2902068 202 foreach ( @mss ) {
203 $collation->add_path( $last_reading,
204 $collation->next_word( $lemma_end ),
205 $_ );
206 }
b49c4318 207
e49731d7 208 if( $var_start ) { # if it wasn't an empty reading
e2902068 209 push( @readings, $var_start, $last_reading );
e49731d7 210 }
b49c4318 211 }
e49731d7 212
e2902068 213 # Now collate and collapse the identical readings within the collation.
214 collate_variants( $collation, @readings );
b49c4318 215 }
216
e2902068 217 # Now make the witness objects
52ce987f 218 foreach my $w ( keys %all_witnesses ) {
e2902068 219 my $base = _is_post_corr( $w );
220 if( $base ) {
221 my $pctag = substr( $w, length( $base ) );
222 my $existing_wit = $collation->tradition->witness( $base );
223 unless( $existing_wit ) {
224 $existing_wit = $collation->tradition->add_witness( $base );
225 }
226 $existing_wit->post_correctione( $pctag );
227 } else {
228 $collation->tradition->add_witness( $w )
229 unless $collation->tradition->witness( $w );
52ce987f 230 }
b49c4318 231 }
e2902068 232
233 # Now walk paths and calculate positions.
234 my @common_readings =
235 $collation->walk_and_expand_base( $collation->reading( '#END#' ) );
236 $collation->calculate_positions( @common_readings );
b49c4318 237}
238
2ceca8c3 239=item B<read_base>
240
e2902068 241my @line_beginnings = read_base( 'reference.txt', $collation );
2ceca8c3 242
e2902068 243Takes a text file and a (presumed empty) collation object, adds the
244words as simple linear readings to the collation, and returns a
245list of readings that represent the beginning of lines. This collation
246is now the starting point for application of apparatus entries in
247merge_base, e.g. from a CSV file or a Classical Text Editor file.
2ceca8c3 248
249=cut
b49c4318 250
251sub read_base {
e2902068 252 my( $base_file, $collation ) = @_;
b49c4318 253
e2902068 254 # This array gives the first reading for each line. We put the
b49c4318 255 # common starting point in line zero.
e2902068 256 my $last_reading = $collation->start();
257 my $lineref_array = [ $last_reading ]; # There is no line zero.
b49c4318 258
259 open( BASE, $base_file ) or die "Could not open file $base_file: $!";
260 while(<BASE>) {
e2902068 261 # Make the readings, and connect them up for the base, but
262 # also save the first reading of each line in an array for the
263 # purpose.
264 # TODO use configurable reading separator
b49c4318 265 chomp;
266 my @words = split;
267 my $started = 0;
268 my $wordref = 0;
269 my $lineref = scalar @$lineref_array;
270 foreach my $w ( @words ) {
e2902068 271 my $readingref = join( ',', $lineref, ++$wordref );
272 my $reading = $collation->add_reading( $readingref );
273 $reading->text( $w );
274 $reading->make_common();
b49c4318 275 unless( $started ) {
e2902068 276 push( @$lineref_array, $reading );
b49c4318 277 $started = 1;
278 }
e2902068 279 if( $last_reading ) {
280 my $path = $collation->add_path( $last_reading, $reading,
281 "base text" );
282 $path->set_attribute( 'class', 'basetext' );
283 $last_reading = $reading;
b49c4318 284 } # TODO there should be no else here...
285 }
286 }
287 close BASE;
288 # Ending point for all texts
e2902068 289 my $endpoint = $collation->add_reading( '#END#' );
290 $collation->add_path( $last_reading, $endpoint, "base text" );
b49c4318 291 push( @$lineref_array, $endpoint );
292
293 return( @$lineref_array );
294}
295
e49731d7 296=item B<collate_variants>
2ceca8c3 297
e2902068 298collate_variants( $collation, @readings )
2ceca8c3 299
e49731d7 300Given a set of readings in the form
301( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
e2902068 302walks through each to identify those readings that are identical. The
303collation is a Text::Tradition::Collation object; the elements of
304@readings are Text::Tradition::Collation::Reading objects that appear
305on the collation graph.
b49c4318 306
2ceca8c3 307TODO: Handle collapsed and non-collapsed transpositions.
308
309=cut
b49c4318 310
e49731d7 311sub collate_variants {
e2902068 312 my( $collation, @readings ) = @_;
e49731d7 313 my $lemma_start = shift @readings;
314 my $lemma_end = shift @readings;
52ce987f 315 my $detranspose = 0;
b49c4318 316
1b9423d5 317 # We need to calculate positions at this point, which is where
318 # we are getting the implicit information from the apparatus.
319
e2902068 320 # Start the list of distinct readings with those readings in the lemma.
321 my @distinct_readings;
1b9423d5 322 my $position = 0;
b49c4318 323 while( $lemma_start ne $lemma_end ) {
e2902068 324 push( @distinct_readings, [ $lemma_start, 'base text', $position++ ] );
325 $lemma_start = $collation->next_word( $lemma_start );
b49c4318 326 }
e2902068 327 push( @distinct_readings, [ $lemma_end, 'base text', $position++ ] );
b49c4318 328
e49731d7 329
330 while( scalar @readings ) {
331 my( $var_start, $var_end ) = splice( @readings, 0, 2 );
332
e2902068 333 # I want to look at the readings in the variant and lemma, and
334 # collapse readings that are the same word. This is mini-collation.
e49731d7 335 # Each word in the 'main' list can only be collapsed once with a
336 # word from the current reading.
337 my %collapsed = ();
338
e2902068 339 # Get the label. There will only be one outgoing path to start
e49731d7 340 # with, so this is safe.
341 my @out = $var_start->outgoing();
342 my $var_label = $out[0]->label();
343
e2902068 344 my @variant_readings;
e49731d7 345 while( $var_start ne $var_end ) {
e2902068 346 push( @variant_readings, $var_start );
347 $var_start = $collation->next_word( $var_start, $var_label );
e49731d7 348 }
e2902068 349 push( @variant_readings, $var_end );
e49731d7 350
e2902068 351 # Go through the variant readings, and if we find a lemma reading that
352 # hasn't yet been collapsed with a reading, equate them. If we do
353 # not, keep them to push onto the end of all_readings.
354 my @remaining_readings;
e49731d7 355 my $last_index = 0;
1b9423d5 356 my $curr_pos = 0;
e2902068 357 foreach my $w ( @variant_readings ) {
e49731d7 358 my $word = $w->label();
359 my $matched = 0;
e2902068 360 foreach my $idx ( $last_index .. $#distinct_readings ) {
361 my( $l, $pathlabel, $pos ) = @{$distinct_readings[$idx]};
e49731d7 362 if( $word eq cmp_str( $l ) ) {
363 next if exists( $collapsed{ $l->label } )
364 && $collapsed{ $l->label } eq $l;
365 $matched = 1;
366 $last_index = $idx if $detranspose;
e2902068 367 # Collapse the readings.
368 printf STDERR "Merging readings %s/%s and %s/%s\n",
e49731d7 369 $l->name, $l->label, $w->name, $w->label;
e2902068 370 $collation->merge_readings( $l, $w );
e49731d7 371 $collapsed{ $l->label } = $l;
e2902068 372 # Now collapse any multiple paths to and from the reading.
373 remove_duplicate_paths( $collation,
374 $collation->prior_word( $l, $pathlabel ), $l );
375 remove_duplicate_paths( $collation, $l,
376 $collation->next_word( $l, $pathlabel ) );
1b9423d5 377 $curr_pos = $pos;
378 last;
e49731d7 379 }
b49c4318 380 }
e2902068 381 push( @remaining_readings, [ $w, $var_label, $curr_pos++ ] ) unless $matched;
b49c4318 382 }
e2902068 383 push( @distinct_readings, @remaining_readings) if scalar( @remaining_readings );
b49c4318 384 }
1b9423d5 385
e2902068 386 # Now set the positions of all the readings in this variation.
1b9423d5 387 #$DB::single = 1;
e2902068 388 print STDERR "Readings and their positions are:\n";
389 foreach my $n ( @distinct_readings ) {
1b9423d5 390 printf STDERR "\t%s (position %s)\n", $n->[0]->label(), $n->[2];
391 }
b49c4318 392}
393
e2902068 394=item B<remove_duplicate_paths>
2ceca8c3 395
e2902068 396remove_duplicate_paths( $collation, $from, $to );
2ceca8c3 397
e2902068 398Given two readings, reduce the number of paths between those readings to
399one. If neither path represents a base text, combine their labels.
2ceca8c3 400
401=cut
402
e2902068 403sub remove_duplicate_paths {
404 my( $collation, $from, $to ) = @_;
405 my @paths = $from->paths_to( $to );
406 if( scalar @paths > 1 ) {
407 my @base = grep { $_->label eq 'base text' } @paths;
b49c4318 408 if ( scalar @base ) {
e2902068 409 # Remove the paths that are not base.
410 foreach my $e ( @paths ) {
411 $collation->del_path( $e )
b49c4318 412 unless $e eq $base[0];
413 }
414 } else {
e2902068 415 # Combine the paths into one.
416 my $new_path_name = join( ', ', map { $_->label() } @paths );
417 my $new_path = shift @paths;
418 $new_path->set_attribute( 'label', $new_path_name );
419 foreach my $e ( @paths ) {
420 $collation->del_path( $e );
b49c4318 421 }
422 }
423 }
424}
425
e2902068 426# Helper function. Given a witness sigil, if it is a post-correctione
427# sigil,return the base witness. If not, return a false value.
428sub _is_post_corr {
429 my( $sigil ) = @_;
430 if( $sigil =~ /^(.*?)(\s*\(p\.\s*c\.\))$/ ) {
431 return $1;
432 }
433 return undef;
434}
435
2ceca8c3 436=item B<cmp_str>
437
438Pretend you never saw this method. Really it needs to not be hardcoded.
439
440=cut
441
b49c4318 442sub cmp_str {
e2902068 443 my( $reading ) = @_;
444 my $word = $reading->label();
b49c4318 445 $word = lc( $word );
446 $word =~ s/\W//g;
447 $word =~ s/v/u/g;
448 $word =~ s/j/i/g;
449 $word =~ s/cha/ca/g;
450 $word =~ s/quatuor/quattuor/g;
451 $word =~ s/ioannes/iohannes/g;
452 return $word;
453}
454
2ceca8c3 455=back
456
457=head1 LICENSE
458
459This package is free software and is provided "as is" without express
460or implied warranty. You can redistribute it and/or modify it under
461the same terms as Perl itself.
462
463=head1 AUTHOR
464
465Tara L Andrews, aurum@cpan.org
466
467=cut
468
b49c4318 4691;