add expand/collapse path edge functionality
[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / BaseText.pm
CommitLineData
e58153d6 1package Text::Tradition::Parser::BaseText;
b49c4318 2
3use strict;
4use warnings;
52ce987f 5use Module::Load;
b49c4318 6
2ceca8c3 7=head1 NAME
8
9Text::Tradition::Parser::BaseText
10
11=head1 SYNOPSIS
12
13use Text::Tradition::Parser::BaseText qw( merge_base );
14merge_base( $graph, 'reference.txt', @apparatus_entries )
15
16=head1 DESCRIPTION
17
18For an overview of the package, see the documentation for the
19Text::Tradition::Graph module.
20
21This module is meant for use with certain of the other Parser classes
22- whenever a list of variants is given with reference to a base text,
23these must be joined into a single collation. The parser should
24therefore make a list of variants and their locations, and BaseText
25will join those listed variants onto the reference text.
26
27=head1 SUBROUTINES
28
29=over
30
52ce987f 31=item B<parse>
32
33parse( $graph, %opts );
34
35Takes an initialized graph and a set of options, which must include:
36- 'base' - the base text referenced by the variants
37- 'format' - the format of the variant list
38- 'data' - the variants, in the given format.
39
40=cut
41
42sub parse {
e2902068 43 my( $tradition, %opts ) = @_;
52ce987f 44
45 my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'};
46 load( $format_mod );
47 my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} );
e2902068 48 merge_base( $tradition->collation, $opts{'base'}, @apparatus_entries );
52ce987f 49}
50
2ceca8c3 51=item B<merge_base>
52
53merge_base( $graph, 'reference.txt', @apparatus_entries )
54
55Takes three arguments: a newly-initialized Text::Tradition::Graph
56object, a text file containing the reference text, and a list of
57variants (apparatus entries). Adds the base text to the graph, and
58joins the variants to that.
59
60The list of variants is an array of hash references; each hash takes
61the form
62 { '_id' => line reference,
63 'rdg_0' => lemma reading,
64 'rdg_1' => first variant,
65 ... # and so on until all distinct readings are listed
66 'WitnessA' => 'rdg_0',
67 'WitnessB' => 'rdg_1',
68 ... # and so on until all witnesses are listed with their readings
69 }
70
71Any hash key that is not of the form /^rdg_\d+$/ and that does not
72begin with an underscore is assumed to be a witness name. Any 'meta'
73information to be passed must be passed in a key with a leading
74underscore in its name.
75
76=cut
77
1f563ac3 78 my $SHORT = 25;
930ff666 79
b49c4318 80sub merge_base {
e2902068 81 my( $collation, $base_file, @app_entries ) = @_;
82 my @base_line_starts = read_base( $base_file, $collation );
b49c4318 83
52ce987f 84 my %all_witnesses;
b49c4318 85 foreach my $app ( @app_entries ) {
86 my( $line, $num ) = split( /\./, $app->{_id} );
87 # DEBUG with a short graph
930ff666 88 last if $SHORT && $line > $SHORT;
2ceca8c3 89 # DEBUG for problematic entries
930ff666 90 my $scrutinize = "";
e2902068 91 my $first_line_reading = $base_line_starts[ $line ];
b49c4318 92 my $too_far = $base_line_starts[ $line+1 ];
93
94 my $lemma = $app->{rdg_0};
95 my $seq = 1;
96 # Is this the Nth occurrence of this reading in the line?
97 if( $lemma =~ s/(_)?(\d)$// ) {
98 $seq = $2;
99 }
100 my @lemma_words = split( /\s+/, $lemma );
101
102 # Now search for the lemma words within this line.
e2902068 103 my $lemma_start = $first_line_reading;
b49c4318 104 my $lemma_end;
105 my %seen;
106 while( $lemma_start ne $too_far ) {
107 # Loop detection
108 if( $seen{ $lemma_start->name() } ) {
109 warn "Detected loop at " . $lemma_start->name() .
110 ", ref $line,$num";
111 last;
112 }
113 $seen{ $lemma_start->name() } = 1;
114
115 # Try to match the lemma.
116 my $unmatch = 0;
117 print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
118 $lemma_words[0] . "...\n"
119 if "$line.$num" eq $scrutinize;
120 if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
121 # Skip it if we need a match that is not the first.
122 if( --$seq < 1 ) {
123 # Now we have to compare the rest of the words here.
124 if( scalar( @lemma_words ) > 1 ) {
e2902068 125 my $next_reading =
126 $collation->next_reading( $lemma_start );
b49c4318 127 foreach my $w ( @lemma_words[1..$#lemma_words] ) {
128 printf STDERR "Now matching %s against %s\n",
e2902068 129 cmp_str($next_reading), $w
b49c4318 130 if "$line.$num" eq $scrutinize;
e2902068 131 if( $w ne cmp_str($next_reading) ) {
b49c4318 132 $unmatch = 1;
133 last;
134 } else {
e2902068 135 $lemma_end = $next_reading;
136 $next_reading =
137 $collation->next_reading( $lemma_end );
b49c4318 138 }
139 }
140 } else {
141 $lemma_end = $lemma_start;
142 }
143 } else {
144 $unmatch = 1;
145 }
146 }
147 last unless ( $unmatch || !defined( $lemma_end ) );
148 $lemma_end = undef;
e2902068 149 $lemma_start = $collation->next_reading( $lemma_start );
b49c4318 150 }
151
152 unless( $lemma_end ) {
153 warn "No match found for @lemma_words at $line.$num";
154 next;
155 } else {
e2902068 156 # These are no longer common readings; unmark them as such.
157 my @lemma_readings = $collation->reading_sequence( $lemma_start,
b49c4318 158 $lemma_end );
930ff666 159 map { $_->make_variant } @lemma_readings;
b49c4318 160 }
161
e2902068 162 # Now we have our lemma readings; we add the variant readings
163 # to the collation.
b49c4318 164
e49731d7 165 # Keep track of the start and end point of each reading for later
e2902068 166 # reading collapse.
e49731d7 167 my @readings = ( $lemma_start, $lemma_end );
168
e2902068 169 # For each reading that is not rdg_0, we make a chain of readings
b49c4318 170 # and connect them to the anchor. Edges are named after the mss
171 # that are relevant.
172 foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
7854e12e 173 if( $k eq 'rdg_0' ) { # that's the lemma
174 # The lemma is already in the graph, but we need to look for
175 # any explicit post-correctione readings and add the
176 # relevant path.
177 my @mss = grep { $app->{$_} eq $k } keys( %$app );
930ff666 178 # Keep track of what witnesses we have seen.
179 @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
7854e12e 180 foreach my $m ( @mss ) {
181 my $base = _is_post_corr( $m );
182 next unless $base;
183 my @lem = $collation->reading_sequence( $lemma_start, $lemma_end );
930ff666 184 $collation->add_path( $collation->prior_reading( $lem[0] ), $lem[0], $m );
7854e12e 185 foreach my $i ( 0 .. $#lem-1 ) {
930ff666 186 $collation->add_path( $lem[$i], $lem[++$i], $m );
7854e12e 187 }
930ff666 188 $collation->add_path( $lem[-1], $collation->next_reading( $lem[-1] ), $m );
7854e12e 189 }
930ff666 190 next;
7854e12e 191 }
b49c4318 192 my @variant = split( /\s+/, $app->{$k} );
193 @variant = () if $app->{$k} eq '/'; # This is an omission.
194 my @mss = grep { $app->{$_} eq $k } keys( %$app );
195
196 unless( @mss ) {
197 print STDERR "Skipping '@variant' at $line.$num: no mss\n";
198 next;
199 }
200
e2902068 201 # Keep track of what witnesses we have seen.
52ce987f 202 @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
b49c4318 203
e2902068 204 # Make the variant into a set of readings.
b49c4318 205 my $ctr = 0;
e2902068 206 my $last_reading = $collation->prior_reading( $lemma_start );
b49c4318 207 my $var_start;
208 foreach my $vw ( @variant ) {
209 my $vwname = "$k/$line.$num.$ctr"; $ctr++;
e2902068 210 my $vwreading = $collation->add_reading( $vwname );
211 $vwreading->text( $vw );
212 $vwreading->make_variant();
213 foreach ( @mss ) {
214 $collation->add_path( $last_reading, $vwreading, $_ );
215 }
216 $var_start = $vwreading unless $var_start;
217 $last_reading = $vwreading;
b49c4318 218 }
219 # Now hook it up at the end.
e2902068 220 foreach ( @mss ) {
221 $collation->add_path( $last_reading,
7854e12e 222 $collation->next_reading( $lemma_end ),
e2902068 223 $_ );
224 }
b49c4318 225
e49731d7 226 if( $var_start ) { # if it wasn't an empty reading
e2902068 227 push( @readings, $var_start, $last_reading );
e49731d7 228 }
b49c4318 229 }
e49731d7 230
e2902068 231 # Now collate and collapse the identical readings within the collation.
232 collate_variants( $collation, @readings );
b49c4318 233 }
234
e2902068 235 # Now make the witness objects
52ce987f 236 foreach my $w ( keys %all_witnesses ) {
e2902068 237 my $base = _is_post_corr( $w );
238 if( $base ) {
239 my $pctag = substr( $w, length( $base ) );
240 my $existing_wit = $collation->tradition->witness( $base );
241 unless( $existing_wit ) {
7854e12e 242 $existing_wit = $collation->tradition->add_witness( sigil => $base );
e2902068 243 }
244 $existing_wit->post_correctione( $pctag );
245 } else {
7854e12e 246 $collation->tradition->add_witness( sigil => $w )
e2902068 247 unless $collation->tradition->witness( $w );
52ce987f 248 }
b49c4318 249 }
e2902068 250
251 # Now walk paths and calculate positions.
252 my @common_readings =
1f563ac3 253 $collation->walk_and_expand_base( $collation->reading( '#END#' ) );
e2902068 254 $collation->calculate_positions( @common_readings );
b49c4318 255}
256
2ceca8c3 257=item B<read_base>
258
e2902068 259my @line_beginnings = read_base( 'reference.txt', $collation );
2ceca8c3 260
e2902068 261Takes a text file and a (presumed empty) collation object, adds the
262words as simple linear readings to the collation, and returns a
263list of readings that represent the beginning of lines. This collation
264is now the starting point for application of apparatus entries in
265merge_base, e.g. from a CSV file or a Classical Text Editor file.
2ceca8c3 266
267=cut
b49c4318 268
269sub read_base {
e2902068 270 my( $base_file, $collation ) = @_;
b49c4318 271
e2902068 272 # This array gives the first reading for each line. We put the
b49c4318 273 # common starting point in line zero.
e2902068 274 my $last_reading = $collation->start();
275 my $lineref_array = [ $last_reading ]; # There is no line zero.
b49c4318 276
277 open( BASE, $base_file ) or die "Could not open file $base_file: $!";
278 while(<BASE>) {
e2902068 279 # Make the readings, and connect them up for the base, but
280 # also save the first reading of each line in an array for the
281 # purpose.
282 # TODO use configurable reading separator
b49c4318 283 chomp;
284 my @words = split;
285 my $started = 0;
286 my $wordref = 0;
287 my $lineref = scalar @$lineref_array;
930ff666 288 last if $SHORT && $lineref > $SHORT;
b49c4318 289 foreach my $w ( @words ) {
e2902068 290 my $readingref = join( ',', $lineref, ++$wordref );
291 my $reading = $collation->add_reading( $readingref );
292 $reading->text( $w );
293 $reading->make_common();
b49c4318 294 unless( $started ) {
e2902068 295 push( @$lineref_array, $reading );
b49c4318 296 $started = 1;
297 }
e2902068 298 if( $last_reading ) {
299 my $path = $collation->add_path( $last_reading, $reading,
930ff666 300 $collation->baselabel );
e2902068 301 $path->set_attribute( 'class', 'basetext' );
302 $last_reading = $reading;
b49c4318 303 } # TODO there should be no else here...
304 }
305 }
306 close BASE;
307 # Ending point for all texts
e2902068 308 my $endpoint = $collation->add_reading( '#END#' );
930ff666 309 $collation->add_path( $last_reading, $endpoint, $collation->baselabel );
b49c4318 310 push( @$lineref_array, $endpoint );
311
312 return( @$lineref_array );
313}
314
e49731d7 315=item B<collate_variants>
2ceca8c3 316
e2902068 317collate_variants( $collation, @readings )
2ceca8c3 318
e49731d7 319Given a set of readings in the form
320( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
e2902068 321walks through each to identify those readings that are identical. The
322collation is a Text::Tradition::Collation object; the elements of
323@readings are Text::Tradition::Collation::Reading objects that appear
324on the collation graph.
b49c4318 325
2ceca8c3 326TODO: Handle collapsed and non-collapsed transpositions.
327
328=cut
b49c4318 329
e49731d7 330sub collate_variants {
e2902068 331 my( $collation, @readings ) = @_;
e49731d7 332 my $lemma_start = shift @readings;
333 my $lemma_end = shift @readings;
7854e12e 334 my $detranspose = 1;
1b9423d5 335
e2902068 336 # Start the list of distinct readings with those readings in the lemma.
337 my @distinct_readings;
b49c4318 338 while( $lemma_start ne $lemma_end ) {
930ff666 339 push( @distinct_readings, [ $lemma_start, $collation->baselabel ] );
7854e12e 340 $lemma_start = $collation->next_reading( $lemma_start );
b49c4318 341 }
930ff666 342 push( @distinct_readings, [ $lemma_end, $collation->baselabel ] );
b49c4318 343
e49731d7 344
345 while( scalar @readings ) {
346 my( $var_start, $var_end ) = splice( @readings, 0, 2 );
347
e2902068 348 # I want to look at the readings in the variant and lemma, and
349 # collapse readings that are the same word. This is mini-collation.
e49731d7 350 # Each word in the 'main' list can only be collapsed once with a
351 # word from the current reading.
352 my %collapsed = ();
353
7854e12e 354 # Get the variant witnesses. They will all be going along the
355 # same path, so just use the first one as representative for
356 # the purpose of following the path.
357 my @var_wits = map { $_->label } $var_start->outgoing();
358 my $rep_wit = $var_wits[0];
e49731d7 359
e2902068 360 my @variant_readings;
e49731d7 361 while( $var_start ne $var_end ) {
e2902068 362 push( @variant_readings, $var_start );
7854e12e 363 $var_start = $collation->next_reading( $var_start, $rep_wit );
e49731d7 364 }
e2902068 365 push( @variant_readings, $var_end );
e49731d7 366
e2902068 367 # Go through the variant readings, and if we find a lemma reading that
368 # hasn't yet been collapsed with a reading, equate them. If we do
369 # not, keep them to push onto the end of all_readings.
7854e12e 370 # TODO replace this with proper mini-collation
e2902068 371 my @remaining_readings;
e49731d7 372 my $last_index = 0;
1b9423d5 373 my $curr_pos = 0;
e2902068 374 foreach my $w ( @variant_readings ) {
e49731d7 375 my $word = $w->label();
376 my $matched = 0;
e2902068 377 foreach my $idx ( $last_index .. $#distinct_readings ) {
7854e12e 378 my( $l, $pathlabel ) = @{$distinct_readings[$idx]};
e49731d7 379 if( $word eq cmp_str( $l ) ) {
380 next if exists( $collapsed{ $l->label } )
381 && $collapsed{ $l->label } eq $l;
382 $matched = 1;
383 $last_index = $idx if $detranspose;
e2902068 384 # Collapse the readings.
385 printf STDERR "Merging readings %s/%s and %s/%s\n",
e49731d7 386 $l->name, $l->label, $w->name, $w->label;
e2902068 387 $collation->merge_readings( $l, $w );
e49731d7 388 $collapsed{ $l->label } = $l;
e2902068 389 # Now collapse any multiple paths to and from the reading.
390 remove_duplicate_paths( $collation,
7854e12e 391 $collation->prior_reading( $l, $rep_wit ), $l );
e2902068 392 remove_duplicate_paths( $collation, $l,
7854e12e 393 $collation->next_reading( $l, $rep_wit ) );
1b9423d5 394 last;
e49731d7 395 }
b49c4318 396 }
7854e12e 397 push( @remaining_readings, [ $w, $rep_wit ] ) unless $matched;
b49c4318 398 }
e2902068 399 push( @distinct_readings, @remaining_readings) if scalar( @remaining_readings );
b49c4318 400 }
401}
402
e2902068 403=item B<remove_duplicate_paths>
2ceca8c3 404
e2902068 405remove_duplicate_paths( $collation, $from, $to );
2ceca8c3 406
e2902068 407Given two readings, reduce the number of paths between those readings to
7854e12e 408a set of unique paths.
2ceca8c3 409
410=cut
411
7854e12e 412# TODO wonder if this is necessary
e2902068 413sub remove_duplicate_paths {
414 my( $collation, $from, $to ) = @_;
7854e12e 415 my %seen_paths;
416 foreach my $p ( $from->edges_to( $to ) ) {
417 if( exists $seen_paths{$p->name} ) {
418 $collation->del_path( $p );
b49c4318 419 } else {
7854e12e 420 $seen_paths{$p->name} = 1;
b49c4318 421 }
422 }
423}
424
e2902068 425# Helper function. Given a witness sigil, if it is a post-correctione
426# sigil,return the base witness. If not, return a false value.
427sub _is_post_corr {
428 my( $sigil ) = @_;
930ff666 429 if( $sigil =~ /^(.*?)(\s*\(?p\.\s*c\.\)?)$/ ) {
e2902068 430 return $1;
431 }
432 return undef;
433}
434
2ceca8c3 435=item B<cmp_str>
436
437Pretend you never saw this method. Really it needs to not be hardcoded.
438
439=cut
440
b49c4318 441sub cmp_str {
e2902068 442 my( $reading ) = @_;
443 my $word = $reading->label();
b49c4318 444 $word = lc( $word );
445 $word =~ s/\W//g;
446 $word =~ s/v/u/g;
447 $word =~ s/j/i/g;
448 $word =~ s/cha/ca/g;
449 $word =~ s/quatuor/quattuor/g;
450 $word =~ s/ioannes/iohannes/g;
451 return $word;
452}
453
2ceca8c3 454=back
455
456=head1 LICENSE
457
458This package is free software and is provided "as is" without express
459or implied warranty. You can redistribute it and/or modify it under
460the same terms as Perl itself.
461
462=head1 AUTHOR
463
464Tara L Andrews, aurum@cpan.org
465
466=cut
467
b49c4318 4681;