remove old files, no longer used
[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / BaseText.pm
CommitLineData
e58153d6 1package Text::Tradition::Parser::BaseText;
b49c4318 2
3use strict;
4use warnings;
52ce987f 5use Module::Load;
b49c4318 6
2ceca8c3 7=head1 NAME
8
9Text::Tradition::Parser::BaseText
10
11=head1 SYNOPSIS
12
13use Text::Tradition::Parser::BaseText qw( merge_base );
14merge_base( $graph, 'reference.txt', @apparatus_entries )
15
16=head1 DESCRIPTION
17
18For an overview of the package, see the documentation for the
19Text::Tradition::Graph module.
20
21This module is meant for use with certain of the other Parser classes
22- whenever a list of variants is given with reference to a base text,
23these must be joined into a single collation. The parser should
24therefore make a list of variants and their locations, and BaseText
25will join those listed variants onto the reference text.
26
27=head1 SUBROUTINES
28
29=over
30
52ce987f 31=item B<parse>
32
33parse( $graph, %opts );
34
35Takes an initialized graph and a set of options, which must include:
36- 'base' - the base text referenced by the variants
37- 'format' - the format of the variant list
38- 'data' - the variants, in the given format.
39
40=cut
41
42sub parse {
43 my( $graph, %opts ) = @_;
44
45 my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'};
46 load( $format_mod );
47 my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} );
48 merge_base( $graph, $opts{'base'}, @apparatus_entries );
49}
50
2ceca8c3 51=item B<merge_base>
52
53merge_base( $graph, 'reference.txt', @apparatus_entries )
54
55Takes three arguments: a newly-initialized Text::Tradition::Graph
56object, a text file containing the reference text, and a list of
57variants (apparatus entries). Adds the base text to the graph, and
58joins the variants to that.
59
60The list of variants is an array of hash references; each hash takes
61the form
62 { '_id' => line reference,
63 'rdg_0' => lemma reading,
64 'rdg_1' => first variant,
65 ... # and so on until all distinct readings are listed
66 'WitnessA' => 'rdg_0',
67 'WitnessB' => 'rdg_1',
68 ... # and so on until all witnesses are listed with their readings
69 }
70
71Any hash key that is not of the form /^rdg_\d+$/ and that does not
72begin with an underscore is assumed to be a witness name. Any 'meta'
73information to be passed must be passed in a key with a leading
74underscore in its name.
75
76=cut
77
b49c4318 78sub merge_base {
79 my( $graph, $base_file, @app_entries ) = @_;
80 my @base_line_starts = read_base( $base_file, $graph );
81
52ce987f 82 my %all_witnesses;
b49c4318 83 foreach my $app ( @app_entries ) {
84 my( $line, $num ) = split( /\./, $app->{_id} );
85 # DEBUG with a short graph
86 # last if $line > 2;
2ceca8c3 87 # DEBUG for problematic entries
e49731d7 88 my $scrutinize = "";
b49c4318 89 my $first_line_node = $base_line_starts[ $line ];
90 my $too_far = $base_line_starts[ $line+1 ];
91
92 my $lemma = $app->{rdg_0};
93 my $seq = 1;
94 # Is this the Nth occurrence of this reading in the line?
95 if( $lemma =~ s/(_)?(\d)$// ) {
96 $seq = $2;
97 }
98 my @lemma_words = split( /\s+/, $lemma );
99
100 # Now search for the lemma words within this line.
101 my $lemma_start = $first_line_node;
102 my $lemma_end;
103 my %seen;
104 while( $lemma_start ne $too_far ) {
105 # Loop detection
106 if( $seen{ $lemma_start->name() } ) {
107 warn "Detected loop at " . $lemma_start->name() .
108 ", ref $line,$num";
109 last;
110 }
111 $seen{ $lemma_start->name() } = 1;
112
113 # Try to match the lemma.
114 my $unmatch = 0;
115 print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
116 $lemma_words[0] . "...\n"
117 if "$line.$num" eq $scrutinize;
118 if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
119 # Skip it if we need a match that is not the first.
120 if( --$seq < 1 ) {
121 # Now we have to compare the rest of the words here.
122 if( scalar( @lemma_words ) > 1 ) {
123 my $next_node = $graph->next_word( $lemma_start );
124 foreach my $w ( @lemma_words[1..$#lemma_words] ) {
125 printf STDERR "Now matching %s against %s\n",
126 cmp_str($next_node), $w
127 if "$line.$num" eq $scrutinize;
128 if( $w ne cmp_str($next_node) ) {
129 $unmatch = 1;
130 last;
131 } else {
132 $lemma_end = $next_node;
133 $next_node = $graph->next_word( $lemma_end );
134 }
135 }
136 } else {
137 $lemma_end = $lemma_start;
138 }
139 } else {
140 $unmatch = 1;
141 }
142 }
143 last unless ( $unmatch || !defined( $lemma_end ) );
144 $lemma_end = undef;
145 $lemma_start = $graph->next_word( $lemma_start );
146 }
147
148 unless( $lemma_end ) {
149 warn "No match found for @lemma_words at $line.$num";
150 next;
151 } else {
152 # These are no longer common nodes; unmark them as such.
153 my @lemma_nodes = $graph->node_sequence( $lemma_start,
154 $lemma_end );
155 map { $_->set_attribute( 'class', 'lemma' ) } @lemma_nodes;
156 }
157
158 # Now we have our lemma nodes; we add the variant nodes to the graph.
159
e49731d7 160 # Keep track of the start and end point of each reading for later
161 # node collapse.
162 my @readings = ( $lemma_start, $lemma_end );
163
b49c4318 164 # For each reading that is not rdg_0, we make a chain of nodes
165 # and connect them to the anchor. Edges are named after the mss
166 # that are relevant.
167 foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
168 next if $k eq 'rdg_0'; # that's the lemma.
169 my @variant = split( /\s+/, $app->{$k} );
170 @variant = () if $app->{$k} eq '/'; # This is an omission.
171 my @mss = grep { $app->{$_} eq $k } keys( %$app );
172
173 unless( @mss ) {
174 print STDERR "Skipping '@variant' at $line.$num: no mss\n";
175 next;
176 }
177
178 # Determine the label name for the edges here.
179 my $edge_name = join(', ', @mss );
52ce987f 180 @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
b49c4318 181
182 # Make the variant into a set of nodes.
183 my $ctr = 0;
184 my $last_node = $graph->prior_word( $lemma_start );
185 my $var_start;
186 foreach my $vw ( @variant ) {
187 my $vwname = "$k/$line.$num.$ctr"; $ctr++;
188 my $vwnode = $graph->add_node( $vwname );
189 $vwnode->set_attribute( 'label', $vw );
190 $vwnode->set_attribute( 'class', 'variant' );
191 $graph->add_edge( $last_node, $vwnode, $edge_name );
192 $var_start = $vwnode unless $var_start;
193 $last_node = $vwnode;
194 }
195 # Now hook it up at the end.
196 $graph->add_edge( $last_node, $graph->next_word( $lemma_end ),
197 $edge_name );
198
e49731d7 199 if( $var_start ) { # if it wasn't an empty reading
200 push( @readings, $var_start, $last_node );
201 }
b49c4318 202 }
e49731d7 203
204 # Now collate and collapse the identical nodes within the graph.
205 collate_variants( $graph, @readings );
b49c4318 206 }
207
208 ## Now in theory I have a graph. I want to make it a little easier to
209 ## read. So I collapse nodes that have only one edge in and one edge
210 ## out, and I do this by looking at the edges.
211
52ce987f 212# foreach my $edge ( $graph->edges() ) {
213# my @out_edges = $edge->from()->outgoing();
214# my @in_edges = $edge->to()->incoming();
b49c4318 215
52ce987f 216# next if $edge->from() eq $graph->start();
217# next if $edge->to()->name() eq '#END#';
218# next unless scalar( @out_edges ) == 1;
219# next unless scalar( @in_edges ) == 1;
220# next unless $out_edges[0] eq $in_edges[0];
221# # In theory if we've got this far, we're safe, but just to
222# # double-check...
223# next unless $out_edges[0] eq $edge;
b49c4318 224
52ce987f 225# $graph->merge_nodes( $edge->from(), $edge->to(), ' ' );
226# }
227
228 # Now walk the path for each witness, so that we can do the
229 # position calculations.
230 my $paths = {};
231 foreach my $w ( keys %all_witnesses ) {
232 my $back = undef;
233 if( $w =~ /^(.*)\s*\(p\.\s*c\.\)/ ) {
234 $back = $1;
235 }
236 my @wit_nodes = $graph->node_sequence( $graph->start,
237 $graph->node( '#END#' ),
238 $w, $back );
239 my @wn_names = map { $_->name() } @wit_nodes;
240 $paths->{$w} = \@wn_names;
b49c4318 241 }
52ce987f 242 $DB::single = 1;
243 my @common_nodes = grep { $graph->is_common( $_ ) } $graph->nodes();
244 $graph->make_positions( \@common_nodes, $paths );
b49c4318 245}
246
2ceca8c3 247=item B<read_base>
248
249my @line_beginnings = read_base( 'reference.txt', $graph );
250
251Takes a text file and a (presumed empty) graph object, adds the words
252as simple linear nodes to the graph, and returns a list of nodes that
253represent the beginning of lines. This graph is now the starting point
254for application of apparatus entries in merge_base, e.g. from a CSV
255file or a Classical Text Editor file.
256
257=cut
b49c4318 258
259sub read_base {
260 my( $base_file, $graph ) = @_;
261
262 # This array gives the first node for each line. We put the
263 # common starting point in line zero.
264 my $last_node = $graph->start();
265 my $lineref_array = [ $last_node ]; # There is no line zero.
266
267 open( BASE, $base_file ) or die "Could not open file $base_file: $!";
268 while(<BASE>) {
269 # Make the nodes, and connect them up for the base, but also
270 # save the first node of each line in an array for the purpose.
271 chomp;
272 my @words = split;
273 my $started = 0;
274 my $wordref = 0;
275 my $lineref = scalar @$lineref_array;
276 foreach my $w ( @words ) {
277 my $noderef = join( ',', $lineref, ++$wordref );
278 my $node = $graph->add_node( $noderef );
279 $node->set_attribute( 'label', $w );
280 $node->set_attribute( 'class', 'common' );
281 unless( $started ) {
282 push( @$lineref_array, $node );
283 $started = 1;
284 }
285 if( $last_node ) {
e49731d7 286 my $edge = $graph->add_edge( $last_node, $node, "base text" );
287 $edge->set_attribute( 'class', 'basetext' );
b49c4318 288 $last_node = $node;
289 } # TODO there should be no else here...
290 }
291 }
292 close BASE;
293 # Ending point for all texts
294 my $endpoint = $graph->add_node( '#END#' );
295 $graph->add_edge( $last_node, $endpoint, "base text" );
296 push( @$lineref_array, $endpoint );
297
298 return( @$lineref_array );
299}
300
e49731d7 301=item B<collate_variants>
2ceca8c3 302
e49731d7 303collate_variants( $graph, @readings )
2ceca8c3 304
e49731d7 305Given a set of readings in the form
306( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
2ceca8c3 307walks through each to identify those nodes that are identical. The
e49731d7 308graph is a Text::Tradition::Graph object; the elements of @readings are
2ceca8c3 309Graph::Easy::Node objects that appear on the graph.
b49c4318 310
2ceca8c3 311TODO: Handle collapsed and non-collapsed transpositions.
312
313=cut
b49c4318 314
e49731d7 315sub collate_variants {
316 my( $graph, @readings ) = @_;
317 my $lemma_start = shift @readings;
318 my $lemma_end = shift @readings;
52ce987f 319 my $detranspose = 0;
b49c4318 320
1b9423d5 321 # We need to calculate positions at this point, which is where
322 # we are getting the implicit information from the apparatus.
323
e49731d7 324 # Start the list of distinct nodes with those nodes in the lemma.
325 my @distinct_nodes;
1b9423d5 326 my $position = 0;
b49c4318 327 while( $lemma_start ne $lemma_end ) {
1b9423d5 328 push( @distinct_nodes, [ $lemma_start, 'base text', $position++ ] );
b49c4318 329 $lemma_start = $graph->next_word( $lemma_start );
330 }
1b9423d5 331 push( @distinct_nodes, [ $lemma_end, 'base text', $position++ ] );
b49c4318 332
e49731d7 333
334 while( scalar @readings ) {
335 my( $var_start, $var_end ) = splice( @readings, 0, 2 );
336
337 # I want to look at the nodes in the variant and lemma, and
338 # collapse nodes that are the same word. This is mini-collation.
339 # Each word in the 'main' list can only be collapsed once with a
340 # word from the current reading.
341 my %collapsed = ();
342
343 # Get the label. There will only be one outgoing edge to start
344 # with, so this is safe.
345 my @out = $var_start->outgoing();
346 my $var_label = $out[0]->label();
347
348 my @variant_nodes;
349 while( $var_start ne $var_end ) {
350 push( @variant_nodes, $var_start );
351 $var_start = $graph->next_word( $var_start, $var_label );
352 }
353 push( @variant_nodes, $var_end );
354
355 # Go through the variant nodes, and if we find a lemma node that
356 # hasn't yet been collapsed with a node, equate them. If we do
357 # not, keep them to push onto the end of all_nodes.
358 my @remaining_nodes;
359 my $last_index = 0;
1b9423d5 360 my $curr_pos = 0;
e49731d7 361 foreach my $w ( @variant_nodes ) {
362 my $word = $w->label();
363 my $matched = 0;
364 foreach my $idx ( $last_index .. $#distinct_nodes ) {
1b9423d5 365 my( $l, $edgelabel, $pos ) = @{$distinct_nodes[$idx]};
e49731d7 366 if( $word eq cmp_str( $l ) ) {
367 next if exists( $collapsed{ $l->label } )
368 && $collapsed{ $l->label } eq $l;
369 $matched = 1;
370 $last_index = $idx if $detranspose;
371 # Collapse the nodes.
372 printf STDERR "Merging nodes %s/%s and %s/%s\n",
373 $l->name, $l->label, $w->name, $w->label;
374 $graph->merge_nodes( $l, $w );
375 $collapsed{ $l->label } = $l;
376 # Now collapse any multiple edges to and from the node.
377 remove_duplicate_edges( $graph,
378 $graph->prior_word( $l, $edgelabel ), $l );
379 remove_duplicate_edges( $graph, $l,
380 $graph->next_word( $l, $edgelabel ) );
1b9423d5 381 $curr_pos = $pos;
382 last;
e49731d7 383 }
b49c4318 384 }
1b9423d5 385 push( @remaining_nodes, [ $w, $var_label, $curr_pos++ ] ) unless $matched;
b49c4318 386 }
e49731d7 387 push( @distinct_nodes, @remaining_nodes) if scalar( @remaining_nodes );
b49c4318 388 }
1b9423d5 389
390 # Now set the positions of all the nodes in this variation.
391 #$DB::single = 1;
392 print STDERR "Nodes and their positions are:\n";
393 foreach my $n ( @distinct_nodes ) {
394 printf STDERR "\t%s (position %s)\n", $n->[0]->label(), $n->[2];
395 }
b49c4318 396}
397
2ceca8c3 398=item B<remove_duplicate_edges>
399
400remove_duplicate_edges( $graph, $from, $to );
401
402Given two nodes, reduce the number of edges between those nodes to
403one. If neither edge represents a base text, combine their labels.
404
405=cut
406
b49c4318 407sub remove_duplicate_edges {
408 my( $graph, $from, $to ) = @_;
409 my @edges = $from->edges_to( $to );
410 if( scalar @edges > 1 ) {
411 my @base = grep { $_->label eq 'base text' } @edges;
412 if ( scalar @base ) {
413 # Remove the edges that are not base.
414 foreach my $e ( @edges ) {
415 $graph->del_edge( $e )
416 unless $e eq $base[0];
417 }
418 } else {
419 # Combine the edges into one.
420 my $new_edge_name = join( ', ', map { $_->label() } @edges );
421 my $new_edge = shift @edges;
422 $new_edge->set_attribute( 'label', $new_edge_name );
423 foreach my $e ( @edges ) {
424 $graph->del_edge( $e );
425 }
426 }
427 }
428}
429
2ceca8c3 430=item B<cmp_str>
431
432Pretend you never saw this method. Really it needs to not be hardcoded.
433
434=cut
435
b49c4318 436sub cmp_str {
437 my( $node ) = @_;
438 my $word = $node->label();
439 $word = lc( $word );
440 $word =~ s/\W//g;
441 $word =~ s/v/u/g;
442 $word =~ s/j/i/g;
443 $word =~ s/cha/ca/g;
444 $word =~ s/quatuor/quattuor/g;
445 $word =~ s/ioannes/iohannes/g;
446 return $word;
447}
448
2ceca8c3 449=back
450
451=head1 LICENSE
452
453This package is free software and is provided "as is" without express
454or implied warranty. You can redistribute it and/or modify it under
455the same terms as Perl itself.
456
457=head1 AUTHOR
458
459Tara L Andrews, aurum@cpan.org
460
461=cut
462
b49c4318 4631;