Add some documentation
[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / BaseText.pm
CommitLineData
e58153d6 1package Text::Tradition::Parser::BaseText;
b49c4318 2
3use strict;
4use warnings;
5use Exporter 'import';
6use vars qw( @EXPORT_OK );
7@EXPORT_OK = qw( merge_base );
8
2ceca8c3 9=head1 NAME
10
11Text::Tradition::Parser::BaseText
12
13=head1 SYNOPSIS
14
15use Text::Tradition::Parser::BaseText qw( merge_base );
16merge_base( $graph, 'reference.txt', @apparatus_entries )
17
18=head1 DESCRIPTION
19
20For an overview of the package, see the documentation for the
21Text::Tradition::Graph module.
22
23This module is meant for use with certain of the other Parser classes
24- whenever a list of variants is given with reference to a base text,
25these must be joined into a single collation. The parser should
26therefore make a list of variants and their locations, and BaseText
27will join those listed variants onto the reference text.
28
29=head1 SUBROUTINES
30
31=over
32
33=item B<merge_base>
34
35merge_base( $graph, 'reference.txt', @apparatus_entries )
36
37Takes three arguments: a newly-initialized Text::Tradition::Graph
38object, a text file containing the reference text, and a list of
39variants (apparatus entries). Adds the base text to the graph, and
40joins the variants to that.
41
42The list of variants is an array of hash references; each hash takes
43the form
44 { '_id' => line reference,
45 'rdg_0' => lemma reading,
46 'rdg_1' => first variant,
47 ... # and so on until all distinct readings are listed
48 'WitnessA' => 'rdg_0',
49 'WitnessB' => 'rdg_1',
50 ... # and so on until all witnesses are listed with their readings
51 }
52
53Any hash key that is not of the form /^rdg_\d+$/ and that does not
54begin with an underscore is assumed to be a witness name. Any 'meta'
55information to be passed must be passed in a key with a leading
56underscore in its name.
57
58=cut
59
b49c4318 60sub merge_base {
61 my( $graph, $base_file, @app_entries ) = @_;
62 my @base_line_starts = read_base( $base_file, $graph );
63
64 foreach my $app ( @app_entries ) {
65 my( $line, $num ) = split( /\./, $app->{_id} );
66 # DEBUG with a short graph
67 # last if $line > 2;
2ceca8c3 68 # DEBUG for problematic entries
69 # my $scrutinize = "21.8";
b49c4318 70 my $first_line_node = $base_line_starts[ $line ];
71 my $too_far = $base_line_starts[ $line+1 ];
72
73 my $lemma = $app->{rdg_0};
74 my $seq = 1;
75 # Is this the Nth occurrence of this reading in the line?
76 if( $lemma =~ s/(_)?(\d)$// ) {
77 $seq = $2;
78 }
79 my @lemma_words = split( /\s+/, $lemma );
80
81 # Now search for the lemma words within this line.
82 my $lemma_start = $first_line_node;
83 my $lemma_end;
84 my %seen;
85 while( $lemma_start ne $too_far ) {
86 # Loop detection
87 if( $seen{ $lemma_start->name() } ) {
88 warn "Detected loop at " . $lemma_start->name() .
89 ", ref $line,$num";
90 last;
91 }
92 $seen{ $lemma_start->name() } = 1;
93
94 # Try to match the lemma.
95 my $unmatch = 0;
96 print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
97 $lemma_words[0] . "...\n"
98 if "$line.$num" eq $scrutinize;
99 if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
100 # Skip it if we need a match that is not the first.
101 if( --$seq < 1 ) {
102 # Now we have to compare the rest of the words here.
103 if( scalar( @lemma_words ) > 1 ) {
104 my $next_node = $graph->next_word( $lemma_start );
105 foreach my $w ( @lemma_words[1..$#lemma_words] ) {
106 printf STDERR "Now matching %s against %s\n",
107 cmp_str($next_node), $w
108 if "$line.$num" eq $scrutinize;
109 if( $w ne cmp_str($next_node) ) {
110 $unmatch = 1;
111 last;
112 } else {
113 $lemma_end = $next_node;
114 $next_node = $graph->next_word( $lemma_end );
115 }
116 }
117 } else {
118 $lemma_end = $lemma_start;
119 }
120 } else {
121 $unmatch = 1;
122 }
123 }
124 last unless ( $unmatch || !defined( $lemma_end ) );
125 $lemma_end = undef;
126 $lemma_start = $graph->next_word( $lemma_start );
127 }
128
129 unless( $lemma_end ) {
130 warn "No match found for @lemma_words at $line.$num";
131 next;
132 } else {
133 # These are no longer common nodes; unmark them as such.
134 my @lemma_nodes = $graph->node_sequence( $lemma_start,
135 $lemma_end );
136 map { $_->set_attribute( 'class', 'lemma' ) } @lemma_nodes;
137 }
138
139 # Now we have our lemma nodes; we add the variant nodes to the graph.
140
141 # For each reading that is not rdg_0, we make a chain of nodes
142 # and connect them to the anchor. Edges are named after the mss
143 # that are relevant.
144 foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
145 next if $k eq 'rdg_0'; # that's the lemma.
146 my @variant = split( /\s+/, $app->{$k} );
147 @variant = () if $app->{$k} eq '/'; # This is an omission.
148 my @mss = grep { $app->{$_} eq $k } keys( %$app );
149
150 unless( @mss ) {
151 print STDERR "Skipping '@variant' at $line.$num: no mss\n";
152 next;
153 }
154
155 # Determine the label name for the edges here.
156 my $edge_name = join(', ', @mss );
157
158 # Make the variant into a set of nodes.
159 my $ctr = 0;
160 my $last_node = $graph->prior_word( $lemma_start );
161 my $var_start;
162 foreach my $vw ( @variant ) {
163 my $vwname = "$k/$line.$num.$ctr"; $ctr++;
164 my $vwnode = $graph->add_node( $vwname );
165 $vwnode->set_attribute( 'label', $vw );
166 $vwnode->set_attribute( 'class', 'variant' );
167 $graph->add_edge( $last_node, $vwnode, $edge_name );
168 $var_start = $vwnode unless $var_start;
169 $last_node = $vwnode;
170 }
171 # Now hook it up at the end.
172 $graph->add_edge( $last_node, $graph->next_word( $lemma_end ),
173 $edge_name );
174
175 # Now collate and collapse the identical nodes within the graph.
176 collate_variant( $graph, $lemma_start, $lemma_end,
177 $var_start, $last_node );
178
179 }
180 }
181
182 ## Now in theory I have a graph. I want to make it a little easier to
183 ## read. So I collapse nodes that have only one edge in and one edge
184 ## out, and I do this by looking at the edges.
185
186 foreach my $edge ( $graph->edges() ) {
187 my @out_edges = $edge->from()->outgoing();
188 my @in_edges = $edge->to()->incoming();
189
190 next unless scalar( @out_edges ) == 1;
191 next unless scalar( @in_edges ) == 1;
192 next unless $out_edges[0] eq $in_edges[0];
193 # In theory if we've got this far, we're safe, but just to
194 # double-check...
195 next unless $out_edges[0] eq $edge;
196
197 $graph->merge_nodes( $edge->from(), $edge->to(), ' ' );
198 }
199}
200
2ceca8c3 201=item B<read_base>
202
203my @line_beginnings = read_base( 'reference.txt', $graph );
204
205Takes a text file and a (presumed empty) graph object, adds the words
206as simple linear nodes to the graph, and returns a list of nodes that
207represent the beginning of lines. This graph is now the starting point
208for application of apparatus entries in merge_base, e.g. from a CSV
209file or a Classical Text Editor file.
210
211=cut
b49c4318 212
213sub read_base {
214 my( $base_file, $graph ) = @_;
215
216 # This array gives the first node for each line. We put the
217 # common starting point in line zero.
218 my $last_node = $graph->start();
219 my $lineref_array = [ $last_node ]; # There is no line zero.
220
221 open( BASE, $base_file ) or die "Could not open file $base_file: $!";
222 while(<BASE>) {
223 # Make the nodes, and connect them up for the base, but also
224 # save the first node of each line in an array for the purpose.
225 chomp;
226 my @words = split;
227 my $started = 0;
228 my $wordref = 0;
229 my $lineref = scalar @$lineref_array;
230 foreach my $w ( @words ) {
231 my $noderef = join( ',', $lineref, ++$wordref );
232 my $node = $graph->add_node( $noderef );
233 $node->set_attribute( 'label', $w );
234 $node->set_attribute( 'class', 'common' );
235 unless( $started ) {
236 push( @$lineref_array, $node );
237 $started = 1;
238 }
239 if( $last_node ) {
240 $graph->add_edge( $last_node, $node, "base text" );
241 $last_node = $node;
242 } # TODO there should be no else here...
243 }
244 }
245 close BASE;
246 # Ending point for all texts
247 my $endpoint = $graph->add_node( '#END#' );
248 $graph->add_edge( $last_node, $endpoint, "base text" );
249 push( @$lineref_array, $endpoint );
250
251 return( @$lineref_array );
252}
253
2ceca8c3 254=item B<collate_variant>
255
256collate_variant( $graph, $lemma_start, $lemma_end, $var_start, $var_end );
257
258Given a lemma and a variant as start- and endpoints on the graph,
259walks through each to identify those nodes that are identical. The
260graph is a Text::Tradition::Graph object; the other arguments are
261Graph::Easy::Node objects that appear on the graph.
b49c4318 262
2ceca8c3 263TODO: Handle collapsed and non-collapsed transpositions.
264
265=cut
b49c4318 266
267sub collate_variant {
268 my( $graph, $lemma_start, $lemma_end, $var_start, $var_end ) = @_;
269 # If var_start is undef, then the variant is an omission and
270 # there's nothing to collate. Return.
271 return unless $var_start;
272
273 # I want to look at the nodes in the variant and lemma, and
274 # collapse nodes that are the same word. This is mini-collation.
275 my %collapsed = ();
276 # There will only be one outgoing edge at first, so this is safe.
277 my @out = $var_start->outgoing();
278 my $var_label = $out[0]->label();
279
280 my @lemma_nodes;
281 while( $lemma_start ne $lemma_end ) {
282 push( @lemma_nodes, $lemma_start );
283 $lemma_start = $graph->next_word( $lemma_start );
284 }
285 push( @lemma_nodes, $lemma_end );
286
287 my @variant_nodes;
288 while( $var_start ne $var_end ) {
289 push( @variant_nodes, $var_start );
290 $var_start = $graph->next_word( $var_start, $var_label );
291 }
292 push( @variant_nodes, $var_end );
293
294 # Go through the variant nodes, and if we find a lemma node that
295 # hasn't yet been collapsed with a node, equate them.
296
297 foreach my $w ( @variant_nodes ) {
298 my $word = $w->label();
299 foreach my $l ( @lemma_nodes ) {
300 if( $word eq cmp_str( $l ) ) {
301 next if exists( $collapsed{ $l->label } )
302 && $collapsed{ $l->label } eq $l;
303 # Collapse the nodes.
304 printf STDERR "Merging nodes %s/%s and %s/%s\n",
305 $l->name, $l->label, $w->name, $w->label;
306 $graph->merge_nodes( $l, $w );
307 $collapsed{ $l->label } = $l;
308 # Now collapse any multiple edges to and from the node.
309 # Rely on the presence of the 'base text' edge.
310 remove_duplicate_edges( $graph, $graph->prior_word( $l ), $l );
311 remove_duplicate_edges( $graph, $l, $graph->next_word( $l ) );
312 }
313 }
314 }
315}
316
2ceca8c3 317=item B<remove_duplicate_edges>
318
319remove_duplicate_edges( $graph, $from, $to );
320
321Given two nodes, reduce the number of edges between those nodes to
322one. If neither edge represents a base text, combine their labels.
323
324=cut
325
b49c4318 326sub remove_duplicate_edges {
327 my( $graph, $from, $to ) = @_;
328 my @edges = $from->edges_to( $to );
329 if( scalar @edges > 1 ) {
330 my @base = grep { $_->label eq 'base text' } @edges;
331 if ( scalar @base ) {
332 # Remove the edges that are not base.
333 foreach my $e ( @edges ) {
334 $graph->del_edge( $e )
335 unless $e eq $base[0];
336 }
337 } else {
338 # Combine the edges into one.
339 my $new_edge_name = join( ', ', map { $_->label() } @edges );
340 my $new_edge = shift @edges;
341 $new_edge->set_attribute( 'label', $new_edge_name );
342 foreach my $e ( @edges ) {
343 $graph->del_edge( $e );
344 }
345 }
346 }
347}
348
2ceca8c3 349=item B<cmp_str>
350
351Pretend you never saw this method. Really it needs to not be hardcoded.
352
353=cut
354
b49c4318 355sub cmp_str {
356 my( $node ) = @_;
357 my $word = $node->label();
358 $word = lc( $word );
359 $word =~ s/\W//g;
360 $word =~ s/v/u/g;
361 $word =~ s/j/i/g;
362 $word =~ s/cha/ca/g;
363 $word =~ s/quatuor/quattuor/g;
364 $word =~ s/ioannes/iohannes/g;
365 return $word;
366}
367
2ceca8c3 368=back
369
370=head1 LICENSE
371
372This package is free software and is provided "as is" without express
373or implied warranty. You can redistribute it and/or modify it under
374the same terms as Perl itself.
375
376=head1 AUTHOR
377
378Tara L Andrews, aurum@cpan.org
379
380=cut
381
b49c4318 3821;