Commit | Line | Data |
e58153d6 |
1 | package Text::Tradition::Parser::BaseText; |
b49c4318 |
2 | |
3 | use strict; |
4 | use warnings; |
5 | use Exporter 'import'; |
6 | use vars qw( @EXPORT_OK ); |
7 | @EXPORT_OK = qw( merge_base ); |
8 | |
2ceca8c3 |
9 | =head1 NAME |
10 | |
11 | Text::Tradition::Parser::BaseText |
12 | |
13 | =head1 SYNOPSIS |
14 | |
15 | use Text::Tradition::Parser::BaseText qw( merge_base ); |
16 | merge_base( $graph, 'reference.txt', @apparatus_entries ) |
17 | |
18 | =head1 DESCRIPTION |
19 | |
20 | For an overview of the package, see the documentation for the |
21 | Text::Tradition::Graph module. |
22 | |
23 | This module is meant for use with certain of the other Parser classes |
24 | - whenever a list of variants is given with reference to a base text, |
25 | these must be joined into a single collation. The parser should |
26 | therefore make a list of variants and their locations, and BaseText |
27 | will join those listed variants onto the reference text. |
28 | |
29 | =head1 SUBROUTINES |
30 | |
31 | =over |
32 | |
33 | =item B<merge_base> |
34 | |
35 | merge_base( $graph, 'reference.txt', @apparatus_entries ) |
36 | |
37 | Takes three arguments: a newly-initialized Text::Tradition::Graph |
38 | object, a text file containing the reference text, and a list of |
39 | variants (apparatus entries). Adds the base text to the graph, and |
40 | joins the variants to that. |
41 | |
42 | The list of variants is an array of hash references; each hash takes |
43 | the form |
44 | { '_id' => line reference, |
45 | 'rdg_0' => lemma reading, |
46 | 'rdg_1' => first variant, |
47 | ... # and so on until all distinct readings are listed |
48 | 'WitnessA' => 'rdg_0', |
49 | 'WitnessB' => 'rdg_1', |
50 | ... # and so on until all witnesses are listed with their readings |
51 | } |
52 | |
53 | Any hash key that is not of the form /^rdg_\d+$/ and that does not |
54 | begin with an underscore is assumed to be a witness name. Any 'meta' |
55 | information to be passed must be passed in a key with a leading |
56 | underscore in its name. |
57 | |
58 | =cut |
59 | |
b49c4318 |
60 | sub merge_base { |
61 | my( $graph, $base_file, @app_entries ) = @_; |
62 | my @base_line_starts = read_base( $base_file, $graph ); |
63 | |
64 | foreach my $app ( @app_entries ) { |
65 | my( $line, $num ) = split( /\./, $app->{_id} ); |
66 | # DEBUG with a short graph |
67 | # last if $line > 2; |
2ceca8c3 |
68 | # DEBUG for problematic entries |
e49731d7 |
69 | my $scrutinize = ""; |
b49c4318 |
70 | my $first_line_node = $base_line_starts[ $line ]; |
71 | my $too_far = $base_line_starts[ $line+1 ]; |
72 | |
73 | my $lemma = $app->{rdg_0}; |
74 | my $seq = 1; |
75 | # Is this the Nth occurrence of this reading in the line? |
76 | if( $lemma =~ s/(_)?(\d)$// ) { |
77 | $seq = $2; |
78 | } |
79 | my @lemma_words = split( /\s+/, $lemma ); |
80 | |
81 | # Now search for the lemma words within this line. |
82 | my $lemma_start = $first_line_node; |
83 | my $lemma_end; |
84 | my %seen; |
85 | while( $lemma_start ne $too_far ) { |
86 | # Loop detection |
87 | if( $seen{ $lemma_start->name() } ) { |
88 | warn "Detected loop at " . $lemma_start->name() . |
89 | ", ref $line,$num"; |
90 | last; |
91 | } |
92 | $seen{ $lemma_start->name() } = 1; |
93 | |
94 | # Try to match the lemma. |
95 | my $unmatch = 0; |
96 | print STDERR "Matching " . cmp_str( $lemma_start) . " against " . |
97 | $lemma_words[0] . "...\n" |
98 | if "$line.$num" eq $scrutinize; |
99 | if( cmp_str( $lemma_start ) eq $lemma_words[0] ) { |
100 | # Skip it if we need a match that is not the first. |
101 | if( --$seq < 1 ) { |
102 | # Now we have to compare the rest of the words here. |
103 | if( scalar( @lemma_words ) > 1 ) { |
104 | my $next_node = $graph->next_word( $lemma_start ); |
105 | foreach my $w ( @lemma_words[1..$#lemma_words] ) { |
106 | printf STDERR "Now matching %s against %s\n", |
107 | cmp_str($next_node), $w |
108 | if "$line.$num" eq $scrutinize; |
109 | if( $w ne cmp_str($next_node) ) { |
110 | $unmatch = 1; |
111 | last; |
112 | } else { |
113 | $lemma_end = $next_node; |
114 | $next_node = $graph->next_word( $lemma_end ); |
115 | } |
116 | } |
117 | } else { |
118 | $lemma_end = $lemma_start; |
119 | } |
120 | } else { |
121 | $unmatch = 1; |
122 | } |
123 | } |
124 | last unless ( $unmatch || !defined( $lemma_end ) ); |
125 | $lemma_end = undef; |
126 | $lemma_start = $graph->next_word( $lemma_start ); |
127 | } |
128 | |
129 | unless( $lemma_end ) { |
130 | warn "No match found for @lemma_words at $line.$num"; |
131 | next; |
132 | } else { |
133 | # These are no longer common nodes; unmark them as such. |
134 | my @lemma_nodes = $graph->node_sequence( $lemma_start, |
135 | $lemma_end ); |
136 | map { $_->set_attribute( 'class', 'lemma' ) } @lemma_nodes; |
137 | } |
138 | |
139 | # Now we have our lemma nodes; we add the variant nodes to the graph. |
140 | |
e49731d7 |
141 | # Keep track of the start and end point of each reading for later |
142 | # node collapse. |
143 | my @readings = ( $lemma_start, $lemma_end ); |
144 | |
b49c4318 |
145 | # For each reading that is not rdg_0, we make a chain of nodes |
146 | # and connect them to the anchor. Edges are named after the mss |
147 | # that are relevant. |
148 | foreach my $k ( grep { /^rdg/ } keys( %$app ) ) { |
149 | next if $k eq 'rdg_0'; # that's the lemma. |
150 | my @variant = split( /\s+/, $app->{$k} ); |
151 | @variant = () if $app->{$k} eq '/'; # This is an omission. |
152 | my @mss = grep { $app->{$_} eq $k } keys( %$app ); |
153 | |
154 | unless( @mss ) { |
155 | print STDERR "Skipping '@variant' at $line.$num: no mss\n"; |
156 | next; |
157 | } |
158 | |
159 | # Determine the label name for the edges here. |
160 | my $edge_name = join(', ', @mss ); |
161 | |
162 | # Make the variant into a set of nodes. |
163 | my $ctr = 0; |
164 | my $last_node = $graph->prior_word( $lemma_start ); |
165 | my $var_start; |
166 | foreach my $vw ( @variant ) { |
167 | my $vwname = "$k/$line.$num.$ctr"; $ctr++; |
168 | my $vwnode = $graph->add_node( $vwname ); |
169 | $vwnode->set_attribute( 'label', $vw ); |
170 | $vwnode->set_attribute( 'class', 'variant' ); |
171 | $graph->add_edge( $last_node, $vwnode, $edge_name ); |
172 | $var_start = $vwnode unless $var_start; |
173 | $last_node = $vwnode; |
174 | } |
175 | # Now hook it up at the end. |
176 | $graph->add_edge( $last_node, $graph->next_word( $lemma_end ), |
177 | $edge_name ); |
178 | |
e49731d7 |
179 | if( $var_start ) { # if it wasn't an empty reading |
180 | push( @readings, $var_start, $last_node ); |
181 | } |
b49c4318 |
182 | } |
e49731d7 |
183 | |
184 | # Now collate and collapse the identical nodes within the graph. |
185 | collate_variants( $graph, @readings ); |
b49c4318 |
186 | } |
187 | |
188 | ## Now in theory I have a graph. I want to make it a little easier to |
189 | ## read. So I collapse nodes that have only one edge in and one edge |
190 | ## out, and I do this by looking at the edges. |
191 | |
192 | foreach my $edge ( $graph->edges() ) { |
193 | my @out_edges = $edge->from()->outgoing(); |
194 | my @in_edges = $edge->to()->incoming(); |
195 | |
196 | next unless scalar( @out_edges ) == 1; |
197 | next unless scalar( @in_edges ) == 1; |
198 | next unless $out_edges[0] eq $in_edges[0]; |
199 | # In theory if we've got this far, we're safe, but just to |
200 | # double-check... |
201 | next unless $out_edges[0] eq $edge; |
202 | |
203 | $graph->merge_nodes( $edge->from(), $edge->to(), ' ' ); |
204 | } |
205 | } |
206 | |
2ceca8c3 |
207 | =item B<read_base> |
208 | |
209 | my @line_beginnings = read_base( 'reference.txt', $graph ); |
210 | |
211 | Takes a text file and a (presumed empty) graph object, adds the words |
212 | as simple linear nodes to the graph, and returns a list of nodes that |
213 | represent the beginning of lines. This graph is now the starting point |
214 | for application of apparatus entries in merge_base, e.g. from a CSV |
215 | file or a Classical Text Editor file. |
216 | |
217 | =cut |
b49c4318 |
218 | |
219 | sub read_base { |
220 | my( $base_file, $graph ) = @_; |
221 | |
222 | # This array gives the first node for each line. We put the |
223 | # common starting point in line zero. |
224 | my $last_node = $graph->start(); |
225 | my $lineref_array = [ $last_node ]; # There is no line zero. |
226 | |
227 | open( BASE, $base_file ) or die "Could not open file $base_file: $!"; |
228 | while(<BASE>) { |
229 | # Make the nodes, and connect them up for the base, but also |
230 | # save the first node of each line in an array for the purpose. |
231 | chomp; |
232 | my @words = split; |
233 | my $started = 0; |
234 | my $wordref = 0; |
235 | my $lineref = scalar @$lineref_array; |
236 | foreach my $w ( @words ) { |
237 | my $noderef = join( ',', $lineref, ++$wordref ); |
238 | my $node = $graph->add_node( $noderef ); |
239 | $node->set_attribute( 'label', $w ); |
240 | $node->set_attribute( 'class', 'common' ); |
241 | unless( $started ) { |
242 | push( @$lineref_array, $node ); |
243 | $started = 1; |
244 | } |
245 | if( $last_node ) { |
e49731d7 |
246 | my $edge = $graph->add_edge( $last_node, $node, "base text" ); |
247 | $edge->set_attribute( 'class', 'basetext' ); |
b49c4318 |
248 | $last_node = $node; |
249 | } # TODO there should be no else here... |
250 | } |
251 | } |
252 | close BASE; |
253 | # Ending point for all texts |
254 | my $endpoint = $graph->add_node( '#END#' ); |
255 | $graph->add_edge( $last_node, $endpoint, "base text" ); |
256 | push( @$lineref_array, $endpoint ); |
257 | |
258 | return( @$lineref_array ); |
259 | } |
260 | |
e49731d7 |
261 | =item B<collate_variants> |
2ceca8c3 |
262 | |
e49731d7 |
263 | collate_variants( $graph, @readings ) |
2ceca8c3 |
264 | |
e49731d7 |
265 | Given a set of readings in the form |
266 | ( lemma_start, lemma_end, rdg1_start, rdg1_end, ... ) |
2ceca8c3 |
267 | walks through each to identify those nodes that are identical. The |
e49731d7 |
268 | graph is a Text::Tradition::Graph object; the elements of @readings are |
2ceca8c3 |
269 | Graph::Easy::Node objects that appear on the graph. |
b49c4318 |
270 | |
2ceca8c3 |
271 | TODO: Handle collapsed and non-collapsed transpositions. |
272 | |
273 | =cut |
b49c4318 |
274 | |
e49731d7 |
275 | sub collate_variants { |
276 | my( $graph, @readings ) = @_; |
277 | my $lemma_start = shift @readings; |
278 | my $lemma_end = shift @readings; |
279 | my $detranspose = 1; |
b49c4318 |
280 | |
e49731d7 |
281 | # Start the list of distinct nodes with those nodes in the lemma. |
282 | my @distinct_nodes; |
b49c4318 |
283 | while( $lemma_start ne $lemma_end ) { |
e49731d7 |
284 | push( @distinct_nodes, [ $lemma_start, 'base text' ] ); |
b49c4318 |
285 | $lemma_start = $graph->next_word( $lemma_start ); |
286 | } |
e49731d7 |
287 | push( @distinct_nodes, [ $lemma_end, 'base text' ] ); |
b49c4318 |
288 | |
e49731d7 |
289 | |
290 | while( scalar @readings ) { |
291 | my( $var_start, $var_end ) = splice( @readings, 0, 2 ); |
292 | |
293 | # I want to look at the nodes in the variant and lemma, and |
294 | # collapse nodes that are the same word. This is mini-collation. |
295 | # Each word in the 'main' list can only be collapsed once with a |
296 | # word from the current reading. |
297 | my %collapsed = (); |
298 | |
299 | # Get the label. There will only be one outgoing edge to start |
300 | # with, so this is safe. |
301 | my @out = $var_start->outgoing(); |
302 | my $var_label = $out[0]->label(); |
303 | |
304 | my @variant_nodes; |
305 | while( $var_start ne $var_end ) { |
306 | push( @variant_nodes, $var_start ); |
307 | $var_start = $graph->next_word( $var_start, $var_label ); |
308 | } |
309 | push( @variant_nodes, $var_end ); |
310 | |
311 | # Go through the variant nodes, and if we find a lemma node that |
312 | # hasn't yet been collapsed with a node, equate them. If we do |
313 | # not, keep them to push onto the end of all_nodes. |
314 | my @remaining_nodes; |
315 | my $last_index = 0; |
316 | foreach my $w ( @variant_nodes ) { |
317 | my $word = $w->label(); |
318 | my $matched = 0; |
319 | foreach my $idx ( $last_index .. $#distinct_nodes ) { |
320 | my( $l, $edgelabel ) = @{$distinct_nodes[$idx]}; |
321 | if( $word eq cmp_str( $l ) ) { |
322 | next if exists( $collapsed{ $l->label } ) |
323 | && $collapsed{ $l->label } eq $l; |
324 | $matched = 1; |
325 | $last_index = $idx if $detranspose; |
326 | # Collapse the nodes. |
327 | printf STDERR "Merging nodes %s/%s and %s/%s\n", |
328 | $l->name, $l->label, $w->name, $w->label; |
329 | $graph->merge_nodes( $l, $w ); |
330 | $collapsed{ $l->label } = $l; |
331 | # Now collapse any multiple edges to and from the node. |
332 | remove_duplicate_edges( $graph, |
333 | $graph->prior_word( $l, $edgelabel ), $l ); |
334 | remove_duplicate_edges( $graph, $l, |
335 | $graph->next_word( $l, $edgelabel ) ); |
336 | } |
b49c4318 |
337 | } |
e49731d7 |
338 | push( @remaining_nodes, [ $w, $var_label ] ) unless $matched; |
b49c4318 |
339 | } |
e49731d7 |
340 | push( @distinct_nodes, @remaining_nodes) if scalar( @remaining_nodes ); |
b49c4318 |
341 | } |
342 | } |
343 | |
2ceca8c3 |
344 | =item B<remove_duplicate_edges> |
345 | |
346 | remove_duplicate_edges( $graph, $from, $to ); |
347 | |
348 | Given two nodes, reduce the number of edges between those nodes to |
349 | one. If neither edge represents a base text, combine their labels. |
350 | |
351 | =cut |
352 | |
b49c4318 |
353 | sub remove_duplicate_edges { |
354 | my( $graph, $from, $to ) = @_; |
355 | my @edges = $from->edges_to( $to ); |
356 | if( scalar @edges > 1 ) { |
357 | my @base = grep { $_->label eq 'base text' } @edges; |
358 | if ( scalar @base ) { |
359 | # Remove the edges that are not base. |
360 | foreach my $e ( @edges ) { |
361 | $graph->del_edge( $e ) |
362 | unless $e eq $base[0]; |
363 | } |
364 | } else { |
365 | # Combine the edges into one. |
366 | my $new_edge_name = join( ', ', map { $_->label() } @edges ); |
367 | my $new_edge = shift @edges; |
368 | $new_edge->set_attribute( 'label', $new_edge_name ); |
369 | foreach my $e ( @edges ) { |
370 | $graph->del_edge( $e ); |
371 | } |
372 | } |
373 | } |
374 | } |
375 | |
2ceca8c3 |
376 | =item B<cmp_str> |
377 | |
378 | Pretend you never saw this method. Really it needs to not be hardcoded. |
379 | |
380 | =cut |
381 | |
b49c4318 |
382 | sub cmp_str { |
383 | my( $node ) = @_; |
384 | my $word = $node->label(); |
385 | $word = lc( $word ); |
386 | $word =~ s/\W//g; |
387 | $word =~ s/v/u/g; |
388 | $word =~ s/j/i/g; |
389 | $word =~ s/cha/ca/g; |
390 | $word =~ s/quatuor/quattuor/g; |
391 | $word =~ s/ioannes/iohannes/g; |
392 | return $word; |
393 | } |
394 | |
2ceca8c3 |
395 | =back |
396 | |
397 | =head1 LICENSE |
398 | |
399 | This package is free software and is provided "as is" without express |
400 | or implied warranty. You can redistribute it and/or modify it under |
401 | the same terms as Perl itself. |
402 | |
403 | =head1 AUTHOR |
404 | |
405 | Tara L Andrews, aurum@cpan.org |
406 | |
407 | =cut |
408 | |
b49c4318 |
409 | 1; |