Commit | Line | Data |
e58153d6 |
1 | package Text::Tradition::Parser::BaseText; |
b49c4318 |
2 | |
3 | use strict; |
4 | use warnings; |
5 | use Exporter 'import'; |
6 | use vars qw( @EXPORT_OK ); |
7 | @EXPORT_OK = qw( merge_base ); |
8 | |
9 | sub merge_base { |
10 | my( $graph, $base_file, @app_entries ) = @_; |
11 | my @base_line_starts = read_base( $base_file, $graph ); |
12 | |
13 | foreach my $app ( @app_entries ) { |
14 | my( $line, $num ) = split( /\./, $app->{_id} ); |
15 | # DEBUG with a short graph |
16 | # last if $line > 2; |
17 | my $scrutinize = "21.8"; |
18 | my $first_line_node = $base_line_starts[ $line ]; |
19 | my $too_far = $base_line_starts[ $line+1 ]; |
20 | |
21 | my $lemma = $app->{rdg_0}; |
22 | my $seq = 1; |
23 | # Is this the Nth occurrence of this reading in the line? |
24 | if( $lemma =~ s/(_)?(\d)$// ) { |
25 | $seq = $2; |
26 | } |
27 | my @lemma_words = split( /\s+/, $lemma ); |
28 | |
29 | # Now search for the lemma words within this line. |
30 | my $lemma_start = $first_line_node; |
31 | my $lemma_end; |
32 | my %seen; |
33 | while( $lemma_start ne $too_far ) { |
34 | # Loop detection |
35 | if( $seen{ $lemma_start->name() } ) { |
36 | warn "Detected loop at " . $lemma_start->name() . |
37 | ", ref $line,$num"; |
38 | last; |
39 | } |
40 | $seen{ $lemma_start->name() } = 1; |
41 | |
42 | # Try to match the lemma. |
43 | my $unmatch = 0; |
44 | print STDERR "Matching " . cmp_str( $lemma_start) . " against " . |
45 | $lemma_words[0] . "...\n" |
46 | if "$line.$num" eq $scrutinize; |
47 | if( cmp_str( $lemma_start ) eq $lemma_words[0] ) { |
48 | # Skip it if we need a match that is not the first. |
49 | if( --$seq < 1 ) { |
50 | # Now we have to compare the rest of the words here. |
51 | if( scalar( @lemma_words ) > 1 ) { |
52 | my $next_node = $graph->next_word( $lemma_start ); |
53 | foreach my $w ( @lemma_words[1..$#lemma_words] ) { |
54 | printf STDERR "Now matching %s against %s\n", |
55 | cmp_str($next_node), $w |
56 | if "$line.$num" eq $scrutinize; |
57 | if( $w ne cmp_str($next_node) ) { |
58 | $unmatch = 1; |
59 | last; |
60 | } else { |
61 | $lemma_end = $next_node; |
62 | $next_node = $graph->next_word( $lemma_end ); |
63 | } |
64 | } |
65 | } else { |
66 | $lemma_end = $lemma_start; |
67 | } |
68 | } else { |
69 | $unmatch = 1; |
70 | } |
71 | } |
72 | last unless ( $unmatch || !defined( $lemma_end ) ); |
73 | $lemma_end = undef; |
74 | $lemma_start = $graph->next_word( $lemma_start ); |
75 | } |
76 | |
77 | unless( $lemma_end ) { |
78 | warn "No match found for @lemma_words at $line.$num"; |
79 | next; |
80 | } else { |
81 | # These are no longer common nodes; unmark them as such. |
82 | my @lemma_nodes = $graph->node_sequence( $lemma_start, |
83 | $lemma_end ); |
84 | map { $_->set_attribute( 'class', 'lemma' ) } @lemma_nodes; |
85 | } |
86 | |
87 | # Now we have our lemma nodes; we add the variant nodes to the graph. |
88 | |
89 | # For each reading that is not rdg_0, we make a chain of nodes |
90 | # and connect them to the anchor. Edges are named after the mss |
91 | # that are relevant. |
92 | foreach my $k ( grep { /^rdg/ } keys( %$app ) ) { |
93 | next if $k eq 'rdg_0'; # that's the lemma. |
94 | my @variant = split( /\s+/, $app->{$k} ); |
95 | @variant = () if $app->{$k} eq '/'; # This is an omission. |
96 | my @mss = grep { $app->{$_} eq $k } keys( %$app ); |
97 | |
98 | unless( @mss ) { |
99 | print STDERR "Skipping '@variant' at $line.$num: no mss\n"; |
100 | next; |
101 | } |
102 | |
103 | # Determine the label name for the edges here. |
104 | my $edge_name = join(', ', @mss ); |
105 | |
106 | # Make the variant into a set of nodes. |
107 | my $ctr = 0; |
108 | my $last_node = $graph->prior_word( $lemma_start ); |
109 | my $var_start; |
110 | foreach my $vw ( @variant ) { |
111 | my $vwname = "$k/$line.$num.$ctr"; $ctr++; |
112 | my $vwnode = $graph->add_node( $vwname ); |
113 | $vwnode->set_attribute( 'label', $vw ); |
114 | $vwnode->set_attribute( 'class', 'variant' ); |
115 | $graph->add_edge( $last_node, $vwnode, $edge_name ); |
116 | $var_start = $vwnode unless $var_start; |
117 | $last_node = $vwnode; |
118 | } |
119 | # Now hook it up at the end. |
120 | $graph->add_edge( $last_node, $graph->next_word( $lemma_end ), |
121 | $edge_name ); |
122 | |
123 | # Now collate and collapse the identical nodes within the graph. |
124 | collate_variant( $graph, $lemma_start, $lemma_end, |
125 | $var_start, $last_node ); |
126 | |
127 | } |
128 | } |
129 | |
130 | ## Now in theory I have a graph. I want to make it a little easier to |
131 | ## read. So I collapse nodes that have only one edge in and one edge |
132 | ## out, and I do this by looking at the edges. |
133 | |
134 | foreach my $edge ( $graph->edges() ) { |
135 | my @out_edges = $edge->from()->outgoing(); |
136 | my @in_edges = $edge->to()->incoming(); |
137 | |
138 | next unless scalar( @out_edges ) == 1; |
139 | next unless scalar( @in_edges ) == 1; |
140 | next unless $out_edges[0] eq $in_edges[0]; |
141 | # In theory if we've got this far, we're safe, but just to |
142 | # double-check... |
143 | next unless $out_edges[0] eq $edge; |
144 | |
145 | $graph->merge_nodes( $edge->from(), $edge->to(), ' ' ); |
146 | } |
147 | } |
148 | |
149 | # read_base: Takes a text file and a (presumed empty) graph object, |
150 | # adds the words as simple linear nodes to the graph, and returns a |
151 | # list of nodes that represent the beginning of lines. This graph is |
152 | # now the starting point for application of apparatus entries in |
153 | # merge_base, e.g. from a CSV file or a CTE file. |
154 | |
155 | sub read_base { |
156 | my( $base_file, $graph ) = @_; |
157 | |
158 | # This array gives the first node for each line. We put the |
159 | # common starting point in line zero. |
160 | my $last_node = $graph->start(); |
161 | my $lineref_array = [ $last_node ]; # There is no line zero. |
162 | |
163 | open( BASE, $base_file ) or die "Could not open file $base_file: $!"; |
164 | while(<BASE>) { |
165 | # Make the nodes, and connect them up for the base, but also |
166 | # save the first node of each line in an array for the purpose. |
167 | chomp; |
168 | my @words = split; |
169 | my $started = 0; |
170 | my $wordref = 0; |
171 | my $lineref = scalar @$lineref_array; |
172 | foreach my $w ( @words ) { |
173 | my $noderef = join( ',', $lineref, ++$wordref ); |
174 | my $node = $graph->add_node( $noderef ); |
175 | $node->set_attribute( 'label', $w ); |
176 | $node->set_attribute( 'class', 'common' ); |
177 | unless( $started ) { |
178 | push( @$lineref_array, $node ); |
179 | $started = 1; |
180 | } |
181 | if( $last_node ) { |
182 | $graph->add_edge( $last_node, $node, "base text" ); |
183 | $last_node = $node; |
184 | } # TODO there should be no else here... |
185 | } |
186 | } |
187 | close BASE; |
188 | # Ending point for all texts |
189 | my $endpoint = $graph->add_node( '#END#' ); |
190 | $graph->add_edge( $last_node, $endpoint, "base text" ); |
191 | push( @$lineref_array, $endpoint ); |
192 | |
193 | return( @$lineref_array ); |
194 | } |
195 | |
196 | |
197 | ## Helper methods for merge_base |
198 | |
199 | sub collate_variant { |
200 | my( $graph, $lemma_start, $lemma_end, $var_start, $var_end ) = @_; |
201 | # If var_start is undef, then the variant is an omission and |
202 | # there's nothing to collate. Return. |
203 | return unless $var_start; |
204 | |
205 | # I want to look at the nodes in the variant and lemma, and |
206 | # collapse nodes that are the same word. This is mini-collation. |
207 | my %collapsed = (); |
208 | # There will only be one outgoing edge at first, so this is safe. |
209 | my @out = $var_start->outgoing(); |
210 | my $var_label = $out[0]->label(); |
211 | |
212 | my @lemma_nodes; |
213 | while( $lemma_start ne $lemma_end ) { |
214 | push( @lemma_nodes, $lemma_start ); |
215 | $lemma_start = $graph->next_word( $lemma_start ); |
216 | } |
217 | push( @lemma_nodes, $lemma_end ); |
218 | |
219 | my @variant_nodes; |
220 | while( $var_start ne $var_end ) { |
221 | push( @variant_nodes, $var_start ); |
222 | $var_start = $graph->next_word( $var_start, $var_label ); |
223 | } |
224 | push( @variant_nodes, $var_end ); |
225 | |
226 | # Go through the variant nodes, and if we find a lemma node that |
227 | # hasn't yet been collapsed with a node, equate them. |
228 | |
229 | foreach my $w ( @variant_nodes ) { |
230 | my $word = $w->label(); |
231 | foreach my $l ( @lemma_nodes ) { |
232 | if( $word eq cmp_str( $l ) ) { |
233 | next if exists( $collapsed{ $l->label } ) |
234 | && $collapsed{ $l->label } eq $l; |
235 | # Collapse the nodes. |
236 | printf STDERR "Merging nodes %s/%s and %s/%s\n", |
237 | $l->name, $l->label, $w->name, $w->label; |
238 | $graph->merge_nodes( $l, $w ); |
239 | $collapsed{ $l->label } = $l; |
240 | # Now collapse any multiple edges to and from the node. |
241 | # Rely on the presence of the 'base text' edge. |
242 | remove_duplicate_edges( $graph, $graph->prior_word( $l ), $l ); |
243 | remove_duplicate_edges( $graph, $l, $graph->next_word( $l ) ); |
244 | } |
245 | } |
246 | } |
247 | } |
248 | |
249 | sub remove_duplicate_edges { |
250 | my( $graph, $from, $to ) = @_; |
251 | my @edges = $from->edges_to( $to ); |
252 | if( scalar @edges > 1 ) { |
253 | my @base = grep { $_->label eq 'base text' } @edges; |
254 | if ( scalar @base ) { |
255 | # Remove the edges that are not base. |
256 | foreach my $e ( @edges ) { |
257 | $graph->del_edge( $e ) |
258 | unless $e eq $base[0]; |
259 | } |
260 | } else { |
261 | # Combine the edges into one. |
262 | my $new_edge_name = join( ', ', map { $_->label() } @edges ); |
263 | my $new_edge = shift @edges; |
264 | $new_edge->set_attribute( 'label', $new_edge_name ); |
265 | foreach my $e ( @edges ) { |
266 | $graph->del_edge( $e ); |
267 | } |
268 | } |
269 | } |
270 | } |
271 | |
272 | # TODO need to make this configurable! |
273 | sub cmp_str { |
274 | my( $node ) = @_; |
275 | my $word = $node->label(); |
276 | $word = lc( $word ); |
277 | $word =~ s/\W//g; |
278 | $word =~ s/v/u/g; |
279 | $word =~ s/j/i/g; |
280 | $word =~ s/cha/ca/g; |
281 | $word =~ s/quatuor/quattuor/g; |
282 | $word =~ s/ioannes/iohannes/g; |
283 | return $word; |
284 | } |
285 | |
286 | 1; |