...and fix the namespace in the tests
[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / BaseText.pm
CommitLineData
e58153d6 1package Text::Tradition::Parser::BaseText;
b49c4318 2
3use strict;
4use warnings;
5use Exporter 'import';
6use vars qw( @EXPORT_OK );
7@EXPORT_OK = qw( merge_base );
8
9sub merge_base {
10 my( $graph, $base_file, @app_entries ) = @_;
11 my @base_line_starts = read_base( $base_file, $graph );
12
13 foreach my $app ( @app_entries ) {
14 my( $line, $num ) = split( /\./, $app->{_id} );
15 # DEBUG with a short graph
16 # last if $line > 2;
17 my $scrutinize = "21.8";
18 my $first_line_node = $base_line_starts[ $line ];
19 my $too_far = $base_line_starts[ $line+1 ];
20
21 my $lemma = $app->{rdg_0};
22 my $seq = 1;
23 # Is this the Nth occurrence of this reading in the line?
24 if( $lemma =~ s/(_)?(\d)$// ) {
25 $seq = $2;
26 }
27 my @lemma_words = split( /\s+/, $lemma );
28
29 # Now search for the lemma words within this line.
30 my $lemma_start = $first_line_node;
31 my $lemma_end;
32 my %seen;
33 while( $lemma_start ne $too_far ) {
34 # Loop detection
35 if( $seen{ $lemma_start->name() } ) {
36 warn "Detected loop at " . $lemma_start->name() .
37 ", ref $line,$num";
38 last;
39 }
40 $seen{ $lemma_start->name() } = 1;
41
42 # Try to match the lemma.
43 my $unmatch = 0;
44 print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
45 $lemma_words[0] . "...\n"
46 if "$line.$num" eq $scrutinize;
47 if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
48 # Skip it if we need a match that is not the first.
49 if( --$seq < 1 ) {
50 # Now we have to compare the rest of the words here.
51 if( scalar( @lemma_words ) > 1 ) {
52 my $next_node = $graph->next_word( $lemma_start );
53 foreach my $w ( @lemma_words[1..$#lemma_words] ) {
54 printf STDERR "Now matching %s against %s\n",
55 cmp_str($next_node), $w
56 if "$line.$num" eq $scrutinize;
57 if( $w ne cmp_str($next_node) ) {
58 $unmatch = 1;
59 last;
60 } else {
61 $lemma_end = $next_node;
62 $next_node = $graph->next_word( $lemma_end );
63 }
64 }
65 } else {
66 $lemma_end = $lemma_start;
67 }
68 } else {
69 $unmatch = 1;
70 }
71 }
72 last unless ( $unmatch || !defined( $lemma_end ) );
73 $lemma_end = undef;
74 $lemma_start = $graph->next_word( $lemma_start );
75 }
76
77 unless( $lemma_end ) {
78 warn "No match found for @lemma_words at $line.$num";
79 next;
80 } else {
81 # These are no longer common nodes; unmark them as such.
82 my @lemma_nodes = $graph->node_sequence( $lemma_start,
83 $lemma_end );
84 map { $_->set_attribute( 'class', 'lemma' ) } @lemma_nodes;
85 }
86
87 # Now we have our lemma nodes; we add the variant nodes to the graph.
88
89 # For each reading that is not rdg_0, we make a chain of nodes
90 # and connect them to the anchor. Edges are named after the mss
91 # that are relevant.
92 foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
93 next if $k eq 'rdg_0'; # that's the lemma.
94 my @variant = split( /\s+/, $app->{$k} );
95 @variant = () if $app->{$k} eq '/'; # This is an omission.
96 my @mss = grep { $app->{$_} eq $k } keys( %$app );
97
98 unless( @mss ) {
99 print STDERR "Skipping '@variant' at $line.$num: no mss\n";
100 next;
101 }
102
103 # Determine the label name for the edges here.
104 my $edge_name = join(', ', @mss );
105
106 # Make the variant into a set of nodes.
107 my $ctr = 0;
108 my $last_node = $graph->prior_word( $lemma_start );
109 my $var_start;
110 foreach my $vw ( @variant ) {
111 my $vwname = "$k/$line.$num.$ctr"; $ctr++;
112 my $vwnode = $graph->add_node( $vwname );
113 $vwnode->set_attribute( 'label', $vw );
114 $vwnode->set_attribute( 'class', 'variant' );
115 $graph->add_edge( $last_node, $vwnode, $edge_name );
116 $var_start = $vwnode unless $var_start;
117 $last_node = $vwnode;
118 }
119 # Now hook it up at the end.
120 $graph->add_edge( $last_node, $graph->next_word( $lemma_end ),
121 $edge_name );
122
123 # Now collate and collapse the identical nodes within the graph.
124 collate_variant( $graph, $lemma_start, $lemma_end,
125 $var_start, $last_node );
126
127 }
128 }
129
130 ## Now in theory I have a graph. I want to make it a little easier to
131 ## read. So I collapse nodes that have only one edge in and one edge
132 ## out, and I do this by looking at the edges.
133
134 foreach my $edge ( $graph->edges() ) {
135 my @out_edges = $edge->from()->outgoing();
136 my @in_edges = $edge->to()->incoming();
137
138 next unless scalar( @out_edges ) == 1;
139 next unless scalar( @in_edges ) == 1;
140 next unless $out_edges[0] eq $in_edges[0];
141 # In theory if we've got this far, we're safe, but just to
142 # double-check...
143 next unless $out_edges[0] eq $edge;
144
145 $graph->merge_nodes( $edge->from(), $edge->to(), ' ' );
146 }
147}
148
149# read_base: Takes a text file and a (presumed empty) graph object,
150# adds the words as simple linear nodes to the graph, and returns a
151# list of nodes that represent the beginning of lines. This graph is
152# now the starting point for application of apparatus entries in
153# merge_base, e.g. from a CSV file or a CTE file.
154
155sub read_base {
156 my( $base_file, $graph ) = @_;
157
158 # This array gives the first node for each line. We put the
159 # common starting point in line zero.
160 my $last_node = $graph->start();
161 my $lineref_array = [ $last_node ]; # There is no line zero.
162
163 open( BASE, $base_file ) or die "Could not open file $base_file: $!";
164 while(<BASE>) {
165 # Make the nodes, and connect them up for the base, but also
166 # save the first node of each line in an array for the purpose.
167 chomp;
168 my @words = split;
169 my $started = 0;
170 my $wordref = 0;
171 my $lineref = scalar @$lineref_array;
172 foreach my $w ( @words ) {
173 my $noderef = join( ',', $lineref, ++$wordref );
174 my $node = $graph->add_node( $noderef );
175 $node->set_attribute( 'label', $w );
176 $node->set_attribute( 'class', 'common' );
177 unless( $started ) {
178 push( @$lineref_array, $node );
179 $started = 1;
180 }
181 if( $last_node ) {
182 $graph->add_edge( $last_node, $node, "base text" );
183 $last_node = $node;
184 } # TODO there should be no else here...
185 }
186 }
187 close BASE;
188 # Ending point for all texts
189 my $endpoint = $graph->add_node( '#END#' );
190 $graph->add_edge( $last_node, $endpoint, "base text" );
191 push( @$lineref_array, $endpoint );
192
193 return( @$lineref_array );
194}
195
196
197## Helper methods for merge_base
198
199sub collate_variant {
200 my( $graph, $lemma_start, $lemma_end, $var_start, $var_end ) = @_;
201 # If var_start is undef, then the variant is an omission and
202 # there's nothing to collate. Return.
203 return unless $var_start;
204
205 # I want to look at the nodes in the variant and lemma, and
206 # collapse nodes that are the same word. This is mini-collation.
207 my %collapsed = ();
208 # There will only be one outgoing edge at first, so this is safe.
209 my @out = $var_start->outgoing();
210 my $var_label = $out[0]->label();
211
212 my @lemma_nodes;
213 while( $lemma_start ne $lemma_end ) {
214 push( @lemma_nodes, $lemma_start );
215 $lemma_start = $graph->next_word( $lemma_start );
216 }
217 push( @lemma_nodes, $lemma_end );
218
219 my @variant_nodes;
220 while( $var_start ne $var_end ) {
221 push( @variant_nodes, $var_start );
222 $var_start = $graph->next_word( $var_start, $var_label );
223 }
224 push( @variant_nodes, $var_end );
225
226 # Go through the variant nodes, and if we find a lemma node that
227 # hasn't yet been collapsed with a node, equate them.
228
229 foreach my $w ( @variant_nodes ) {
230 my $word = $w->label();
231 foreach my $l ( @lemma_nodes ) {
232 if( $word eq cmp_str( $l ) ) {
233 next if exists( $collapsed{ $l->label } )
234 && $collapsed{ $l->label } eq $l;
235 # Collapse the nodes.
236 printf STDERR "Merging nodes %s/%s and %s/%s\n",
237 $l->name, $l->label, $w->name, $w->label;
238 $graph->merge_nodes( $l, $w );
239 $collapsed{ $l->label } = $l;
240 # Now collapse any multiple edges to and from the node.
241 # Rely on the presence of the 'base text' edge.
242 remove_duplicate_edges( $graph, $graph->prior_word( $l ), $l );
243 remove_duplicate_edges( $graph, $l, $graph->next_word( $l ) );
244 }
245 }
246 }
247}
248
249sub remove_duplicate_edges {
250 my( $graph, $from, $to ) = @_;
251 my @edges = $from->edges_to( $to );
252 if( scalar @edges > 1 ) {
253 my @base = grep { $_->label eq 'base text' } @edges;
254 if ( scalar @base ) {
255 # Remove the edges that are not base.
256 foreach my $e ( @edges ) {
257 $graph->del_edge( $e )
258 unless $e eq $base[0];
259 }
260 } else {
261 # Combine the edges into one.
262 my $new_edge_name = join( ', ', map { $_->label() } @edges );
263 my $new_edge = shift @edges;
264 $new_edge->set_attribute( 'label', $new_edge_name );
265 foreach my $e ( @edges ) {
266 $graph->del_edge( $e );
267 }
268 }
269 }
270}
271
272# TODO need to make this configurable!
273sub cmp_str {
274 my( $node ) = @_;
275 my $word = $node->label();
276 $word = lc( $word );
277 $word =~ s/\W//g;
278 $word =~ s/v/u/g;
279 $word =~ s/j/i/g;
280 $word =~ s/cha/ca/g;
281 $word =~ s/quatuor/quattuor/g;
282 $word =~ s/ioannes/iohannes/g;
283 return $word;
284}
285
2861;