1 package Text::Tradition::Parser::BaseText;
6 use vars qw( @EXPORT_OK );
7 @EXPORT_OK = qw( merge_base );
11 Text::Tradition::Parser::BaseText
15 use Text::Tradition::Parser::BaseText qw( merge_base );
16 merge_base( $graph, 'reference.txt', @apparatus_entries )
20 For an overview of the package, see the documentation for the
21 Text::Tradition::Graph module.
23 This module is meant for use with certain of the other Parser classes
24 - whenever a list of variants is given with reference to a base text,
25 these must be joined into a single collation. The parser should
26 therefore make a list of variants and their locations, and BaseText
27 will join those listed variants onto the reference text.
35 merge_base( $graph, 'reference.txt', @apparatus_entries )
37 Takes three arguments: a newly-initialized Text::Tradition::Graph
38 object, a text file containing the reference text, and a list of
39 variants (apparatus entries). Adds the base text to the graph, and
40 joins the variants to that.
42 The list of variants is an array of hash references; each hash takes
44 { '_id' => line reference,
45 'rdg_0' => lemma reading,
46 'rdg_1' => first variant,
47 ... # and so on until all distinct readings are listed
48 'WitnessA' => 'rdg_0',
49 'WitnessB' => 'rdg_1',
50 ... # and so on until all witnesses are listed with their readings
53 Any hash key that is not of the form /^rdg_\d+$/ and that does not
54 begin with an underscore is assumed to be a witness name. Any 'meta'
55 information to be passed must be passed in a key with a leading
56 underscore in its name.
61 my( $graph, $base_file, @app_entries ) = @_;
62 my @base_line_starts = read_base( $base_file, $graph );
64 foreach my $app ( @app_entries ) {
65 my( $line, $num ) = split( /\./, $app->{_id} );
66 # DEBUG with a short graph
68 # DEBUG for problematic entries
69 # my $scrutinize = "21.8";
70 my $first_line_node = $base_line_starts[ $line ];
71 my $too_far = $base_line_starts[ $line+1 ];
73 my $lemma = $app->{rdg_0};
75 # Is this the Nth occurrence of this reading in the line?
76 if( $lemma =~ s/(_)?(\d)$// ) {
79 my @lemma_words = split( /\s+/, $lemma );
81 # Now search for the lemma words within this line.
82 my $lemma_start = $first_line_node;
85 while( $lemma_start ne $too_far ) {
87 if( $seen{ $lemma_start->name() } ) {
88 warn "Detected loop at " . $lemma_start->name() .
92 $seen{ $lemma_start->name() } = 1;
94 # Try to match the lemma.
96 print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
97 $lemma_words[0] . "...\n"
98 if "$line.$num" eq $scrutinize;
99 if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
100 # Skip it if we need a match that is not the first.
102 # Now we have to compare the rest of the words here.
103 if( scalar( @lemma_words ) > 1 ) {
104 my $next_node = $graph->next_word( $lemma_start );
105 foreach my $w ( @lemma_words[1..$#lemma_words] ) {
106 printf STDERR "Now matching %s against %s\n",
107 cmp_str($next_node), $w
108 if "$line.$num" eq $scrutinize;
109 if( $w ne cmp_str($next_node) ) {
113 $lemma_end = $next_node;
114 $next_node = $graph->next_word( $lemma_end );
118 $lemma_end = $lemma_start;
124 last unless ( $unmatch || !defined( $lemma_end ) );
126 $lemma_start = $graph->next_word( $lemma_start );
129 unless( $lemma_end ) {
130 warn "No match found for @lemma_words at $line.$num";
133 # These are no longer common nodes; unmark them as such.
134 my @lemma_nodes = $graph->node_sequence( $lemma_start,
136 map { $_->set_attribute( 'class', 'lemma' ) } @lemma_nodes;
139 # Now we have our lemma nodes; we add the variant nodes to the graph.
141 # For each reading that is not rdg_0, we make a chain of nodes
142 # and connect them to the anchor. Edges are named after the mss
144 foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
145 next if $k eq 'rdg_0'; # that's the lemma.
146 my @variant = split( /\s+/, $app->{$k} );
147 @variant = () if $app->{$k} eq '/'; # This is an omission.
148 my @mss = grep { $app->{$_} eq $k } keys( %$app );
151 print STDERR "Skipping '@variant' at $line.$num: no mss\n";
155 # Determine the label name for the edges here.
156 my $edge_name = join(', ', @mss );
158 # Make the variant into a set of nodes.
160 my $last_node = $graph->prior_word( $lemma_start );
162 foreach my $vw ( @variant ) {
163 my $vwname = "$k/$line.$num.$ctr"; $ctr++;
164 my $vwnode = $graph->add_node( $vwname );
165 $vwnode->set_attribute( 'label', $vw );
166 $vwnode->set_attribute( 'class', 'variant' );
167 $graph->add_edge( $last_node, $vwnode, $edge_name );
168 $var_start = $vwnode unless $var_start;
169 $last_node = $vwnode;
171 # Now hook it up at the end.
172 $graph->add_edge( $last_node, $graph->next_word( $lemma_end ),
175 # Now collate and collapse the identical nodes within the graph.
176 collate_variant( $graph, $lemma_start, $lemma_end,
177 $var_start, $last_node );
182 ## Now in theory I have a graph. I want to make it a little easier to
183 ## read. So I collapse nodes that have only one edge in and one edge
184 ## out, and I do this by looking at the edges.
186 foreach my $edge ( $graph->edges() ) {
187 my @out_edges = $edge->from()->outgoing();
188 my @in_edges = $edge->to()->incoming();
190 next unless scalar( @out_edges ) == 1;
191 next unless scalar( @in_edges ) == 1;
192 next unless $out_edges[0] eq $in_edges[0];
193 # In theory if we've got this far, we're safe, but just to
195 next unless $out_edges[0] eq $edge;
197 $graph->merge_nodes( $edge->from(), $edge->to(), ' ' );
203 my @line_beginnings = read_base( 'reference.txt', $graph );
205 Takes a text file and a (presumed empty) graph object, adds the words
206 as simple linear nodes to the graph, and returns a list of nodes that
207 represent the beginning of lines. This graph is now the starting point
208 for application of apparatus entries in merge_base, e.g. from a CSV
209 file or a Classical Text Editor file.
214 my( $base_file, $graph ) = @_;
216 # This array gives the first node for each line. We put the
217 # common starting point in line zero.
218 my $last_node = $graph->start();
219 my $lineref_array = [ $last_node ]; # There is no line zero.
221 open( BASE, $base_file ) or die "Could not open file $base_file: $!";
223 # Make the nodes, and connect them up for the base, but also
224 # save the first node of each line in an array for the purpose.
229 my $lineref = scalar @$lineref_array;
230 foreach my $w ( @words ) {
231 my $noderef = join( ',', $lineref, ++$wordref );
232 my $node = $graph->add_node( $noderef );
233 $node->set_attribute( 'label', $w );
234 $node->set_attribute( 'class', 'common' );
236 push( @$lineref_array, $node );
240 $graph->add_edge( $last_node, $node, "base text" );
242 } # TODO there should be no else here...
246 # Ending point for all texts
247 my $endpoint = $graph->add_node( '#END#' );
248 $graph->add_edge( $last_node, $endpoint, "base text" );
249 push( @$lineref_array, $endpoint );
251 return( @$lineref_array );
254 =item B<collate_variant>
256 collate_variant( $graph, $lemma_start, $lemma_end, $var_start, $var_end );
258 Given a lemma and a variant as start- and endpoints on the graph,
259 walks through each to identify those nodes that are identical. The
260 graph is a Text::Tradition::Graph object; the other arguments are
261 Graph::Easy::Node objects that appear on the graph.
263 TODO: Handle collapsed and non-collapsed transpositions.
267 sub collate_variant {
268 my( $graph, $lemma_start, $lemma_end, $var_start, $var_end ) = @_;
269 # If var_start is undef, then the variant is an omission and
270 # there's nothing to collate. Return.
271 return unless $var_start;
273 # I want to look at the nodes in the variant and lemma, and
274 # collapse nodes that are the same word. This is mini-collation.
276 # There will only be one outgoing edge at first, so this is safe.
277 my @out = $var_start->outgoing();
278 my $var_label = $out[0]->label();
281 while( $lemma_start ne $lemma_end ) {
282 push( @lemma_nodes, $lemma_start );
283 $lemma_start = $graph->next_word( $lemma_start );
285 push( @lemma_nodes, $lemma_end );
288 while( $var_start ne $var_end ) {
289 push( @variant_nodes, $var_start );
290 $var_start = $graph->next_word( $var_start, $var_label );
292 push( @variant_nodes, $var_end );
294 # Go through the variant nodes, and if we find a lemma node that
295 # hasn't yet been collapsed with a node, equate them.
297 foreach my $w ( @variant_nodes ) {
298 my $word = $w->label();
299 foreach my $l ( @lemma_nodes ) {
300 if( $word eq cmp_str( $l ) ) {
301 next if exists( $collapsed{ $l->label } )
302 && $collapsed{ $l->label } eq $l;
303 # Collapse the nodes.
304 printf STDERR "Merging nodes %s/%s and %s/%s\n",
305 $l->name, $l->label, $w->name, $w->label;
306 $graph->merge_nodes( $l, $w );
307 $collapsed{ $l->label } = $l;
308 # Now collapse any multiple edges to and from the node.
309 # Rely on the presence of the 'base text' edge.
310 remove_duplicate_edges( $graph, $graph->prior_word( $l ), $l );
311 remove_duplicate_edges( $graph, $l, $graph->next_word( $l ) );
317 =item B<remove_duplicate_edges>
319 remove_duplicate_edges( $graph, $from, $to );
321 Given two nodes, reduce the number of edges between those nodes to
322 one. If neither edge represents a base text, combine their labels.
326 sub remove_duplicate_edges {
327 my( $graph, $from, $to ) = @_;
328 my @edges = $from->edges_to( $to );
329 if( scalar @edges > 1 ) {
330 my @base = grep { $_->label eq 'base text' } @edges;
331 if ( scalar @base ) {
332 # Remove the edges that are not base.
333 foreach my $e ( @edges ) {
334 $graph->del_edge( $e )
335 unless $e eq $base[0];
338 # Combine the edges into one.
339 my $new_edge_name = join( ', ', map { $_->label() } @edges );
340 my $new_edge = shift @edges;
341 $new_edge->set_attribute( 'label', $new_edge_name );
342 foreach my $e ( @edges ) {
343 $graph->del_edge( $e );
351 Pretend you never saw this method. Really it needs to not be hardcoded.
357 my $word = $node->label();
363 $word =~ s/quatuor/quattuor/g;
364 $word =~ s/ioannes/iohannes/g;
372 This package is free software and is provided "as is" without express
373 or implied warranty. You can redistribute it and/or modify it under
374 the same terms as Perl itself.
378 Tara L Andrews, aurum@cpan.org