1 package Text::Tradition::Parser::BaseText;
6 use vars qw( @EXPORT_OK );
7 @EXPORT_OK = qw( merge_base );
11 Text::Tradition::Parser::BaseText
15 use Text::Tradition::Parser::BaseText qw( merge_base );
16 merge_base( $graph, 'reference.txt', @apparatus_entries )
20 For an overview of the package, see the documentation for the
21 Text::Tradition::Graph module.
23 This module is meant for use with certain of the other Parser classes
24 - whenever a list of variants is given with reference to a base text,
25 these must be joined into a single collation. The parser should
26 therefore make a list of variants and their locations, and BaseText
27 will join those listed variants onto the reference text.
35 merge_base( $graph, 'reference.txt', @apparatus_entries )
37 Takes three arguments: a newly-initialized Text::Tradition::Graph
38 object, a text file containing the reference text, and a list of
39 variants (apparatus entries). Adds the base text to the graph, and
40 joins the variants to that.
42 The list of variants is an array of hash references; each hash takes
44 { '_id' => line reference,
45 'rdg_0' => lemma reading,
46 'rdg_1' => first variant,
47 ... # and so on until all distinct readings are listed
48 'WitnessA' => 'rdg_0',
49 'WitnessB' => 'rdg_1',
50 ... # and so on until all witnesses are listed with their readings
53 Any hash key that is not of the form /^rdg_\d+$/ and that does not
54 begin with an underscore is assumed to be a witness name. Any 'meta'
55 information to be passed must be passed in a key with a leading
56 underscore in its name.
61 my( $graph, $base_file, @app_entries ) = @_;
62 my @base_line_starts = read_base( $base_file, $graph );
64 foreach my $app ( @app_entries ) {
65 my( $line, $num ) = split( /\./, $app->{_id} );
66 # DEBUG with a short graph
68 # DEBUG for problematic entries
70 my $first_line_node = $base_line_starts[ $line ];
71 my $too_far = $base_line_starts[ $line+1 ];
73 my $lemma = $app->{rdg_0};
75 # Is this the Nth occurrence of this reading in the line?
76 if( $lemma =~ s/(_)?(\d)$// ) {
79 my @lemma_words = split( /\s+/, $lemma );
81 # Now search for the lemma words within this line.
82 my $lemma_start = $first_line_node;
85 while( $lemma_start ne $too_far ) {
87 if( $seen{ $lemma_start->name() } ) {
88 warn "Detected loop at " . $lemma_start->name() .
92 $seen{ $lemma_start->name() } = 1;
94 # Try to match the lemma.
96 print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
97 $lemma_words[0] . "...\n"
98 if "$line.$num" eq $scrutinize;
99 if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
100 # Skip it if we need a match that is not the first.
102 # Now we have to compare the rest of the words here.
103 if( scalar( @lemma_words ) > 1 ) {
104 my $next_node = $graph->next_word( $lemma_start );
105 foreach my $w ( @lemma_words[1..$#lemma_words] ) {
106 printf STDERR "Now matching %s against %s\n",
107 cmp_str($next_node), $w
108 if "$line.$num" eq $scrutinize;
109 if( $w ne cmp_str($next_node) ) {
113 $lemma_end = $next_node;
114 $next_node = $graph->next_word( $lemma_end );
118 $lemma_end = $lemma_start;
124 last unless ( $unmatch || !defined( $lemma_end ) );
126 $lemma_start = $graph->next_word( $lemma_start );
129 unless( $lemma_end ) {
130 warn "No match found for @lemma_words at $line.$num";
133 # These are no longer common nodes; unmark them as such.
134 my @lemma_nodes = $graph->node_sequence( $lemma_start,
136 map { $_->set_attribute( 'class', 'lemma' ) } @lemma_nodes;
139 # Now we have our lemma nodes; we add the variant nodes to the graph.
141 # Keep track of the start and end point of each reading for later
143 my @readings = ( $lemma_start, $lemma_end );
145 # For each reading that is not rdg_0, we make a chain of nodes
146 # and connect them to the anchor. Edges are named after the mss
148 foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
149 next if $k eq 'rdg_0'; # that's the lemma.
150 my @variant = split( /\s+/, $app->{$k} );
151 @variant = () if $app->{$k} eq '/'; # This is an omission.
152 my @mss = grep { $app->{$_} eq $k } keys( %$app );
155 print STDERR "Skipping '@variant' at $line.$num: no mss\n";
159 # Determine the label name for the edges here.
160 my $edge_name = join(', ', @mss );
162 # Make the variant into a set of nodes.
164 my $last_node = $graph->prior_word( $lemma_start );
166 foreach my $vw ( @variant ) {
167 my $vwname = "$k/$line.$num.$ctr"; $ctr++;
168 my $vwnode = $graph->add_node( $vwname );
169 $vwnode->set_attribute( 'label', $vw );
170 $vwnode->set_attribute( 'class', 'variant' );
171 $graph->add_edge( $last_node, $vwnode, $edge_name );
172 $var_start = $vwnode unless $var_start;
173 $last_node = $vwnode;
175 # Now hook it up at the end.
176 $graph->add_edge( $last_node, $graph->next_word( $lemma_end ),
179 if( $var_start ) { # if it wasn't an empty reading
180 push( @readings, $var_start, $last_node );
184 # Now collate and collapse the identical nodes within the graph.
185 collate_variants( $graph, @readings );
188 ## Now in theory I have a graph. I want to make it a little easier to
189 ## read. So I collapse nodes that have only one edge in and one edge
190 ## out, and I do this by looking at the edges.
192 foreach my $edge ( $graph->edges() ) {
193 my @out_edges = $edge->from()->outgoing();
194 my @in_edges = $edge->to()->incoming();
196 next unless scalar( @out_edges ) == 1;
197 next unless scalar( @in_edges ) == 1;
198 next unless $out_edges[0] eq $in_edges[0];
199 # In theory if we've got this far, we're safe, but just to
201 next unless $out_edges[0] eq $edge;
203 $graph->merge_nodes( $edge->from(), $edge->to(), ' ' );
209 my @line_beginnings = read_base( 'reference.txt', $graph );
211 Takes a text file and a (presumed empty) graph object, adds the words
212 as simple linear nodes to the graph, and returns a list of nodes that
213 represent the beginning of lines. This graph is now the starting point
214 for application of apparatus entries in merge_base, e.g. from a CSV
215 file or a Classical Text Editor file.
220 my( $base_file, $graph ) = @_;
222 # This array gives the first node for each line. We put the
223 # common starting point in line zero.
224 my $last_node = $graph->start();
225 my $lineref_array = [ $last_node ]; # There is no line zero.
227 open( BASE, $base_file ) or die "Could not open file $base_file: $!";
229 # Make the nodes, and connect them up for the base, but also
230 # save the first node of each line in an array for the purpose.
235 my $lineref = scalar @$lineref_array;
236 foreach my $w ( @words ) {
237 my $noderef = join( ',', $lineref, ++$wordref );
238 my $node = $graph->add_node( $noderef );
239 $node->set_attribute( 'label', $w );
240 $node->set_attribute( 'class', 'common' );
242 push( @$lineref_array, $node );
246 my $edge = $graph->add_edge( $last_node, $node, "base text" );
247 $edge->set_attribute( 'class', 'basetext' );
249 } # TODO there should be no else here...
253 # Ending point for all texts
254 my $endpoint = $graph->add_node( '#END#' );
255 $graph->add_edge( $last_node, $endpoint, "base text" );
256 push( @$lineref_array, $endpoint );
258 return( @$lineref_array );
261 =item B<collate_variants>
263 collate_variants( $graph, @readings )
265 Given a set of readings in the form
266 ( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
267 walks through each to identify those nodes that are identical. The
268 graph is a Text::Tradition::Graph object; the elements of @readings are
269 Graph::Easy::Node objects that appear on the graph.
271 TODO: Handle collapsed and non-collapsed transpositions.
275 sub collate_variants {
276 my( $graph, @readings ) = @_;
277 my $lemma_start = shift @readings;
278 my $lemma_end = shift @readings;
281 # Start the list of distinct nodes with those nodes in the lemma.
283 while( $lemma_start ne $lemma_end ) {
284 push( @distinct_nodes, [ $lemma_start, 'base text' ] );
285 $lemma_start = $graph->next_word( $lemma_start );
287 push( @distinct_nodes, [ $lemma_end, 'base text' ] );
290 while( scalar @readings ) {
291 my( $var_start, $var_end ) = splice( @readings, 0, 2 );
293 # I want to look at the nodes in the variant and lemma, and
294 # collapse nodes that are the same word. This is mini-collation.
295 # Each word in the 'main' list can only be collapsed once with a
296 # word from the current reading.
299 # Get the label. There will only be one outgoing edge to start
300 # with, so this is safe.
301 my @out = $var_start->outgoing();
302 my $var_label = $out[0]->label();
305 while( $var_start ne $var_end ) {
306 push( @variant_nodes, $var_start );
307 $var_start = $graph->next_word( $var_start, $var_label );
309 push( @variant_nodes, $var_end );
311 # Go through the variant nodes, and if we find a lemma node that
312 # hasn't yet been collapsed with a node, equate them. If we do
313 # not, keep them to push onto the end of all_nodes.
316 foreach my $w ( @variant_nodes ) {
317 my $word = $w->label();
319 foreach my $idx ( $last_index .. $#distinct_nodes ) {
320 my( $l, $edgelabel ) = @{$distinct_nodes[$idx]};
321 if( $word eq cmp_str( $l ) ) {
322 next if exists( $collapsed{ $l->label } )
323 && $collapsed{ $l->label } eq $l;
325 $last_index = $idx if $detranspose;
326 # Collapse the nodes.
327 printf STDERR "Merging nodes %s/%s and %s/%s\n",
328 $l->name, $l->label, $w->name, $w->label;
329 $graph->merge_nodes( $l, $w );
330 $collapsed{ $l->label } = $l;
331 # Now collapse any multiple edges to and from the node.
332 remove_duplicate_edges( $graph,
333 $graph->prior_word( $l, $edgelabel ), $l );
334 remove_duplicate_edges( $graph, $l,
335 $graph->next_word( $l, $edgelabel ) );
338 push( @remaining_nodes, [ $w, $var_label ] ) unless $matched;
340 push( @distinct_nodes, @remaining_nodes) if scalar( @remaining_nodes );
344 =item B<remove_duplicate_edges>
346 remove_duplicate_edges( $graph, $from, $to );
348 Given two nodes, reduce the number of edges between those nodes to
349 one. If neither edge represents a base text, combine their labels.
353 sub remove_duplicate_edges {
354 my( $graph, $from, $to ) = @_;
355 my @edges = $from->edges_to( $to );
356 if( scalar @edges > 1 ) {
357 my @base = grep { $_->label eq 'base text' } @edges;
358 if ( scalar @base ) {
359 # Remove the edges that are not base.
360 foreach my $e ( @edges ) {
361 $graph->del_edge( $e )
362 unless $e eq $base[0];
365 # Combine the edges into one.
366 my $new_edge_name = join( ', ', map { $_->label() } @edges );
367 my $new_edge = shift @edges;
368 $new_edge->set_attribute( 'label', $new_edge_name );
369 foreach my $e ( @edges ) {
370 $graph->del_edge( $e );
378 Pretend you never saw this method. Really it needs to not be hardcoded.
384 my $word = $node->label();
390 $word =~ s/quatuor/quattuor/g;
391 $word =~ s/ioannes/iohannes/g;
399 This package is free software and is provided "as is" without express
400 or implied warranty. You can redistribute it and/or modify it under
401 the same terms as Perl itself.
405 Tara L Andrews, aurum@cpan.org