1 package Text::Tradition::Parser::BaseText;
9 Text::Tradition::Parser::BaseText
13 use Text::Tradition::Parser::BaseText qw( merge_base );
14 merge_base( $graph, 'reference.txt', @apparatus_entries )
18 For an overview of the package, see the documentation for the
19 Text::Tradition::Graph module.
21 This module is meant for use with certain of the other Parser classes
22 - whenever a list of variants is given with reference to a base text,
23 these must be joined into a single collation. The parser should
24 therefore make a list of variants and their locations, and BaseText
25 will join those listed variants onto the reference text.
33 parse( $graph, %opts );
35 Takes an initialized graph and a set of options, which must include:
36 - 'base' - the base text referenced by the variants
37 - 'format' - the format of the variant list
38 - 'data' - the variants, in the given format.
43 my( $graph, %opts ) = @_;
45 my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'};
47 my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} );
48 merge_base( $graph, $opts{'base'}, @apparatus_entries );
53 merge_base( $graph, 'reference.txt', @apparatus_entries )
55 Takes three arguments: a newly-initialized Text::Tradition::Graph
56 object, a text file containing the reference text, and a list of
57 variants (apparatus entries). Adds the base text to the graph, and
58 joins the variants to that.
60 The list of variants is an array of hash references; each hash takes
62 { '_id' => line reference,
63 'rdg_0' => lemma reading,
64 'rdg_1' => first variant,
65 ... # and so on until all distinct readings are listed
66 'WitnessA' => 'rdg_0',
67 'WitnessB' => 'rdg_1',
68 ... # and so on until all witnesses are listed with their readings
71 Any hash key that is not of the form /^rdg_\d+$/ and that does not
72 begin with an underscore is assumed to be a witness name. Any 'meta'
73 information to be passed must be passed in a key with a leading
74 underscore in its name.
79 my( $graph, $base_file, @app_entries ) = @_;
80 my @base_line_starts = read_base( $base_file, $graph );
83 foreach my $app ( @app_entries ) {
84 my( $line, $num ) = split( /\./, $app->{_id} );
85 # DEBUG with a short graph
87 # DEBUG for problematic entries
89 my $first_line_node = $base_line_starts[ $line ];
90 my $too_far = $base_line_starts[ $line+1 ];
92 my $lemma = $app->{rdg_0};
94 # Is this the Nth occurrence of this reading in the line?
95 if( $lemma =~ s/(_)?(\d)$// ) {
98 my @lemma_words = split( /\s+/, $lemma );
100 # Now search for the lemma words within this line.
101 my $lemma_start = $first_line_node;
104 while( $lemma_start ne $too_far ) {
106 if( $seen{ $lemma_start->name() } ) {
107 warn "Detected loop at " . $lemma_start->name() .
111 $seen{ $lemma_start->name() } = 1;
113 # Try to match the lemma.
115 print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
116 $lemma_words[0] . "...\n"
117 if "$line.$num" eq $scrutinize;
118 if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
119 # Skip it if we need a match that is not the first.
121 # Now we have to compare the rest of the words here.
122 if( scalar( @lemma_words ) > 1 ) {
123 my $next_node = $graph->next_word( $lemma_start );
124 foreach my $w ( @lemma_words[1..$#lemma_words] ) {
125 printf STDERR "Now matching %s against %s\n",
126 cmp_str($next_node), $w
127 if "$line.$num" eq $scrutinize;
128 if( $w ne cmp_str($next_node) ) {
132 $lemma_end = $next_node;
133 $next_node = $graph->next_word( $lemma_end );
137 $lemma_end = $lemma_start;
143 last unless ( $unmatch || !defined( $lemma_end ) );
145 $lemma_start = $graph->next_word( $lemma_start );
148 unless( $lemma_end ) {
149 warn "No match found for @lemma_words at $line.$num";
152 # These are no longer common nodes; unmark them as such.
153 my @lemma_nodes = $graph->node_sequence( $lemma_start,
155 map { $_->set_attribute( 'class', 'lemma' ) } @lemma_nodes;
158 # Now we have our lemma nodes; we add the variant nodes to the graph.
160 # Keep track of the start and end point of each reading for later
162 my @readings = ( $lemma_start, $lemma_end );
164 # For each reading that is not rdg_0, we make a chain of nodes
165 # and connect them to the anchor. Edges are named after the mss
167 foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
168 next if $k eq 'rdg_0'; # that's the lemma.
169 my @variant = split( /\s+/, $app->{$k} );
170 @variant = () if $app->{$k} eq '/'; # This is an omission.
171 my @mss = grep { $app->{$_} eq $k } keys( %$app );
174 print STDERR "Skipping '@variant' at $line.$num: no mss\n";
178 # Determine the label name for the edges here.
179 my $edge_name = join(', ', @mss );
180 @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
182 # Make the variant into a set of nodes.
184 my $last_node = $graph->prior_word( $lemma_start );
186 foreach my $vw ( @variant ) {
187 my $vwname = "$k/$line.$num.$ctr"; $ctr++;
188 my $vwnode = $graph->add_node( $vwname );
189 $vwnode->set_attribute( 'label', $vw );
190 $vwnode->set_attribute( 'class', 'variant' );
191 $graph->add_edge( $last_node, $vwnode, $edge_name );
192 $var_start = $vwnode unless $var_start;
193 $last_node = $vwnode;
195 # Now hook it up at the end.
196 $graph->add_edge( $last_node, $graph->next_word( $lemma_end ),
199 if( $var_start ) { # if it wasn't an empty reading
200 push( @readings, $var_start, $last_node );
204 # Now collate and collapse the identical nodes within the graph.
205 collate_variants( $graph, @readings );
208 ## Now in theory I have a graph. I want to make it a little easier to
209 ## read. So I collapse nodes that have only one edge in and one edge
210 ## out, and I do this by looking at the edges.
212 # foreach my $edge ( $graph->edges() ) {
213 # my @out_edges = $edge->from()->outgoing();
214 # my @in_edges = $edge->to()->incoming();
216 # next if $edge->from() eq $graph->start();
217 # next if $edge->to()->name() eq '#END#';
218 # next unless scalar( @out_edges ) == 1;
219 # next unless scalar( @in_edges ) == 1;
220 # next unless $out_edges[0] eq $in_edges[0];
221 # # In theory if we've got this far, we're safe, but just to
223 # next unless $out_edges[0] eq $edge;
225 # $graph->merge_nodes( $edge->from(), $edge->to(), ' ' );
228 # Now walk the path for each witness, so that we can do the
229 # position calculations.
231 foreach my $w ( keys %all_witnesses ) {
233 if( $w =~ /^(.*)\s*\(p\.\s*c\.\)/ ) {
236 my @wit_nodes = $graph->node_sequence( $graph->start,
237 $graph->node( '#END#' ),
239 my @wn_names = map { $_->name() } @wit_nodes;
240 $paths->{$w} = \@wn_names;
243 my @common_nodes = grep { $graph->is_common( $_ ) } $graph->nodes();
244 $graph->make_positions( \@common_nodes, $paths );
249 my @line_beginnings = read_base( 'reference.txt', $graph );
251 Takes a text file and a (presumed empty) graph object, adds the words
252 as simple linear nodes to the graph, and returns a list of nodes that
253 represent the beginning of lines. This graph is now the starting point
254 for application of apparatus entries in merge_base, e.g. from a CSV
255 file or a Classical Text Editor file.
260 my( $base_file, $graph ) = @_;
262 # This array gives the first node for each line. We put the
263 # common starting point in line zero.
264 my $last_node = $graph->start();
265 my $lineref_array = [ $last_node ]; # There is no line zero.
267 open( BASE, $base_file ) or die "Could not open file $base_file: $!";
269 # Make the nodes, and connect them up for the base, but also
270 # save the first node of each line in an array for the purpose.
275 my $lineref = scalar @$lineref_array;
276 foreach my $w ( @words ) {
277 my $noderef = join( ',', $lineref, ++$wordref );
278 my $node = $graph->add_node( $noderef );
279 $node->set_attribute( 'label', $w );
280 $node->set_attribute( 'class', 'common' );
282 push( @$lineref_array, $node );
286 my $edge = $graph->add_edge( $last_node, $node, "base text" );
287 $edge->set_attribute( 'class', 'basetext' );
289 } # TODO there should be no else here...
293 # Ending point for all texts
294 my $endpoint = $graph->add_node( '#END#' );
295 $graph->add_edge( $last_node, $endpoint, "base text" );
296 push( @$lineref_array, $endpoint );
298 return( @$lineref_array );
301 =item B<collate_variants>
303 collate_variants( $graph, @readings )
305 Given a set of readings in the form
306 ( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
307 walks through each to identify those nodes that are identical. The
308 graph is a Text::Tradition::Graph object; the elements of @readings are
309 Graph::Easy::Node objects that appear on the graph.
311 TODO: Handle collapsed and non-collapsed transpositions.
315 sub collate_variants {
316 my( $graph, @readings ) = @_;
317 my $lemma_start = shift @readings;
318 my $lemma_end = shift @readings;
321 # We need to calculate positions at this point, which is where
322 # we are getting the implicit information from the apparatus.
324 # Start the list of distinct nodes with those nodes in the lemma.
327 while( $lemma_start ne $lemma_end ) {
328 push( @distinct_nodes, [ $lemma_start, 'base text', $position++ ] );
329 $lemma_start = $graph->next_word( $lemma_start );
331 push( @distinct_nodes, [ $lemma_end, 'base text', $position++ ] );
334 while( scalar @readings ) {
335 my( $var_start, $var_end ) = splice( @readings, 0, 2 );
337 # I want to look at the nodes in the variant and lemma, and
338 # collapse nodes that are the same word. This is mini-collation.
339 # Each word in the 'main' list can only be collapsed once with a
340 # word from the current reading.
343 # Get the label. There will only be one outgoing edge to start
344 # with, so this is safe.
345 my @out = $var_start->outgoing();
346 my $var_label = $out[0]->label();
349 while( $var_start ne $var_end ) {
350 push( @variant_nodes, $var_start );
351 $var_start = $graph->next_word( $var_start, $var_label );
353 push( @variant_nodes, $var_end );
355 # Go through the variant nodes, and if we find a lemma node that
356 # hasn't yet been collapsed with a node, equate them. If we do
357 # not, keep them to push onto the end of all_nodes.
361 foreach my $w ( @variant_nodes ) {
362 my $word = $w->label();
364 foreach my $idx ( $last_index .. $#distinct_nodes ) {
365 my( $l, $edgelabel, $pos ) = @{$distinct_nodes[$idx]};
366 if( $word eq cmp_str( $l ) ) {
367 next if exists( $collapsed{ $l->label } )
368 && $collapsed{ $l->label } eq $l;
370 $last_index = $idx if $detranspose;
371 # Collapse the nodes.
372 printf STDERR "Merging nodes %s/%s and %s/%s\n",
373 $l->name, $l->label, $w->name, $w->label;
374 $graph->merge_nodes( $l, $w );
375 $collapsed{ $l->label } = $l;
376 # Now collapse any multiple edges to and from the node.
377 remove_duplicate_edges( $graph,
378 $graph->prior_word( $l, $edgelabel ), $l );
379 remove_duplicate_edges( $graph, $l,
380 $graph->next_word( $l, $edgelabel ) );
385 push( @remaining_nodes, [ $w, $var_label, $curr_pos++ ] ) unless $matched;
387 push( @distinct_nodes, @remaining_nodes) if scalar( @remaining_nodes );
390 # Now set the positions of all the nodes in this variation.
392 print STDERR "Nodes and their positions are:\n";
393 foreach my $n ( @distinct_nodes ) {
394 printf STDERR "\t%s (position %s)\n", $n->[0]->label(), $n->[2];
398 =item B<remove_duplicate_edges>
400 remove_duplicate_edges( $graph, $from, $to );
402 Given two nodes, reduce the number of edges between those nodes to
403 one. If neither edge represents a base text, combine their labels.
407 sub remove_duplicate_edges {
408 my( $graph, $from, $to ) = @_;
409 my @edges = $from->edges_to( $to );
410 if( scalar @edges > 1 ) {
411 my @base = grep { $_->label eq 'base text' } @edges;
412 if ( scalar @base ) {
413 # Remove the edges that are not base.
414 foreach my $e ( @edges ) {
415 $graph->del_edge( $e )
416 unless $e eq $base[0];
419 # Combine the edges into one.
420 my $new_edge_name = join( ', ', map { $_->label() } @edges );
421 my $new_edge = shift @edges;
422 $new_edge->set_attribute( 'label', $new_edge_name );
423 foreach my $e ( @edges ) {
424 $graph->del_edge( $e );
432 Pretend you never saw this method. Really it needs to not be hardcoded.
438 my $word = $node->label();
444 $word =~ s/quatuor/quattuor/g;
445 $word =~ s/ioannes/iohannes/g;
453 This package is free software and is provided "as is" without express
454 or implied warranty. You can redistribute it and/or modify it under
455 the same terms as Perl itself.
459 Tara L Andrews, aurum@cpan.org