use strict;
use warnings;
-use Exporter 'import';
-use vars qw( @EXPORT_OK );
-@EXPORT_OK = qw( merge_base );
+use Module::Load;
+use Algorithm::Diff;
=head1 NAME
=over
+=item B<parse>
+
+parse( $graph, %opts );
+
+Takes an initialized graph and a set of options, which must include:
+- 'base' - the base text referenced by the variants
+- 'format' - the format of the variant list
+- 'data' - the variants, in the given format.
+
+=cut
+
+sub parse {
+ my( $tradition, %opts ) = @_;
+
+ my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'};
+ load( $format_mod );
+ my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} );
+ merge_base( $tradition->collation, $opts{'base'}, @apparatus_entries );
+}
+
=item B<merge_base>
merge_base( $graph, 'reference.txt', @apparatus_entries )
=cut
+my $SHORT = undef; # Debug var - set this to limit the number of lines parsed
+
+my %base_text_index;
+my $edits_required;
+
+# edits_required -> wit -> [ { start_idx, end_idx, items } ]
+
sub merge_base {
- my( $graph, $base_file, @app_entries ) = @_;
- my @base_line_starts = read_base( $base_file, $graph );
+ my( $collation, $base_file, @app_entries ) = @_;
+ my @base_line_starts = read_base( $base_file, $collation );
+ my %all_witnesses;
foreach my $app ( @app_entries ) {
my( $line, $num ) = split( /\./, $app->{_id} );
# DEBUG with a short graph
- # last if $line > 2;
+ last if $SHORT && $line > $SHORT;
# DEBUG for problematic entries
- # my $scrutinize = "21.8";
- my $first_line_node = $base_line_starts[ $line ];
+ my $scrutinize = '';
+ my $first_line_reading = $base_line_starts[ $line ];
my $too_far = $base_line_starts[ $line+1 ];
my $lemma = $app->{rdg_0};
my @lemma_words = split( /\s+/, $lemma );
# Now search for the lemma words within this line.
- my $lemma_start = $first_line_node;
+ my $lemma_start = $first_line_reading;
my $lemma_end;
my %seen;
while( $lemma_start ne $too_far ) {
if( --$seq < 1 ) {
# Now we have to compare the rest of the words here.
if( scalar( @lemma_words ) > 1 ) {
- my $next_node = $graph->next_word( $lemma_start );
+ my $next_reading =
+ $collation->next_reading( $lemma_start );
foreach my $w ( @lemma_words[1..$#lemma_words] ) {
printf STDERR "Now matching %s against %s\n",
- cmp_str($next_node), $w
+ cmp_str($next_reading), $w
if "$line.$num" eq $scrutinize;
- if( $w ne cmp_str($next_node) ) {
+ if( $w ne cmp_str($next_reading) ) {
$unmatch = 1;
last;
} else {
- $lemma_end = $next_node;
- $next_node = $graph->next_word( $lemma_end );
+ $lemma_end = $next_reading;
+ $next_reading =
+ $collation->next_reading( $lemma_end );
}
}
} else {
}
last unless ( $unmatch || !defined( $lemma_end ) );
$lemma_end = undef;
- $lemma_start = $graph->next_word( $lemma_start );
+ $lemma_start = $collation->next_reading( $lemma_start );
}
unless( $lemma_end ) {
warn "No match found for @lemma_words at $line.$num";
next;
- } else {
- # These are no longer common nodes; unmark them as such.
- my @lemma_nodes = $graph->node_sequence( $lemma_start,
- $lemma_end );
- map { $_->set_attribute( 'class', 'lemma' ) } @lemma_nodes;
}
- # Now we have our lemma nodes; we add the variant nodes to the graph.
-
- # For each reading that is not rdg_0, we make a chain of nodes
- # and connect them to the anchor. Edges are named after the mss
- # that are relevant.
+ # Now we have found the lemma; we will record an 'edit', in
+ # terms of a splice operation, for each subsequent reading.
+ # We also note which witnesses take the given edit.
+
+ my @lemma_set = $collation->reading_sequence( $lemma_start, $lemma_end );
+ my @reading_sets = [ @lemma_set ];
+
+ # For each reading that is not rdg_0, we create the variant
+ # reading nodes, and store the range as an edit operation on
+ # the base text.
+ my $variant_objects;
+ my %pc_lemma; # Keep track of mss that have been corrected back to lemma
+ my %pc_variant; # Keep track of mss with other corrections
foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
- next if $k eq 'rdg_0'; # that's the lemma.
- my @variant = split( /\s+/, $app->{$k} );
- @variant = () if $app->{$k} eq '/'; # This is an omission.
my @mss = grep { $app->{$_} eq $k } keys( %$app );
-
- unless( @mss ) {
- print STDERR "Skipping '@variant' at $line.$num: no mss\n";
- next;
+ # Keep track of what witnesses we have seen.
+ @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
+ my $pc_hash = $k eq 'rdg_0' ? \%pc_lemma : \%pc_variant;
+
+ # Keep track of which witnesses bear corrected readings here.
+ foreach my $m ( @mss ) {
+ my $base = _is_post_corr( $m );
+ next unless $base;
+ $pc_hash->{$base} = 1;
}
+ next if $k eq 'rdg_0';
+
+ # TODO don't hardcode the reading split operation
+ my @variant = split( /\s+/, $app->{$k} );
+ @variant = () if $app->{$k} eq '/'; # This is an omission.
- # Determine the label name for the edges here.
- my $edge_name = join(', ', @mss );
-
- # Make the variant into a set of nodes.
+ # Make the variant into a set of readings.
+ my @variant_readings;
my $ctr = 0;
- my $last_node = $graph->prior_word( $lemma_start );
- my $var_start;
foreach my $vw ( @variant ) {
my $vwname = "$k/$line.$num.$ctr"; $ctr++;
- my $vwnode = $graph->add_node( $vwname );
- $vwnode->set_attribute( 'label', $vw );
- $vwnode->set_attribute( 'class', 'variant' );
- $graph->add_edge( $last_node, $vwnode, $edge_name );
- $var_start = $vwnode unless $var_start;
- $last_node = $vwnode;
+ my $vwreading = $collation->add_reading( $vwname );
+ $vwreading->text( $vw );
+ push( @variant_readings, $vwreading );
}
- # Now hook it up at the end.
- $graph->add_edge( $last_node, $graph->next_word( $lemma_end ),
- $edge_name );
-
- # Now collate and collapse the identical nodes within the graph.
- collate_variant( $graph, $lemma_start, $lemma_end,
- $var_start, $last_node );
-
+
+ $variant_objects->{$k} = { 'mss' => \@mss,
+ 'reading' => \@variant_readings,
+ };
+ push( @reading_sets, \@variant_readings );
}
- }
- ## Now in theory I have a graph. I want to make it a little easier to
- ## read. So I collapse nodes that have only one edge in and one edge
- ## out, and I do this by looking at the edges.
-
- foreach my $edge ( $graph->edges() ) {
- my @out_edges = $edge->from()->outgoing();
- my @in_edges = $edge->to()->incoming();
-
- next unless scalar( @out_edges ) == 1;
- next unless scalar( @in_edges ) == 1;
- next unless $out_edges[0] eq $in_edges[0];
- # In theory if we've got this far, we're safe, but just to
- # double-check...
- next unless $out_edges[0] eq $edge;
-
- $graph->merge_nodes( $edge->from(), $edge->to(), ' ' );
+ # Now collate and collapse the identical readings within the
+ # collated sets. Modifies the reading sets that were passed.
+ collate_variants( $collation, @reading_sets );
+
+ # Now create the splice-edit objects that will be used
+ # to reconstruct each witness.
+
+ foreach my $rkey ( keys %$variant_objects ) {
+ # Object is argument list for splice, so:
+ # offset, length, replacements
+ my $edit_object = [ $base_text_index{$lemma_start->name},
+ scalar( @lemma_set ),
+ $variant_objects->{$rkey}->{reading} ];
+ foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
+ # Is this a p.c. entry?
+ my $base = _is_post_corr( $ms );
+ if( $base ) { # this is a post-corr witness
+ my $pc_key = $base . "_post";
+ _add_hash_entry( $edits_required, $pc_key, $edit_object );
+ } else { # this is an ante-corr witness
+ my $pc_key = $ms . "_post";
+ _add_hash_entry( $edits_required, $_, $edit_object );
+ unless( !$pc_lemma{$ms} && !$pc_variant{$ms} ) {
+ # If this witness carries no correction, add this same object
+ # to its post-corrected state.
+ # TODO combine these hashes?
+ _add_hash_entry( $edits_required, $pc_key, $edit_object );
+ }
+ }
+ }
+ }
+ } # Finished going through the apparatus entries
+
+ # Now make the witness objects, and create their text sequences
+ foreach my $w ( grep { $_ !~ /_base$/ } keys %$edits_required ) {
+ my $witness_obj = $collation->tradition->add_witness( sigil => $w );
+ my @ante_corr_seq = apply_edits( $edits_required->{$w} );
+ my @post_corr_seq = apply_edits( $edits_required->{$w."_post"} )
+ if exists( $edits_required->{$w."_post"} );
+
+ # Now how to save these paths in my witness object?
+ if( @post_corr_seq ) {
+ $witness_obj->add_path( @post_corr_seq );
+ $witness_obj->add_uncorrected_path( @ante_corr_seq );
+ } else {
+ $witness_obj->add_path( @ante_corr_seq );
+ }
}
+
+ # TODO Now remove all the 'base text' links.
+
+ # Now walk paths and calculate positions.
+ my @common_readings =
+ $collation->walk_and_expand_base( $collation->reading( '#END#' ) );
+ $collation->calculate_positions( @common_readings );
}
=item B<read_base>
-my @line_beginnings = read_base( 'reference.txt', $graph );
+my @line_beginnings = read_base( 'reference.txt', $collation );
-Takes a text file and a (presumed empty) graph object, adds the words
-as simple linear nodes to the graph, and returns a list of nodes that
-represent the beginning of lines. This graph is now the starting point
-for application of apparatus entries in merge_base, e.g. from a CSV
-file or a Classical Text Editor file.
+Takes a text file and a (presumed empty) collation object, adds the
+words as simple linear readings to the collation, and returns a
+list of readings that represent the beginning of lines. This collation
+is now the starting point for application of apparatus entries in
+merge_base, e.g. from a CSV file or a Classical Text Editor file.
=cut
sub read_base {
- my( $base_file, $graph ) = @_;
+ my( $base_file, $collation ) = @_;
- # This array gives the first node for each line. We put the
+ # This array gives the first reading for each line. We put the
# common starting point in line zero.
- my $last_node = $graph->start();
- my $lineref_array = [ $last_node ]; # There is no line zero.
+ my $last_reading = $collation->start();
+ my $lineref_array = [ $last_reading ]; # There is no line zero.
open( BASE, $base_file ) or die "Could not open file $base_file: $!";
+ my $i = 0;
while(<BASE>) {
- # Make the nodes, and connect them up for the base, but also
- # save the first node of each line in an array for the purpose.
+ # Make the readings, and connect them up for the base, but
+ # also save the first reading of each line in an array for the
+ # purpose.
+ # TODO use configurable reading separator
chomp;
my @words = split;
my $started = 0;
my $wordref = 0;
my $lineref = scalar @$lineref_array;
+ last if $SHORT && $lineref > $SHORT;
foreach my $w ( @words ) {
- my $noderef = join( ',', $lineref, ++$wordref );
- my $node = $graph->add_node( $noderef );
- $node->set_attribute( 'label', $w );
- $node->set_attribute( 'class', 'common' );
+ my $readingref = join( ',', $lineref, ++$wordref );
+ my $reading = $collation->add_reading( $readingref );
+ $reading->text( $w );
unless( $started ) {
- push( @$lineref_array, $node );
+ push( @$lineref_array, $reading );
$started = 1;
}
- if( $last_node ) {
- $graph->add_edge( $last_node, $node, "base text" );
- $last_node = $node;
- } # TODO there should be no else here...
+ # Add edge paths in the graph, for easier tracking when
+ # we start applying corrections. These paths will be
+ # removed when we're done.
+ my $path = $collation->add_path( $last_reading, $reading,
+ $collation->baselabel );
+ $last_reading = $reading;
+
+ # Note an array index for the reading, for later correction splices.
+ $base_text_index{$readingref} = $i++;
}
}
close BASE;
# Ending point for all texts
- my $endpoint = $graph->add_node( '#END#' );
- $graph->add_edge( $last_node, $endpoint, "base text" );
+ my $endpoint = $collation->add_reading( '#END#' );
+ $collation->add_path( $last_reading, $endpoint, $collation->baselabel );
push( @$lineref_array, $endpoint );
return( @$lineref_array );
}
-=item B<collate_variant>
+=item B<collate_variants>
-collate_variant( $graph, $lemma_start, $lemma_end, $var_start, $var_end );
+collate_variants( $collation, @reading_ranges )
-Given a lemma and a variant as start- and endpoints on the graph,
-walks through each to identify those nodes that are identical. The
-graph is a Text::Tradition::Graph object; the other arguments are
-Graph::Easy::Node objects that appear on the graph.
+Given a set of readings in the form
+( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
+walks through each to identify those readings that are identical. The
+collation is a Text::Tradition::Collation object; the elements of
+@readings are Text::Tradition::Collation::Reading objects that appear
+on the collation graph.
TODO: Handle collapsed and non-collapsed transpositions.
=cut
-sub collate_variant {
- my( $graph, $lemma_start, $lemma_end, $var_start, $var_end ) = @_;
- # If var_start is undef, then the variant is an omission and
- # there's nothing to collate. Return.
- return unless $var_start;
-
- # I want to look at the nodes in the variant and lemma, and
- # collapse nodes that are the same word. This is mini-collation.
- my %collapsed = ();
- # There will only be one outgoing edge at first, so this is safe.
- my @out = $var_start->outgoing();
- my $var_label = $out[0]->label();
-
- my @lemma_nodes;
- while( $lemma_start ne $lemma_end ) {
- push( @lemma_nodes, $lemma_start );
- $lemma_start = $graph->next_word( $lemma_start );
- }
- push( @lemma_nodes, $lemma_end );
-
- my @variant_nodes;
- while( $var_start ne $var_end ) {
- push( @variant_nodes, $var_start );
- $var_start = $graph->next_word( $var_start, $var_label );
- }
- push( @variant_nodes, $var_end );
-
- # Go through the variant nodes, and if we find a lemma node that
- # hasn't yet been collapsed with a node, equate them.
-
- foreach my $w ( @variant_nodes ) {
- my $word = $w->label();
- foreach my $l ( @lemma_nodes ) {
- if( $word eq cmp_str( $l ) ) {
- next if exists( $collapsed{ $l->label } )
- && $collapsed{ $l->label } eq $l;
- # Collapse the nodes.
- printf STDERR "Merging nodes %s/%s and %s/%s\n",
- $l->name, $l->label, $w->name, $w->label;
- $graph->merge_nodes( $l, $w );
- $collapsed{ $l->label } = $l;
- # Now collapse any multiple edges to and from the node.
- # Rely on the presence of the 'base text' edge.
- remove_duplicate_edges( $graph, $graph->prior_word( $l ), $l );
- remove_duplicate_edges( $graph, $l, $graph->next_word( $l ) );
+sub collate_variants {
+ my( $collation, @reading_sets ) = @_;
+ # my $detranspose = 1; # TODO handle merging transposed nodes
+
+ # Merge the nodes across the sets so that there is only one node
+ # for any given reading. Use diff to identify the 'same' nodes.
+
+ my $lemma_set = shift @reading_sets;
+
+ my @unique;
+ push( @unique, @$lemma_set );
+
+ while( @reading_sets ) {
+ my $variant_set = shift @reading_sets;
+ my $diff = Algorithm::Diff->new( \@unique, $variant_set, \&_collation_hash );
+ my @new_unique;
+ push( @new_unique, @unique );
+ while( $diff->Next ) {
+ if( $diff->Same ) {
+ # merge the nodes
+ my @l = $diff->Items( 1 );
+ my @v = $diff->Items( 2 );
+ foreach my $i ( 0 .. $#l ) {
+ $collation->merge_readings( $l[$i], $v[$i] );
+ }
+ # splice the lemma nodes into the variant set
+ splice( @$variant_set, $diff->Get( 'min2' ), scalar( @l ), @l );
+ push( @new_unique, @l );
+ } else {
+ # Keep the old unique readings
+ push( @new_unique, $diff->Items( 1 ) ) if $diff->Items( 1 );
+ # Add the new readings to the 'unique' list
+ push( @new_unique, $diff->Items( 2 ) ) if $diff->Items( 2 );
}
}
+ @unique = @new_unique;
}
+
+ return;
}
-=item B<remove_duplicate_edges>
+
+sub _collation_hash {
+ my $node = shift;
+ return _cmp_str( $node->label );
+}
-remove_duplicate_edges( $graph, $from, $to );
+sub apply_edits {
+ my $edit_sequence = shift;
+ my @lemma_text = map { $base_text_index{$_} } sort( keys %base_text_index );
-Given two nodes, reduce the number of edges between those nodes to
-one. If neither edge represents a base text, combine their labels.
+ my $drift = 0;
+ foreach my $correction ( @$edit_sequence ) {
+ my( $offset, $length, $items ) = @$correction;
+ my $realoffset = $offset + $drift;
+ splice( @lemma_text, $realoffset, $length, @$items );
+ $drift += @$items - $length;
+ }
+ return \@lemma_text;
+}
-=cut
-sub remove_duplicate_edges {
- my( $graph, $from, $to ) = @_;
- my @edges = $from->edges_to( $to );
- if( scalar @edges > 1 ) {
- my @base = grep { $_->label eq 'base text' } @edges;
- if ( scalar @base ) {
- # Remove the edges that are not base.
- foreach my $e ( @edges ) {
- $graph->del_edge( $e )
- unless $e eq $base[0];
- }
- } else {
- # Combine the edges into one.
- my $new_edge_name = join( ', ', map { $_->label() } @edges );
- my $new_edge = shift @edges;
- $new_edge->set_attribute( 'label', $new_edge_name );
- foreach my $e ( @edges ) {
- $graph->del_edge( $e );
- }
- }
+# Helper function. Given a witness sigil, if it is a post-correctione
+# sigil,return the base witness. If not, return a false value.
+sub _is_post_corr {
+ my( $sigil ) = @_;
+ if( $sigil =~ /^(.*?)(\s*\(?p\.\s*c\.\)?)$/ ) {
+ return $1;
}
+ return undef;
}
=item B<cmp_str>
=cut
sub cmp_str {
- my( $node ) = @_;
- my $word = $node->label();
+ my( $reading ) = @_;
+ my $word = $reading->label();
$word = lc( $word );
$word =~ s/\W//g;
$word =~ s/v/u/g;