From: Tara L Andrews <tla@mit.edu>
Date: Wed, 25 May 2011 09:08:40 +0000 (+0200)
Subject: CHECKPOINT untested pass at redoing base text merge
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=4ca00eca4d82f9c83acdf8e88683f1465e6a7486;p=scpubgit%2Fstemmatology.git

CHECKPOINT untested pass at redoing base text merge
---

diff --git a/lib/Text/Tradition/Parser/BaseText.pm b/lib/Text/Tradition/Parser/BaseText.pm
index b530520..eb9fd74 100644
--- a/lib/Text/Tradition/Parser/BaseText.pm
+++ b/lib/Text/Tradition/Parser/BaseText.pm
@@ -3,6 +3,7 @@ package Text::Tradition::Parser::BaseText;
 use strict;
 use warnings;
 use Module::Load;
+use Algorithm::Diff;
 
 =head1 NAME
 
@@ -75,7 +76,12 @@ underscore in its name.
 
 =cut
 
-    my $SHORT = 25;
+my $SHORT = undef;  # Debug var - set this to limit the number of lines parsed
+
+my %base_text_index;
+my $edits_required;
+
+# edits_required -> wit -> [ { start_idx, end_idx, items } ]
 
 sub merge_base {
     my( $collation, $base_file, @app_entries ) = @_;
@@ -87,7 +93,7 @@ sub merge_base {
 	# DEBUG with a short graph
 	last if $SHORT && $line > $SHORT;
 	# DEBUG for problematic entries
-	my $scrutinize = "";
+	my $scrutinize = '';
 	my $first_line_reading = $base_line_starts[ $line ];
 	my $too_far = $base_line_starts[ $line+1 ];
 	
@@ -152,105 +158,109 @@ sub merge_base {
 	unless( $lemma_end ) {
 	    warn "No match found for @lemma_words at $line.$num";
 	    next;
-	} else {
-	    # These are no longer common readings; unmark them as such.
-	    my @lemma_readings = $collation->reading_sequence( $lemma_start, 
-						     $lemma_end );
-	    map { $_->make_variant } @lemma_readings;
 	}
 	
-	# Now we have our lemma readings; we add the variant readings
-	# to the collation.
-	
-	# Keep track of the start and end point of each reading for later
-	# reading collapse.
-	my @readings = ( $lemma_start, $lemma_end );
-
-	# For each reading that is not rdg_0, we make a chain of readings
-	# and connect them to the anchor.  Edges are named after the mss
-	# that are relevant.
+	# Now we have found the lemma; we will record an 'edit', in
+	# terms of a splice operation, for each subsequent reading.
+	# We also note which witnesses take the given edit.
+
+	my @lemma_set = $collation->reading_sequence( $lemma_start, $lemma_end );
+	my @reading_sets = [ @lemma_set ];
+
+	# For each reading that is not rdg_0, we create the variant
+	# reading nodes, and store the range as an edit operation on
+	# the base text.
+	my $variant_objects;
+	my %pc_lemma; # Keep track of mss that have been corrected back to lemma
+	my %pc_variant; # Keep track of mss with other corrections
 	foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
-	    if( $k eq 'rdg_0' ) { # that's the lemma
-		# The lemma is already in the graph, but we need to look for
-		# any explicit post-correctione readings and add the
-		# relevant path.
-		my @mss = grep { $app->{$_} eq $k } keys( %$app );
-		# Keep track of what witnesses we have seen.
-		@all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
-		foreach my $m ( @mss ) {
-		    my $base = _is_post_corr( $m );
-		    next unless $base;
-		    my @lem = $collation->reading_sequence( $lemma_start, $lemma_end );
-		    $collation->add_path( $collation->prior_reading( $lem[0] ), $lem[0], $m );
-		    foreach my $i ( 0 .. $#lem-1 ) {
-			$collation->add_path( $lem[$i], $lem[++$i], $m );
-		    }
-		    $collation->add_path( $lem[-1], $collation->next_reading( $lem[-1] ), $m );
-		}
-		next;
-	    }
-	    my @variant = split( /\s+/, $app->{$k} );
-	    @variant = () if $app->{$k} eq '/'; # This is an omission.
 	    my @mss = grep { $app->{$_} eq $k } keys( %$app );
-	    
-	    unless( @mss ) {
-		print STDERR "Skipping '@variant' at $line.$num: no mss\n";
-		next;
-	    }
-	    
 	    # Keep track of what witnesses we have seen.
 	    @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
+	    my $pc_hash = $k eq 'rdg_0' ? \%pc_lemma : \%pc_variant;
+
+	    # Keep track of which witnesses bear corrected readings here.
+	    foreach my $m ( @mss ) {
+		my $base = _is_post_corr( $m );
+		next unless $base;
+		$pc_hash->{$base} = 1;
+	    }
+	    next if $k eq 'rdg_0';
+
+	    # TODO don't hardcode the reading split operation
+	    my @variant = split( /\s+/, $app->{$k} );
+	    @variant = () if $app->{$k} eq '/'; # This is an omission.
 	    
 	    # Make the variant into a set of readings.
+	    my @variant_readings;
 	    my $ctr = 0;
-	    my $last_reading = $collation->prior_reading( $lemma_start );
-	    my $var_start;
 	    foreach my $vw ( @variant ) {
 		my $vwname = "$k/$line.$num.$ctr"; $ctr++;
 		my $vwreading = $collation->add_reading( $vwname );
 		$vwreading->text( $vw );
-		$vwreading->make_variant();
-		foreach ( @mss ) {
-		    $collation->add_path( $last_reading, $vwreading, $_ );
-		}
-		$var_start = $vwreading unless $var_start;
-		$last_reading = $vwreading;
+		push( @variant_readings, $vwreading );
 	    }
-	    # Now hook it up at the end.
-	    foreach ( @mss ) {
-		$collation->add_path( $last_reading, 
-				      $collation->next_reading( $lemma_end ),
-				      $_ );
-	    }
-	    
-	    if( $var_start ) { # if it wasn't an empty reading
-		push( @readings, $var_start, $last_reading );
-	    }
-	}
 
-	# Now collate and collapse the identical readings within the collation.
-	collate_variants( $collation, @readings );
-    }
+	    $variant_objects->{$k} = { 'mss' => \@mss,
+				       'reading' => \@variant_readings,
+	    };
+	    push( @reading_sets, \@variant_readings );
+	}
 
-    # Now make the witness objects
-    foreach my $w ( keys %all_witnesses ) {
-	my $base = _is_post_corr( $w );
-	if( $base ) {
-	    my $pctag = substr( $w, length( $base ) );
-	    my $existing_wit = $collation->tradition->witness( $base );
-	    unless( $existing_wit ) {
-		$existing_wit = $collation->tradition->add_witness( sigil => $base );
+	# Now collate and collapse the identical readings within the
+	# collated sets.  Modifies the reading sets that were passed.
+	collate_variants( $collation, @reading_sets );
+
+	# Now create the splice-edit objects that will be used
+	# to reconstruct each witness.
+
+	foreach my $rkey ( keys %$variant_objects ) {
+	    # Object is argument list for splice, so:
+	    # offset, length, replacements
+	    my $edit_object = [ $base_text_index{$lemma_start->name},
+				scalar( @lemma_set ),
+				$variant_objects->{$rkey}->{reading} ];
+	    foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
+		# Is this a p.c. entry?
+		my $base = _is_post_corr( $ms );
+		if( $base ) { # this is a post-corr witness
+		    my $pc_key = $base . "_post";
+		    _add_hash_entry( $edits_required, $pc_key, $edit_object );
+		} else { # this is an ante-corr witness
+		    my $pc_key = $ms . "_post";
+		    _add_hash_entry( $edits_required, $_, $edit_object );
+		    unless( !$pc_lemma{$ms} && !$pc_variant{$ms} ) {
+			# If this witness carries no correction, add this same object
+			# to its post-corrected state.
+			# TODO combine these hashes?
+			_add_hash_entry( $edits_required, $pc_key, $edit_object );
+		    }
+		}
 	    }
-	    $existing_wit->post_correctione( $pctag );
+	}
+    } # Finished going through the apparatus entries
+
+    # Now make the witness objects, and create their text sequences
+    foreach my $w ( grep { $_ !~ /_base$/ } keys %$edits_required ) {
+	my $witness_obj = $collation->tradition->add_witness( sigil => $w );
+	my @ante_corr_seq = apply_edits( $edits_required->{$w} );
+	my @post_corr_seq = apply_edits( $edits_required->{$w."_post"} )
+	    if exists( $edits_required->{$w."_post"} );
+
+	# Now how to save these paths in my witness object?
+	if( @post_corr_seq ) {
+	    $witness_obj->add_path( @post_corr_seq );
+	    $witness_obj->add_uncorrected_path( @ante_corr_seq );
 	} else {
-	    $collation->tradition->add_witness( sigil => $w )
-		unless $collation->tradition->witness( $w );
+	    $witness_obj->add_path( @ante_corr_seq );
 	}
     }
 
+    # TODO Now remove all the 'base text' links.
+
     # Now walk paths and calculate positions.
     my @common_readings = 
-        $collation->walk_and_expand_base( $collation->reading( '#END#' ) );
+	$collation->walk_and_expand_base( $collation->reading( '#END#' ) );
     $collation->calculate_positions( @common_readings );
 }
 
@@ -275,6 +285,7 @@ sub read_base {
     my $lineref_array = [ $last_reading ]; # There is no line zero.
 
     open( BASE, $base_file ) or die "Could not open file $base_file: $!";
+    my $i = 0;
     while(<BASE>) {
 	# Make the readings, and connect them up for the base, but
 	# also save the first reading of each line in an array for the
@@ -290,17 +301,19 @@ sub read_base {
 	    my $readingref = join( ',', $lineref, ++$wordref );
 	    my $reading = $collation->add_reading( $readingref );
 	    $reading->text( $w );
-	    $reading->make_common();
 	    unless( $started ) {
 		push( @$lineref_array, $reading );
 		$started = 1;
 	    }
-	    if( $last_reading ) {
-		my $path = $collation->add_path( $last_reading, $reading, 
-						 $collation->baselabel );
-		$path->set_attribute( 'class', 'basetext' );
-		$last_reading = $reading;
-	    } # TODO there should be no else here...
+	    # Add edge paths in the graph, for easier tracking when
+	    # we start applying corrections.  These paths will be
+	    # removed when we're done.
+	    my $path = $collation->add_path( $last_reading, $reading, 
+					     $collation->baselabel );
+	    $last_reading = $reading;
+
+	    # Note an array index for the reading, for later correction splices.
+	    $base_text_index{$readingref} = $i++;
 	}
     }
     close BASE;
@@ -314,7 +327,7 @@ sub read_base {
 
 =item B<collate_variants>
 
-collate_variants( $collation, @readings )
+collate_variants( $collation, @reading_ranges )
 
 Given a set of readings in the form 
 ( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
@@ -328,100 +341,67 @@ TODO: Handle collapsed and non-collapsed transpositions.
 =cut
 
 sub collate_variants {
-    my( $collation, @readings ) = @_;
-    my $lemma_start = shift @readings;
-    my $lemma_end = shift @readings;
-    my $detranspose = 1;
-
-    # Start the list of distinct readings with those readings in the lemma.
-    my @distinct_readings;
-    while( $lemma_start ne $lemma_end ) {
-	push( @distinct_readings, [ $lemma_start, $collation->baselabel ] );
-	$lemma_start = $collation->next_reading( $lemma_start );
-    } 
-    push( @distinct_readings, [ $lemma_end, $collation->baselabel ] );
-    
-
-    while( scalar @readings ) {
-	my( $var_start, $var_end ) = splice( @readings, 0, 2 );
-
-	# I want to look at the readings in the variant and lemma, and
-	# collapse readings that are the same word.  This is mini-collation.
-	# Each word in the 'main' list can only be collapsed once with a
-	# word from the current reading.
-	my %collapsed = ();
-
-	# Get the variant witnesses.  They will all be going along the
-	# same path, so just use the first one as representative for
-	# the purpose of following the path.
-	my @var_wits = map { $_->label } $var_start->outgoing();
-	my $rep_wit = $var_wits[0];
-
-	my @variant_readings;
-	while( $var_start ne $var_end ) {
-	    push( @variant_readings, $var_start );
-	    $var_start = $collation->next_reading( $var_start, $rep_wit );
-	}
-	push( @variant_readings, $var_end );
-
-	# Go through the variant readings, and if we find a lemma reading that
-	# hasn't yet been collapsed with a reading, equate them.  If we do
-	# not, keep them to push onto the end of all_readings.
-	# TODO replace this with proper mini-collation
-	my @remaining_readings;
-	my $last_index = 0;
-	my $curr_pos = 0;
-	foreach my $w ( @variant_readings ) {
-	    my $word = $w->label();
-	    my $matched = 0;
-	    foreach my $idx ( $last_index .. $#distinct_readings ) {
-		my( $l, $pathlabel ) = @{$distinct_readings[$idx]};
-		if( $word eq cmp_str( $l ) ) {
-		    next if exists( $collapsed{ $l->label } )
-			&& $collapsed{ $l->label } eq $l;
-		    $matched = 1;
-		    $last_index = $idx if $detranspose;
-		    # Collapse the readings.
-		    printf STDERR "Merging readings %s/%s and %s/%s\n", 
-		        $l->name, $l->label, $w->name, $w->label;
-		    $collation->merge_readings( $l, $w );
-		    $collapsed{ $l->label } = $l;
-		    # Now collapse any multiple paths to and from the reading.
- 		    remove_duplicate_paths( $collation, 
- 				    $collation->prior_reading( $l, $rep_wit ), $l );
- 		    remove_duplicate_paths( $collation, $l, 
- 				    $collation->next_reading( $l, $rep_wit ) );
-		    last;
+    my( $collation, @reading_sets ) = @_;
+    # my $detranspose = 1;  # TODO handle merging transposed nodes
+
+    # Merge the nodes across the sets so that there is only one node
+    # for any given reading.  Use diff to identify the 'same' nodes.
+
+    my $lemma_set = shift @reading_sets;
+
+    my @unique;
+    push( @unique, @$lemma_set );
+
+    while( @reading_sets ) {
+	my $variant_set = shift @reading_sets;
+	my $diff = Algorithm::Diff->new( \@unique, $variant_set, \&_collation_hash );
+	my @new_unique;
+	push( @new_unique, @unique );
+	while( $diff->Next ) {
+	    if( $diff->Same ) {
+		# merge the nodes
+		my @l = $diff->Items( 1 );
+		my @v = $diff->Items( 2 );
+		foreach my $i ( 0 .. $#l ) {
+		    $collation->merge_readings( $l[$i], $v[$i] );
 		}
+		# splice the lemma nodes into the variant set
+		splice( @$variant_set, $diff->Get( 'min2' ), scalar( @l ), @l );
+		push( @new_unique, @l );
+	    } else {
+		# Keep the old unique readings
+		push( @new_unique, $diff->Items( 1 ) ) if $diff->Items( 1 );
+		# Add the new readings to the 'unique' list
+		push( @new_unique, $diff->Items( 2 ) ) if $diff->Items( 2 );
 	    }
-	    push( @remaining_readings, [ $w, $rep_wit ] ) unless $matched;
 	}
-	push( @distinct_readings, @remaining_readings) if scalar( @remaining_readings );
+	@unique = @new_unique;
     }
-}
-
-=item B<remove_duplicate_paths>
 
-remove_duplicate_paths( $collation, $from, $to );
+    return;
+}
 
-Given two readings, reduce the number of paths between those readings to
-a set of unique paths.
+    
+sub _collation_hash {
+    my $node = shift;
+    return _cmp_str( $node->label );
+}
 
-=cut
+sub apply_edits {
+    my $edit_sequence = shift;
+    my @lemma_text = map { $base_text_index{$_} } sort( keys %base_text_index );
 
-# TODO wonder if this is necessary
-sub remove_duplicate_paths {
-    my( $collation, $from, $to ) = @_;
-    my %seen_paths;
-    foreach my $p ( $from->edges_to( $to ) ) {
-	if( exists $seen_paths{$p->name} ) {
-	    $collation->del_path( $p );
-	} else {
-	    $seen_paths{$p->name} = 1;
-	}
+    my $drift = 0;
+    foreach my $correction ( @$edit_sequence ) {
+	my( $offset, $length, $items ) = @$correction;
+	my $realoffset = $offset + $drift;
+	splice( @lemma_text, $realoffset, $length, @$items );
+	$drift += @$items - $length;
     }
+    return \@lemma_text;
 }
 
+
 # Helper function. Given a witness sigil, if it is a post-correctione
 # sigil,return the base witness.  If not, return a false value.
 sub _is_post_corr {