From: tla <tla@mit.edu>
Date: Wed, 25 May 2011 13:05:22 +0000 (+0200)
Subject: working with new base text merge routine, up to line 25
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=6a222840d60d8ef64c47e5ce0c4f033db1f72e2b;hp=4ca00eca4d82f9c83acdf8e88683f1465e6a7486;p=scpubgit%2Fstemmatology.git

working with new base text merge routine, up to line 25
---

diff --git a/lib/Text/Tradition.pm b/lib/Text/Tradition.pm
index 8e028a9..c2deeae 100644
--- a/lib/Text/Tradition.pm
+++ b/lib/Text/Tradition.pm
@@ -72,10 +72,10 @@ sub BUILD {
 	if( $format ) {
 	    my @parseargs;
 	    if( $format =~ /^(CSV|CTE)$/ ) {
-		@parseargs = ( 'base' => $init_args->{'base'},
-			       'data' => $init_args->{$format},
-			       'format' => $format );
+		$init_args->{'data'} = $init_args->{$format};
+		$init_args->{'format'} = $format;
 		$format = 'BaseText';
+		@parseargs = %$init_args;
 	    } else {
 		@parseargs = ( $init_args->{ $format } ); 
 	    }
diff --git a/lib/Text/Tradition/Collation.pm b/lib/Text/Tradition/Collation.pm
index ea194ef..4686d5e 100644
--- a/lib/Text/Tradition/Collation.pm
+++ b/lib/Text/Tradition/Collation.pm
@@ -299,15 +299,15 @@ sub collapse_graph_edges {
 		$label = join( ', ', @{$newlabels->{$newdest}} );
 	    } else {
 		## TODO FIX THIS HACK
-		my @pclabels;
+		my @aclabels;
 		foreach my $wit ( @{$newlabels->{$newdest}} ) {
-		    if( $wit =~ /^(.*?)(\s*\(?p\.\s*c\.\)?)$/ ) {
-			push( @pclabels, $wit );
+		    if( $wit =~ /^(.*?)(\s*\(?a\.\s*c\.\)?)$/ ) {
+			push( @aclabels, $wit );
 		    } else {
 			push( @compressed_wits, $wit );
 		    }
 		}
-		$label = join( ', ', 'majority', @pclabels );
+		$label = join( ', ', 'majority', @aclabels );
 	    }
 	    
 	    my $newedge = 
@@ -555,86 +555,60 @@ sub _remove_common {
 
 
 # An alternative to walk_witness_paths, for use when a collation is
-# constructed from a base text and an apparatus.  Also modifies the
-# collation graph to remove all 'base text' paths and replace them
-# with real witness paths.
+# constructed from a base text and an apparatus.  We have the
+# sequences of readings and just need to add path edges.
 
-sub walk_and_expand_base {
-    my( $self, $end ) = @_;
+sub make_witness_paths {
+    my( $self ) = @_;
 
     my @common_readings;
     foreach my $wit ( @{$self->tradition->witnesses} ) {
-	my $sig = $wit->sigil;
-	$DB::single = 1 if $sig eq 'Vb5';
-	my $post_sig;
-	$post_sig = $wit->post_correctione 
-	    if $wit->has_post_correctione;
-	
-	my @wit_path = $self->reading_sequence( $self->start, $end, $sig );
-	$wit->path( \@wit_path );
-	$self->connect_readings_for_witness( $wit );
-	@common_readings = _find_common( \@common_readings, \@wit_path );
-
-	# If there is a post-correctio, get its path and compare.
-	# Add a correction range for each divergence.
-	if( $post_sig ) {
-	    my @corr_wit_path = $self->reading_sequence( $self->start, $end, 
-							 "$sig$post_sig", $sig );
-
-	    # Map ante-corr readings to their indices
-	    my %in_orig; 
-	    my $i = 0;
-	    map { $in_orig{$_->name} = $i++ } @wit_path;
-
-	    # Look for divergences
-	    my $diverged = 0;
-	    my $last_common;
-	    my @correction;
-	    foreach my $rdg ( @corr_wit_path ) {
-		if( exists( $in_orig{$rdg->name} ) && !$diverged ) {
-		    # We are reading the same here
-		    $last_common = $in_orig{$rdg->name};
-		} elsif ( exists( $in_orig{$rdg->name} ) ) {
-		    # We have been diverging but are reading the same again.
-		    # Add the correction to the witness.
-		    my $offset = $last_common + 1;
-		    my $length = $in_orig{$rdg->name} - $offset;
-		    $wit->add_correction( $offset, $length, @correction );
-		    $diverged = 0;
-		    @common_readings = _remove_common( \@common_readings, \@correction );
-		    @correction = ();
-		    $last_common = $in_orig{$rdg->name};
-		} elsif( $diverged ) {
-		    # We are in the middle of a divergence.
-		    push( @correction, $rdg );
-		} else {
-		    # We have started to diverge.  Note it.
-		    $diverged = 1;
-		    push( @correction, $rdg );
-		}
-	    }
-	    # Add any divergence that is at the end of the text
-	    if( $diverged ) {
-		$wit->add_correction( $last_common+1, $#wit_path, \@correction );
+	$self->make_witness_path( $wit );
+	@common_readings = _find_common( \@common_readings, $wit->path );
+
+	# If we have pre-corrected readings, we need to add paths
+	# for those as well.
+	if( $wit->has_ante_corr ) {
+	    my @path = @{$wit->path};
+	    foreach my $ac ( @{$wit->ante_corr} ) {
+		# my( $offset, $length, $items ) = @$ac;
+		# Figure out where the path needs to start and
+		# end its divergence.
+		my $start = $ac->[0] - 1;
+		my $end = $ac->[0] + $ac->[1];
+		my @chain;
+		push( @chain, $path[$start] );
+		push( @chain, @{$ac->[2]} );
+		push( @chain, $path[$end] );
+		$self->make_path_uncorrection( $wit->sigil, @chain );
 	    }
+	    @common_readings = _find_common( \@common_readings,
+					     $wit->uncorrected_path );
 	}
     }
-
-    # Remove any 'base text' paths.
-    foreach my $path ( $self->paths ) {
-	$self->del_path( $path ) 
-	    if $path->label eq $self->baselabel;
-    }
+    return @common_readings;
 }
 
-sub connect_readings_for_witness {
+sub make_witness_path {
     my( $self, $wit ) = @_;
     my @chain = @{$wit->path};
+    $self->connect_readings_for_witness( $wit->sigil, @chain );
+}
+
+sub make_path_uncorrection {
+    my( $self, $sig, @chain ) = @_;
+    $sig .= ' (a.c.)';
+    $self->connect_readings_for_witness( $sig, @chain );
+}
+
+sub connect_readings_for_witness {
+    my( $self, $sig, @chain ) = @_;
     foreach my $idx ( 0 .. $#chain-1 ) {
-	$self->add_path( $chain[$idx], $chain[$idx+1], $wit->sigil );
+	$self->add_path( $chain[$idx], $chain[$idx+1], $sig );
     }
 }
 
+
 sub common_readings {
     my $self = shift;
     my @common = grep { $_->is_common } $self->readings();
@@ -656,8 +630,8 @@ sub calculate_positions {
     foreach my $wit ( @{$self->tradition->witnesses} ) {
 	print STDERR "Calculating positions in " . $wit->sigil . "\n";
 	_update_positions_from_path( $wit->path, @ordered_common );
-	_update_positions_from_path( $wit->corrected_path, @ordered_common )
-	    if $wit->has_post_correctione;
+	_update_positions_from_path( $wit->uncorrected_path, @ordered_common )
+	    if $wit->has_ante_corr;
     }
     
     # DEBUG
@@ -676,7 +650,6 @@ sub _update_positions_from_path {
     # that corresponds to its eventual position identifier.  Common
     # nodes always start a new row, and are thus always in the first
     # column.
-    
     my $wit_matrix = [];
     my $cn = 0;  # We should hit the common readings in order.
     my $row = [];
@@ -703,6 +676,8 @@ sub _update_positions_from_path {
 	foreach my $di ( 1..scalar(@{$wit_matrix->[$li-1]}) ) {
 	    my $reading = $wit_matrix->[$li-1]->[$di-1];
 	    my $position = "$li,$di";
+	    $DB::single = 1 unless ref( $reading ) eq 'Text::Tradition::Collation::Reading';
+
 	    # If we have seen this node before, we need to compare
 	    # its position with what went before.
 	    unless( $reading->has_position &&
diff --git a/lib/Text/Tradition/Parser/BaseText.pm b/lib/Text/Tradition/Parser/BaseText.pm
index eb9fd74..ae5235c 100644
--- a/lib/Text/Tradition/Parser/BaseText.pm
+++ b/lib/Text/Tradition/Parser/BaseText.pm
@@ -40,12 +40,14 @@ Takes an initialized graph and a set of options, which must include:
 
 =cut
 
+my $DETRANSPOSE = 0;
 sub parse {
     my( $tradition, %opts ) = @_;
 
     my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'};
     load( $format_mod );
     my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} );
+    $DETRANSPOSE = 1 if $opts{'linear'};
     merge_base( $tradition->collation, $opts{'base'}, @apparatus_entries );
 }
 
@@ -76,10 +78,10 @@ underscore in its name.
 
 =cut
 
-my $SHORT = undef;  # Debug var - set this to limit the number of lines parsed
+my $SHORT = 25;  # Debug var - set this to limit the number of lines parsed
 
 my %base_text_index;
-my $edits_required;
+my $edits_required = {};
 
 # edits_required -> wit -> [ { start_idx, end_idx, items } ]
 
@@ -88,6 +90,7 @@ sub merge_base {
     my @base_line_starts = read_base( $base_file, $collation );
 
     my %all_witnesses;
+    my @unwitnessed_lemma_nodes;
     foreach my $app ( @app_entries ) {
 	my( $line, $num ) = split( /\./, $app->{_id} );
 	# DEBUG with a short graph
@@ -164,26 +167,27 @@ sub merge_base {
 	# terms of a splice operation, for each subsequent reading.
 	# We also note which witnesses take the given edit.
 
-	my @lemma_set = $collation->reading_sequence( $lemma_start, $lemma_end );
+	my @lemma_set = $collation->reading_sequence( $lemma_start, 
+						      $lemma_end );
 	my @reading_sets = [ @lemma_set ];
 
 	# For each reading that is not rdg_0, we create the variant
 	# reading nodes, and store the range as an edit operation on
 	# the base text.
 	my $variant_objects;
-	my %pc_lemma; # Keep track of mss that have been corrected back to lemma
-	my %pc_variant; # Keep track of mss with other corrections
+	my %pc_seen; # Keep track of mss with explicit post-corr data
 	foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
 	    my @mss = grep { $app->{$_} eq $k } keys( %$app );
+	    push( @unwitnessed_lemma_nodes, @lemma_set )
+		if !@mss && $k eq 'rdg_0';
+
 	    # Keep track of what witnesses we have seen.
 	    @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
-	    my $pc_hash = $k eq 'rdg_0' ? \%pc_lemma : \%pc_variant;
-
 	    # Keep track of which witnesses bear corrected readings here.
 	    foreach my $m ( @mss ) {
 		my $base = _is_post_corr( $m );
 		next unless $base;
-		$pc_hash->{$base} = 1;
+		$pc_seen{$base} = 1;
 	    }
 	    next if $k eq 'rdg_0';
 
@@ -209,6 +213,7 @@ sub merge_base {
 
 	# Now collate and collapse the identical readings within the
 	# collated sets.  Modifies the reading sets that were passed.
+	$DB::single = 1 if "$line.$num" eq '16.2';
 	collate_variants( $collation, @reading_sets );
 
 	# Now create the splice-edit objects that will be used
@@ -228,12 +233,12 @@ sub merge_base {
 		    _add_hash_entry( $edits_required, $pc_key, $edit_object );
 		} else { # this is an ante-corr witness
 		    my $pc_key = $ms . "_post";
-		    _add_hash_entry( $edits_required, $_, $edit_object );
-		    unless( !$pc_lemma{$ms} && !$pc_variant{$ms} ) {
-			# If this witness carries no correction, add this same object
-			# to its post-corrected state.
-			# TODO combine these hashes?
-			_add_hash_entry( $edits_required, $pc_key, $edit_object );
+		    _add_hash_entry( $edits_required, $ms, $edit_object );
+		    unless( $pc_seen{$ms} ) {
+			# If this witness carries no correction, add this 
+			# same object to its post-corrected state.
+			_add_hash_entry( $edits_required, $pc_key, 
+					 $edit_object );
 		    }
 		}
 	    }
@@ -241,26 +246,36 @@ sub merge_base {
     } # Finished going through the apparatus entries
 
     # Now make the witness objects, and create their text sequences
-    foreach my $w ( grep { $_ !~ /_base$/ } keys %$edits_required ) {
+    foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) {
 	my $witness_obj = $collation->tradition->add_witness( sigil => $w );
-	my @ante_corr_seq = apply_edits( $edits_required->{$w} );
-	my @post_corr_seq = apply_edits( $edits_required->{$w."_post"} )
+	my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w} );
+	my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"} )
 	    if exists( $edits_required->{$w."_post"} );
 
-	# Now how to save these paths in my witness object?
+	# Now save these paths in my witness object
 	if( @post_corr_seq ) {
-	    $witness_obj->add_path( @post_corr_seq );
-	    $witness_obj->add_uncorrected_path( @ante_corr_seq );
+	    $witness_obj->path( \@post_corr_seq );
+	    my @ante_corr = make_witness_uncorrections( \@post_corr_seq,
+							\@ante_corr_seq );
+	    $witness_obj->ante_corr( \@ante_corr );
 	} else {
-	    $witness_obj->add_path( @ante_corr_seq );
+	    $witness_obj->path( \@ante_corr_seq );
 	}
     }
 
-    # TODO Now remove all the 'base text' links.
+    # Now remove our 'base text' edges, which is to say, the only
+    # ones we have created so far.  Also remove any nodes that didn't
+    # appear in any witnesses.
+    foreach ( $collation->paths() ) {
+	$collation->del_path( $_ );
+    }
+    foreach( @unwitnessed_lemma_nodes ) {
+	$collation->del_reading( $_ );
+    }
 
     # Now walk paths and calculate positions.
     my @common_readings = 
-	$collation->walk_and_expand_base( $collation->reading( '#END#' ) );
+	$collation->make_witness_paths();
     $collation->calculate_positions( @common_readings );
 }
 
@@ -282,10 +297,11 @@ sub read_base {
     # This array gives the first reading for each line.  We put the
     # common starting point in line zero.
     my $last_reading = $collation->start();
+    $base_text_index{$last_reading->name} = 0;
     my $lineref_array = [ $last_reading ]; # There is no line zero.
 
     open( BASE, $base_file ) or die "Could not open file $base_file: $!";
-    my $i = 0;
+    my $i = 1;
     while(<BASE>) {
 	# Make the readings, and connect them up for the base, but
 	# also save the first reading of each line in an array for the
@@ -321,6 +337,7 @@ sub read_base {
     my $endpoint = $collation->add_reading( '#END#' );
     $collation->add_path( $last_reading, $endpoint, $collation->baselabel );
     push( @$lineref_array, $endpoint );
+    $base_text_index{$endpoint->name} = $i;
 
     return( @$lineref_array );
 }
@@ -342,7 +359,6 @@ TODO: Handle collapsed and non-collapsed transpositions.
 
 sub collate_variants {
     my( $collation, @reading_sets ) = @_;
-    # my $detranspose = 1;  # TODO handle merging transposed nodes
 
     # Merge the nodes across the sets so that there is only one node
     # for any given reading.  Use diff to identify the 'same' nodes.
@@ -354,28 +370,47 @@ sub collate_variants {
 
     while( @reading_sets ) {
 	my $variant_set = shift @reading_sets;
-	my $diff = Algorithm::Diff->new( \@unique, $variant_set, \&_collation_hash );
-	my @new_unique;
-	push( @new_unique, @unique );
-	while( $diff->Next ) {
-	    if( $diff->Same ) {
-		# merge the nodes
-		my @l = $diff->Items( 1 );
-		my @v = $diff->Items( 2 );
-		foreach my $i ( 0 .. $#l ) {
-		    $collation->merge_readings( $l[$i], $v[$i] );
+	if( $DETRANSPOSE ) {
+	    # Use diff to do this job
+	    my $diff = Algorithm::Diff->new( \@unique, $variant_set, 
+					     {'keyGen' => \&_collation_hash} );
+	    my @new_unique;
+	    while( $diff->Next ) {
+		if( $diff->Same ) {
+		    # merge the nodes
+		    my @l = $diff->Items( 1 );
+		    my @v = $diff->Items( 2 );
+		    foreach my $i ( 0 .. $#l ) {
+			$collation->merge_readings( $l[$i], $v[$i] );
+		    }
+		    # splice the lemma nodes into the variant set
+		    my( $offset ) = $diff->Get( 'min2' );
+		    splice( @$variant_set, $offset, scalar( @l ), @l );
+		    push( @new_unique, @l );
+		} else {
+		    # Keep the old unique readings
+		    push( @new_unique, $diff->Items( 1 ) ) if $diff->Items( 1 );
+		    # Add the new readings to the 'unique' list
+		    push( @new_unique, $diff->Items( 2 ) ) if $diff->Items( 2 );
 		}
-		# splice the lemma nodes into the variant set
-		splice( @$variant_set, $diff->Get( 'min2' ), scalar( @l ), @l );
-		push( @new_unique, @l );
-	    } else {
-		# Keep the old unique readings
-		push( @new_unique, $diff->Items( 1 ) ) if $diff->Items( 1 );
-		# Add the new readings to the 'unique' list
-		push( @new_unique, $diff->Items( 2 ) ) if $diff->Items( 2 );
 	    }
+	    @unique = @new_unique;
+	} else {
+	    # It becomes a much simpler job
+	    $DB::single = 1;
+	    my @distinct;
+	    foreach my $idx ( 0 .. $#{$variant_set} ) {
+		my $vw = $variant_set->[$idx];
+		my @same = grep { cmp_str( $_ ) eq $vw->label } @unique;
+		if( @same ) {
+		    $collation->merge_readings( $same[0], $vw );
+		    $variant_set->[$idx] = $same[0];
+		} else {
+		    push( @distinct, $vw );
+		}
+	    }
+	    push( @unique, @distinct );
 	}
-	@unique = @new_unique;
     }
 
     return;
@@ -384,12 +419,14 @@ sub collate_variants {
     
 sub _collation_hash {
     my $node = shift;
-    return _cmp_str( $node->label );
+    return cmp_str( $node );
 }
 
 sub apply_edits {
-    my $edit_sequence = shift;
-    my @lemma_text = map { $base_text_index{$_} } sort( keys %base_text_index );
+    my( $collation, $edit_sequence ) = @_;
+    my @lemma_names = sort { $base_text_index{$a} <=> $base_text_index{$b} }
+        keys %base_text_index;
+    my @lemma_text = map { $collation->reading( $_ ) } @lemma_names;
 
     my $drift = 0;
     foreach my $correction ( @$edit_sequence ) {
@@ -398,9 +435,26 @@ sub apply_edits {
 	splice( @lemma_text, $realoffset, $length, @$items );
 	$drift += @$items - $length;
     }
-    return \@lemma_text;
+    return @lemma_text;
 }
 
+sub make_witness_uncorrections {
+    my( $path, $uncorr_path ) = @_;
+    my $diff = Algorithm::Diff->new( $path, $uncorr_path, 
+				     { 'keyGen' => \&_collation_hash } );
+    # We basically just want to make a bunch of splice arguments that
+    # will reconstruct the ante-corr text from the post-corr.
+    my @diff_list;
+    while( $diff->Next ) {
+	next if $diff->Same;
+	my( $offset ) = $diff->Get( 'min1' );
+	my $length = scalar( $diff->Items( 1 ) );
+	my $items = [];	push( @$items, $diff->Items( 2 ) );
+	push( @diff_list, [ $offset, $length, $items ] );
+    }
+    return @diff_list;
+}
+	
 
 # Helper function. Given a witness sigil, if it is a post-correctione
 # sigil,return the base witness.  If not, return a false value.
@@ -412,6 +466,16 @@ sub _is_post_corr {
     return undef;
 }
 
+sub _add_hash_entry {
+    my( $hash, $key, $entry ) = @_;
+    if( exists $hash->{$key} ) {
+	push( @{$hash->{$key}}, $entry );
+    } else {
+	$hash->{$key} = [ $entry ];
+    }
+}
+
+
 =item B<cmp_str>
 
 Pretend you never saw this method.  Really it needs to not be hardcoded.
diff --git a/lib/Text/Tradition/Witness.pm b/lib/Text/Tradition/Witness.pm
index f570365..145eba4 100644
--- a/lib/Text/Tradition/Witness.pm
+++ b/lib/Text/Tradition/Witness.pm
@@ -32,25 +32,19 @@ has 'path' => (
     predicate => 'has_path',
     );	       
 
-has 'post_correctione' => (
-    is => 'rw',
-    isa => 'Str',
-    predicate => 'has_post_correctione',
-    );
-
 subtype 'Correction',
     as 'ArrayRef',
     where { @{$_} == 3 &&
-	    $_->[0]->isa( 'Int' ) &&
-	    $_->[1]->isa( 'Int' ) &&
-	    $_->[2]->isa( 'ArrayRef[Text::Tradition::Collation::Reading]' );
+	    find_type_constraint('Int')->check( $_->[0] ) &&
+	    find_type_constraint('Int')->check( $_->[1] ) &&
+	    find_type_constraint('ArrayRef[Text::Tradition::Collation::Reading]')->check( $_->[2] );
     },
     message { 'Correction must be a tuple of [offset, length, list]' };
 
-has 'corrections' => (
-    is => 'ro',
+has 'ante_corr' => (
+    is => 'rw',
     isa => 'ArrayRef[Correction]',
-    default => sub { [] },
+    predicate => 'has_ante_corr',
     );
     
 
@@ -85,20 +79,14 @@ around text => sub {
     $self->$orig( @_ );
 };
 
-sub add_correction {
-    my( $self, $offset, $length, @replacement ) = @_;
-    # Rely on Moose for type checking of the arguments
-    push( @{$self->corrections}, [ $offset, $length, \@replacement ] );
-}
-
-sub corrected_path {
+sub uncorrected_path {
     my $self = shift;
 
     my @new_path;
     push( @new_path, @{$self->path} );
     my $drift = 0;
-    foreach my $correction ( @{$self->corrections} ) {
-	my( $offset, $length, $items ) = @$correction;
+    foreach my $change ( @{$self->ante_corr} ) {
+	my( $offset, $length, $items ) = @$change;
 	my $realoffset = $offset + $drift;
 	splice( @new_path, $realoffset, $length, @$items );
 	$drift += @$items - $length;
diff --git a/script/svg_from_csv.pl b/script/svg_from_csv.pl
index 3907ca6..97e2741 100644
--- a/script/svg_from_csv.pl
+++ b/script/svg_from_csv.pl
@@ -11,6 +11,7 @@ use Text::Tradition;
 my $tradition = Text::Tradition->new(
     'CSV' => $ARGV[0],
     'base' => $ARGV[1],
+    'linear' => 1,
     );