From: Tara L Andrews <tla@mit.edu>
Date: Wed, 25 May 2011 22:29:56 +0000 (+0200)
Subject: fix circular-path bugs in the graph
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=15d2d3dfe4461e506cb78a1c4b96e69f4ed6126f;p=scpubgit%2Fstemmatology.git

fix circular-path bugs in the graph
---

diff --git a/lib/Text/Tradition/Collation.pm b/lib/Text/Tradition/Collation.pm
index 4686d5e..21b3c96 100644
--- a/lib/Text/Tradition/Collation.pm
+++ b/lib/Text/Tradition/Collation.pm
@@ -69,9 +69,15 @@ has 'baselabel' => (
     );
 
 has 'collapsed' => (
-		    is => 'rw',
-		    isa => 'Bool',
-		    );
+    is => 'rw',
+    isa => 'Bool',
+    );
+
+has 'linear' => (
+    is => 'rw',
+    isa => 'Bool',
+    default => 1,
+    );
 
 
 # The collation can be created two ways:
@@ -135,6 +141,14 @@ sub merge_readings {
     return $self->graph->merge_nodes( @_ );
 }
 
+# Extra graph-alike utility
+sub has_path {
+    my( $self, $source, $target, $label ) = @_;
+    my @paths = $source->edges_to( $target );
+    my @relevant = grep { $_->label eq $label } @paths;
+    return scalar @paths;
+}
+
 =head2 Output method(s)
 
 =over
@@ -563,28 +577,10 @@ sub make_witness_paths {
 
     my @common_readings;
     foreach my $wit ( @{$self->tradition->witnesses} ) {
+	print STDERR "Making path for " . $wit->sigil . "\n";
 	$self->make_witness_path( $wit );
 	@common_readings = _find_common( \@common_readings, $wit->path );
-
-	# If we have pre-corrected readings, we need to add paths
-	# for those as well.
-	if( $wit->has_ante_corr ) {
-	    my @path = @{$wit->path};
-	    foreach my $ac ( @{$wit->ante_corr} ) {
-		# my( $offset, $length, $items ) = @$ac;
-		# Figure out where the path needs to start and
-		# end its divergence.
-		my $start = $ac->[0] - 1;
-		my $end = $ac->[0] + $ac->[1];
-		my @chain;
-		push( @chain, $path[$start] );
-		push( @chain, @{$ac->[2]} );
-		push( @chain, $path[$end] );
-		$self->make_path_uncorrection( $wit->sigil, @chain );
-	    }
-	    @common_readings = _find_common( \@common_readings,
-					     $wit->uncorrected_path );
-	}
+	@common_readings = _find_common( \@common_readings, $wit->uncorrected_path );
     }
     return @common_readings;
 }
@@ -592,23 +588,19 @@ sub make_witness_paths {
 sub make_witness_path {
     my( $self, $wit ) = @_;
     my @chain = @{$wit->path};
-    $self->connect_readings_for_witness( $wit->sigil, @chain );
-}
-
-sub make_path_uncorrection {
-    my( $self, $sig, @chain ) = @_;
-    $sig .= ' (a.c.)';
-    $self->connect_readings_for_witness( $sig, @chain );
-}
-
-sub connect_readings_for_witness {
-    my( $self, $sig, @chain ) = @_;
+    my $sig = $wit->sigil;
     foreach my $idx ( 0 .. $#chain-1 ) {
 	$self->add_path( $chain[$idx], $chain[$idx+1], $sig );
     }
+    @chain = @{$wit->uncorrected_path};
+    foreach my $idx( 0 .. $#chain-1 ) {
+	my $source = $chain[$idx];
+	my $target = $chain[$idx+1];
+	$self->add_path( $source, $target, "$sig (a.c.)" )
+	    unless $self->has_path( $source, $target, $sig );
+    }
 }
 
-
 sub common_readings {
     my $self = shift;
     my @common = grep { $_->is_common } $self->readings();
@@ -676,7 +668,6 @@ sub _update_positions_from_path {
 	foreach my $di ( 1..scalar(@{$wit_matrix->[$li-1]}) ) {
 	    my $reading = $wit_matrix->[$li-1]->[$di-1];
 	    my $position = "$li,$di";
-	    $DB::single = 1 unless ref( $reading ) eq 'Text::Tradition::Collation::Reading';
 
 	    # If we have seen this node before, we need to compare
 	    # its position with what went before.
diff --git a/lib/Text/Tradition/Parser/BaseText.pm b/lib/Text/Tradition/Parser/BaseText.pm
index ae5235c..3aeba95 100644
--- a/lib/Text/Tradition/Parser/BaseText.pm
+++ b/lib/Text/Tradition/Parser/BaseText.pm
@@ -40,14 +40,12 @@ Takes an initialized graph and a set of options, which must include:
 
 =cut
 
-my $DETRANSPOSE = 0;
 sub parse {
     my( $tradition, %opts ) = @_;
 
     my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'};
     load( $format_mod );
     my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} );
-    $DETRANSPOSE = 1 if $opts{'linear'};
     merge_base( $tradition->collation, $opts{'base'}, @apparatus_entries );
 }
 
@@ -78,7 +76,7 @@ underscore in its name.
 
 =cut
 
-my $SHORT = 25;  # Debug var - set this to limit the number of lines parsed
+my $SHORTEND; # Debug var - set this to limit the number of lines parsed
 
 my %base_text_index;
 my $edits_required = {};
@@ -94,7 +92,7 @@ sub merge_base {
     foreach my $app ( @app_entries ) {
 	my( $line, $num ) = split( /\./, $app->{_id} );
 	# DEBUG with a short graph
-	last if $SHORT && $line > $SHORT;
+	last if $SHORTEND && $line > $SHORTEND;
 	# DEBUG for problematic entries
 	my $scrutinize = '';
 	my $first_line_reading = $base_line_starts[ $line ];
@@ -213,9 +211,12 @@ sub merge_base {
 
 	# Now collate and collapse the identical readings within the
 	# collated sets.  Modifies the reading sets that were passed.
-	$DB::single = 1 if "$line.$num" eq '16.2';
 	collate_variants( $collation, @reading_sets );
 
+	# TODO Here would be a very good place to set up relationships
+	# between the nodes and the lemma.
+	set_relationships( $app, \@lemma_set, $variant_objects );
+
 	# Now create the splice-edit objects that will be used
 	# to reconstruct each witness.
 
@@ -247,17 +248,23 @@ sub merge_base {
 
     # Now make the witness objects, and create their text sequences
     foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) {
+	print STDERR "Creating witness $w\n";
 	my $witness_obj = $collation->tradition->add_witness( sigil => $w );
 	my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w} );
 	my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"} )
 	    if exists( $edits_required->{$w."_post"} );
 
+	my @repeated = _check_for_repeated( @ante_corr_seq );
+	warn "Repeated elements @repeated in $w a.c."
+	    if @repeated;
+	@repeated = _check_for_repeated( @post_corr_seq );
+	warn "Repeated elements @repeated in $w p.c."
+	    if @repeated;
+
 	# Now save these paths in my witness object
 	if( @post_corr_seq ) {
 	    $witness_obj->path( \@post_corr_seq );
-	    my @ante_corr = make_witness_uncorrections( \@post_corr_seq,
-							\@ante_corr_seq );
-	    $witness_obj->ante_corr( \@ante_corr );
+	    $witness_obj->uncorrected_path( \@ante_corr_seq );
 	} else {
 	    $witness_obj->path( \@ante_corr_seq );
 	}
@@ -279,6 +286,20 @@ sub merge_base {
     $collation->calculate_positions( @common_readings );
 }
 
+sub _check_for_repeated {
+    my @seq = @_;
+    my %unique;
+    my @repeated;
+    foreach ( @seq ) {
+	if( exists $unique{$_->name} ) {
+	    push( @repeated, $_->name );
+	} else {
+	    $unique{$_->name} = 1;
+	}
+    }
+    return @repeated;
+}
+
 =item B<read_base>
 
 my @line_beginnings = read_base( 'reference.txt', $collation );
@@ -312,7 +333,7 @@ sub read_base {
 	my $started = 0;
 	my $wordref = 0;
 	my $lineref = scalar @$lineref_array;
-	last if $SHORT && $lineref > $SHORT;
+	last if $SHORTEND && $lineref > $SHORTEND;
 	foreach my $w ( @words ) {
 	    my $readingref = join( ',', $lineref, ++$wordref );
 	    my $reading = $collation->add_reading( $readingref );
@@ -370,18 +391,24 @@ sub collate_variants {
 
     while( @reading_sets ) {
 	my $variant_set = shift @reading_sets;
-	if( $DETRANSPOSE ) {
+	if( $collation->linear ) {
 	    # Use diff to do this job
 	    my $diff = Algorithm::Diff->new( \@unique, $variant_set, 
 					     {'keyGen' => \&_collation_hash} );
 	    my @new_unique;
+	    my %merged;
 	    while( $diff->Next ) {
 		if( $diff->Same ) {
 		    # merge the nodes
 		    my @l = $diff->Items( 1 );
 		    my @v = $diff->Items( 2 );
 		    foreach my $i ( 0 .. $#l ) {
-			$collation->merge_readings( $l[$i], $v[$i] );
+			if( !$merged{$l[$i]->name} ) {
+			    $collation->merge_readings( $l[$i], $v[$i] );
+			    $merged{$l[$i]->name} = 1;
+			} else {
+			    print STDERR "Would have double merged " . $l[$i]->name . "\n";
+			}
 		    }
 		    # splice the lemma nodes into the variant set
 		    my( $offset ) = $diff->Get( 'min2' );
@@ -399,13 +426,25 @@ sub collate_variants {
 	    # It becomes a much simpler job
 	    $DB::single = 1;
 	    my @distinct;
+	    my %merged;
 	    foreach my $idx ( 0 .. $#{$variant_set} ) {
 		my $vw = $variant_set->[$idx];
 		my @same = grep { cmp_str( $_ ) eq $vw->label } @unique;
+		my $matched;
 		if( @same ) {
-		    $collation->merge_readings( $same[0], $vw );
-		    $variant_set->[$idx] = $same[0];
-		} else {
+		    foreach my $i ( 0 .. $#same ) {
+			unless( $merged{$same[$i]->name} ) {
+			    print STDERR sprintf( "Merging %s into %s\n", 
+						  $vw->name,
+						  $same[$i]->name );
+			    $collation->merge_readings( $same[$i], $vw );
+			    $merged{$same[$i]->name} = 1;
+			    $matched = $i;
+			    $variant_set->[$idx] = $same[$i];
+			}
+		    }
+		}
+		unless( @same && defined($matched) ) {
 		    push( @distinct, $vw );
 		}
 	    }
@@ -422,6 +461,23 @@ sub _collation_hash {
     return cmp_str( $node );
 }
 
+sub set_relationships {
+    my( $app, $lemma, $variants ) = @_;
+    foreach my $rkey ( keys %$variants ) {
+	my $var = $variants->{$rkey}->{'reading'};
+	my $typekey = sprintf( "_%s_type", $rkey );
+	my $type = $app->{$typekey};
+	
+	# Transposition: look for nodes with the same label but different IDs
+	# and mark them as transposed-identical.
+
+	# Lexical / Grammatical / Spelling: look for non-identical nodes.
+	# Need to work out how to handle many-to-many mapping.
+    }
+}
+	
+
+
 sub apply_edits {
     my( $collation, $edit_sequence ) = @_;
     my @lemma_names = sort { $base_text_index{$a} <=> $base_text_index{$b} }
@@ -437,23 +493,6 @@ sub apply_edits {
     }
     return @lemma_text;
 }
-
-sub make_witness_uncorrections {
-    my( $path, $uncorr_path ) = @_;
-    my $diff = Algorithm::Diff->new( $path, $uncorr_path, 
-				     { 'keyGen' => \&_collation_hash } );
-    # We basically just want to make a bunch of splice arguments that
-    # will reconstruct the ante-corr text from the post-corr.
-    my @diff_list;
-    while( $diff->Next ) {
-	next if $diff->Same;
-	my( $offset ) = $diff->Get( 'min1' );
-	my $length = scalar( $diff->Items( 1 ) );
-	my $items = [];	push( @$items, $diff->Items( 2 ) );
-	push( @diff_list, [ $offset, $length, $items ] );
-    }
-    return @diff_list;
-}
 	
 
 # Helper function. Given a witness sigil, if it is a post-correctione
diff --git a/lib/Text/Tradition/Witness.pm b/lib/Text/Tradition/Witness.pm
index 145eba4..934bb51 100644
--- a/lib/Text/Tradition/Witness.pm
+++ b/lib/Text/Tradition/Witness.pm
@@ -32,18 +32,9 @@ has 'path' => (
     predicate => 'has_path',
     );	       
 
-subtype 'Correction',
-    as 'ArrayRef',
-    where { @{$_} == 3 &&
-	    find_type_constraint('Int')->check( $_->[0] ) &&
-	    find_type_constraint('Int')->check( $_->[1] ) &&
-	    find_type_constraint('ArrayRef[Text::Tradition::Collation::Reading]')->check( $_->[2] );
-    },
-    message { 'Correction must be a tuple of [offset, length, list]' };
-
-has 'ante_corr' => (
+has 'uncorrected_path' => (
     is => 'rw',
-    isa => 'ArrayRef[Correction]',
+    isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
     predicate => 'has_ante_corr',
     );
     
@@ -79,21 +70,6 @@ around text => sub {
     $self->$orig( @_ );
 };
 
-sub uncorrected_path {
-    my $self = shift;
-
-    my @new_path;
-    push( @new_path, @{$self->path} );
-    my $drift = 0;
-    foreach my $change ( @{$self->ante_corr} ) {
-	my( $offset, $length, $items ) = @$change;
-	my $realoffset = $offset + $drift;
-	splice( @new_path, $realoffset, $length, @$items );
-	$drift += @$items - $length;
-    }
-    return \@new_path;
-}
-    
 
 no Moose;
 __PACKAGE__->meta->make_immutable;
diff --git a/script/svg_from_csv.pl b/script/svg_from_csv.pl
index 97e2741..9b72e7a 100644
--- a/script/svg_from_csv.pl
+++ b/script/svg_from_csv.pl
@@ -11,7 +11,7 @@ use Text::Tradition;
 my $tradition = Text::Tradition->new(
     'CSV' => $ARGV[0],
     'base' => $ARGV[1],
-    'linear' => 1,
+    'linear' => 0,
     );