X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2FText%2FTradition%2FAnalysis.pm;h=18999526e0afa3c55076c19d7c19713ce8785187;hb=428bcf0bc79f77a7857b21ef881708faa792e33a;hp=9cb4f394b02f8197b295013b0a8b0f058fff1d7e;hpb=d120c995b9d6154a340858d90b46436af3758d90;p=scpubgit%2Fstemmatology.git

diff --git a/lib/Text/Tradition/Analysis.pm b/lib/Text/Tradition/Analysis.pm
index 9cb4f39..1899952 100644
--- a/lib/Text/Tradition/Analysis.pm
+++ b/lib/Text/Tradition/Analysis.pm
@@ -2,6 +2,7 @@ package Text::Tradition::Analysis;
 
 use strict;
 use warnings;
+use Algorithm::Diff;  # for word similarity measure
 use Benchmark;
 use Encode qw/ encode_utf8 /;
 use Exporter 'import';
@@ -201,6 +202,8 @@ sub run_analysis {
 		$location->{'missing'} = [ keys %lmiss ];
 		
 		# Run the extra analysis we need.
+		## TODO We run through all the variants in this call, so
+		## why not add the reading data there instead of here below?
 		analyze_location( $tradition, $stemma, $location, \%lmiss );
 
 		my @layerwits;
@@ -213,6 +216,8 @@ sub run_analysis {
 			if( $rdg ) {
 				$rdghash->{'text'} = $rdg->text . 
 					( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
+				$rdghash->{'is_ungrammatical'} = $rdg->grammar_invalid;
+				$rdghash->{'is_nonsense'} = $rdg->is_nonsense;
 			}
 			# Remove lacunose witnesses from this reading's list now that the
 			# analysis is done 
@@ -326,9 +331,9 @@ sub group_variants {
 		
 	# If something was transposed, check the groups for doubled-up readings
 	if( $has_transposition ) {
-		print STDERR "Group for rank $rank:\n";
-		map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" } 
-			keys %$grouped_readings;
+		# print STDERR "Group for rank $rank:\n";
+		# map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" } 
+		# 	keys %$grouped_readings;
 		_check_transposed_consistency( $c, $rank, $transposed, $grouped_readings );
 	}
 	
@@ -401,7 +406,7 @@ sub _check_transposed_consistency {
 				delete $groupings->{$rdg};
 				# If we found a group match, assume there is a symmetry happening.
 				# TODO think more about this
-				print STDERR "*** Deleting symmetric reading $rdg\n";
+				# print STDERR "*** Deleting symmetric reading $rdg\n";
 				unless( $matched ) {
 					delete $transposed->{$rdg};
 					warn "Found problem in evident symmetry with reading $rdg";
@@ -414,7 +419,7 @@ sub _check_transposed_consistency {
 			foreach my $rdg ( @{$seen_wits{$dup}} ) {
 				next if $thisrank{$rdg};
 				next unless exists $groupings->{$rdg};
-				print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
+				# print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
 				delete $groupings->{$rdg};
 				delete $transposed->{$rdg};
 			}
@@ -667,7 +672,6 @@ sub analyze_location {
 	my $subgraph = {};
 	my $acstr = $c->ac_label;
 	my @acwits;
-	$DB::single = 1 if $variant_row->{id} == 87;
 	# Note which witnesses positively belong to which group
     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
     	my $rid = $rdghash->{'readingid'};
@@ -678,7 +682,6 @@ sub analyze_location {
     	    }
     	}
 	}
-	
 	# Get the actual graph we should work with
 	my $graph;
 	try {
@@ -713,6 +716,7 @@ sub analyze_location {
     # reading's evident parent(s).
     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
         my $rid = $rdghash->{'readingid'};
+        my $rdg = $c->reading( $rid );
         # Get the subgraph
         my $part = $subgraph->{$rid};
         
@@ -730,7 +734,11 @@ sub analyze_location {
 				my @next;
 				foreach my $wparent( @check ) {
 					my $preading = $contig->{$wparent};
-					if( $preading ) {
+					# IDP assigns all nodes, hypothetical included, to a reading
+					# in the case of genealogical sets. We prune non-necessary
+					# hypothetical readings, but they are still in $contig, so
+					# we account for that here.
+					if( $preading && $preading ne $rid ) {
 						$rdgparents->{$preading} = 1;
 					} else {
 						push( @next, $graph->predecessors( $wparent ) );
@@ -743,18 +751,55 @@ sub analyze_location {
 			# Resolve the relationship of the parent to the reading, and
 			# save it in our hash.
 			my $pobj = $c->reading( $p );
-			my $relation;
 			my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
+			my $phash = { 'label' => $prep };
 			if( $pobj ) {
 				my $rel = $c->get_relationship( $p, $rdghash->{readingid} );
 				if( $rel ) {
-					$relation = { type => $rel->type };
-					if( $rel->has_annotation ) {
-						$relation->{'annotation'} = $rel->annotation;
+					_add_to_hash( $rel, $phash );
+				} elsif( $rdg ) {
+					# First check for a transposed relationship
+					if( $rdg->rank != $pobj->rank ) {
+						foreach my $ti ( $rdg->related_readings( 'transposition' ) ) {
+							next unless $ti->text eq $rdg->text;
+							$rel = $c->get_relationship( $ti, $pobj );
+							if( $rel ) {
+								_add_to_hash( $rel, $phash, 1 );
+								last;
+							}
+						}
+						unless( $rel ) {
+							foreach my $ti ( $pobj->related_readings( 'transposition' ) ) {
+								next unless $ti->text eq $pobj->text;
+								$rel = $c->get_relationship( $ti, $rdg );
+								if( $rel ) {
+									_add_to_hash( $rel, $phash, 1 );
+									last;
+								}
+							}
+						}
 					}
+					unless( $rel ) {
+						# and then check for sheer word similarity.
+						my $rtext = $rdg->text;
+						my $ptext = $pobj->text;
+						if( similar( $rtext, $ptext ) ) {
+							# say STDERR "Words $rtext and $ptext judged similar";
+							$phash->{relation} = { type => 'wordsimilar' };
+						} 
+					}
+				} else {
+					$phash->{relation} = { type => 'deletion' };
 				}
-			}	
-			$rdgparents->{$p} = { 'label' => $prep, 'relation' => $relation };
+				# Get the attributes of the parent object while we are here
+				$phash->{'text'} = $pobj->text if $pobj;
+				$phash->{'is_nonsense'} = $pobj->is_nonsense;
+				$phash->{'is_ungrammatical'} = $pobj->grammar_invalid;
+			} elsif( $p eq '(omitted)' ) {
+				$phash->{relation} = { type => 'addition' };
+			}
+			# Save it
+			$rdgparents->{$p} = $phash;
 		}
 			
 		$rdghash->{'reading_parents'} = $rdgparents;
@@ -786,6 +831,51 @@ sub analyze_location {
     }
 }
 
+sub _add_to_hash {
+	my( $rel, $phash, $is_transposed ) = @_;
+	$phash->{relation} = { type => $rel->type };
+	$phash->{relation}->{transposed} = 1 if $is_transposed;
+	$phash->{relation}->{annotation} = $rel->annotation
+		if $rel->has_annotation;
+}
+
+=head2 similar( $word1, $word2 )
+
+Use Algorithm::Diff to get a sense of how close the words are to each other.
+This will hopefully handle substitutions a bit more nicely than Levenshtein.
+
+=cut
+
+#!/usr/bin/env perl
+
+sub similar {
+	my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_;
+	my @let1 = split( '', lc( $word1 ) );
+	my @let2 = split( '', lc( $word2 ) );
+	my $diff = Algorithm::Diff->new( \@let1, \@let2 );
+	my $mag = 0;
+	while( $diff->Next ) {
+		if( $diff->Same ) {
+			# Take off points for longer strings
+			my $cs = $diff->Range(1) - 2;
+			$cs = 0 if $cs < 0;
+			$mag -= $cs;
+		} elsif( !$diff->Items(1) ) {
+			$mag += $diff->Range(2);
+		} elsif( !$diff->Items(2) ) {
+			$mag += $diff->Range(1);
+		} else {
+			# Split the difference for substitutions
+			my $c1 = $diff->Range(1) || 1;
+			my $c2 = $diff->Range(2) || 1;
+			my $cd = ( $c1 + $c2 ) / 2;
+			$mag += $cd;
+		}
+	}
+	return ( $mag <= length( $word1 ) / 2 );
+}
+
+
 
 =head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )