From: Tara L Andrews <tla@mit.edu>
Date: Sun, 15 Jul 2012 10:06:32 +0000 (+0200)
Subject: load XML::LibXML only when required; handle global relationships more correctly;... 
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?p=scpubgit%2Fstemmatology.git;a=commitdiff_plain;h=428bcf0bc79f77a7857b21ef881708faa792e33a

load XML::LibXML only when required; handle global relationships more correctly; analysis logic updates
---

diff --git a/Makefile.PL b/Makefile.PL
index 8d23a73..33b8642 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -19,6 +19,7 @@ requires( 'KiokuDB::GC::Naive' );
 requires( 'KiokuDB::TypeMap' );
 requires( 'KiokuDB::TypeMap::Entry::Naive' );
 requires( 'KiokuX::Model' );
+requires( 'KiokuX::User::Util' );
 requires( 'Module::Load' );
 requires( 'Moose' );
 requires( 'Moose::Util::TypeConstraints' );
diff --git a/lib/Text/Tradition/Analysis.pm b/lib/Text/Tradition/Analysis.pm
index c51f3fd..1899952 100644
--- a/lib/Text/Tradition/Analysis.pm
+++ b/lib/Text/Tradition/Analysis.pm
@@ -2,13 +2,13 @@ package Text::Tradition::Analysis;
 
 use strict;
 use warnings;
+use Algorithm::Diff;  # for word similarity measure
 use Benchmark;
 use Encode qw/ encode_utf8 /;
 use Exporter 'import';
 use Graph;
 use JSON qw/ encode_json decode_json /;
 use LWP::UserAgent;
-use Text::LevenshteinXS qw/ distance /;
 use Text::Tradition;
 use Text::Tradition::Stemma;
 use TryCatch;
@@ -202,6 +202,8 @@ sub run_analysis {
 		$location->{'missing'} = [ keys %lmiss ];
 		
 		# Run the extra analysis we need.
+		## TODO We run through all the variants in this call, so
+		## why not add the reading data there instead of here below?
 		analyze_location( $tradition, $stemma, $location, \%lmiss );
 
 		my @layerwits;
@@ -680,7 +682,6 @@ sub analyze_location {
     	    }
     	}
 	}
-	
 	# Get the actual graph we should work with
 	my $graph;
 	try {
@@ -715,6 +716,7 @@ sub analyze_location {
     # reading's evident parent(s).
     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
         my $rid = $rdghash->{'readingid'};
+        my $rdg = $c->reading( $rid );
         # Get the subgraph
         my $part = $subgraph->{$rid};
         
@@ -732,7 +734,11 @@ sub analyze_location {
 				my @next;
 				foreach my $wparent( @check ) {
 					my $preading = $contig->{$wparent};
-					if( $preading ) {
+					# IDP assigns all nodes, hypothetical included, to a reading
+					# in the case of genealogical sets. We prune non-necessary
+					# hypothetical readings, but they are still in $contig, so
+					# we account for that here.
+					if( $preading && $preading ne $rid ) {
 						$rdgparents->{$preading} = 1;
 					} else {
 						push( @next, $graph->predecessors( $wparent ) );
@@ -750,22 +756,40 @@ sub analyze_location {
 			if( $pobj ) {
 				my $rel = $c->get_relationship( $p, $rdghash->{readingid} );
 				if( $rel ) {
-					$phash->{relation} = { type => $rel->type };
-					if( $rel->has_annotation ) {
-						$phash->{relation}->{'annotation'} = $rel->annotation;
+					_add_to_hash( $rel, $phash );
+				} elsif( $rdg ) {
+					# First check for a transposed relationship
+					if( $rdg->rank != $pobj->rank ) {
+						foreach my $ti ( $rdg->related_readings( 'transposition' ) ) {
+							next unless $ti->text eq $rdg->text;
+							$rel = $c->get_relationship( $ti, $pobj );
+							if( $rel ) {
+								_add_to_hash( $rel, $phash, 1 );
+								last;
+							}
+						}
+						unless( $rel ) {
+							foreach my $ti ( $pobj->related_readings( 'transposition' ) ) {
+								next unless $ti->text eq $pobj->text;
+								$rel = $c->get_relationship( $ti, $rdg );
+								if( $rel ) {
+									_add_to_hash( $rel, $phash, 1 );
+									last;
+								}
+							}
+						}
 					}
-				} elsif( $rdghash->{readingid} eq '(omitted)' ) {
-					$phash->{relation} = { type => 'deletion' };
-				} elsif( $rdghash->{text} ) {
-					# Check for sheer word similarity.
-					my $rtext = $rdghash->{text};
-					my $ptext = $pobj->text;
-					my $min = length( $rtext ) > length( $ptext )
-						? length( $ptext ) : length( $rtext );
-					my $distance = distance( $rtext, $ptext );
-					if( $distance < $min ) {
-						$phash->{relation} = { type => 'wordsimilar' };
+					unless( $rel ) {
+						# and then check for sheer word similarity.
+						my $rtext = $rdg->text;
+						my $ptext = $pobj->text;
+						if( similar( $rtext, $ptext ) ) {
+							# say STDERR "Words $rtext and $ptext judged similar";
+							$phash->{relation} = { type => 'wordsimilar' };
+						} 
 					}
+				} else {
+					$phash->{relation} = { type => 'deletion' };
 				}
 				# Get the attributes of the parent object while we are here
 				$phash->{'text'} = $pobj->text if $pobj;
@@ -807,6 +831,51 @@ sub analyze_location {
     }
 }
 
+sub _add_to_hash {
+	my( $rel, $phash, $is_transposed ) = @_;
+	$phash->{relation} = { type => $rel->type };
+	$phash->{relation}->{transposed} = 1 if $is_transposed;
+	$phash->{relation}->{annotation} = $rel->annotation
+		if $rel->has_annotation;
+}
+
+=head2 similar( $word1, $word2 )
+
+Use Algorithm::Diff to get a sense of how close the words are to each other.
+This will hopefully handle substitutions a bit more nicely than Levenshtein.
+
+=cut
+
+#!/usr/bin/env perl
+
+sub similar {
+	my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_;
+	my @let1 = split( '', lc( $word1 ) );
+	my @let2 = split( '', lc( $word2 ) );
+	my $diff = Algorithm::Diff->new( \@let1, \@let2 );
+	my $mag = 0;
+	while( $diff->Next ) {
+		if( $diff->Same ) {
+			# Take off points for longer strings
+			my $cs = $diff->Range(1) - 2;
+			$cs = 0 if $cs < 0;
+			$mag -= $cs;
+		} elsif( !$diff->Items(1) ) {
+			$mag += $diff->Range(2);
+		} elsif( !$diff->Items(2) ) {
+			$mag += $diff->Range(1);
+		} else {
+			# Split the difference for substitutions
+			my $c1 = $diff->Range(1) || 1;
+			my $c2 = $diff->Range(2) || 1;
+			my $cd = ( $c1 + $c2 ) / 2;
+			$mag += $cd;
+		}
+	}
+	return ( $mag <= length( $word1 ) / 2 );
+}
+
+
 
 =head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
 
diff --git a/lib/Text/Tradition/Collation.pm b/lib/Text/Tradition/Collation.pm
index 5b3cd96..dda8485 100644
--- a/lib/Text/Tradition/Collation.pm
+++ b/lib/Text/Tradition/Collation.pm
@@ -11,8 +11,6 @@ use Text::Tradition::Collation::Reading;
 use Text::Tradition::Collation::RelationshipStore;
 use Text::Tradition::Error;
 use XML::Easy::Syntax qw( $xml10_namestartchar_rx $xml10_namechar_rx );
-use XML::LibXML;
-use XML::LibXML::XPathContext;
 use Moose;
 
 has 'sequence' => (
@@ -448,6 +446,15 @@ sub compress_readings {
 	# readings.
 	my %gobbled;
 	foreach my $rdg ( sort { $a->rank <=> $b->rank } $self->readings ) {
+		# While we are here, get rid of any extra wordforms from a disambiguated
+		# reading.
+		if( $rdg->disambiguated ) {
+			foreach my $lex ( $rdg->lexemes ) {
+				$lex->clear_matching_forms();
+				$lex->add_matching_form( $lex->form );
+			}
+		}
+		# Now look for readings that can be joined to their successors.
 		next if $rdg->is_meta;
 		next if $gobbled{$rdg->id};
 		next if $rdg->grammar_invalid || $rdg->is_nonsense;
@@ -1021,6 +1028,7 @@ sub as_graphml {
         'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
 
     # Create the document and root node
+    require XML::LibXML;
     my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
     my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
     $graphml->setDocumentElement( $root );
diff --git a/lib/Text/Tradition/Collation/RelationshipStore.pm b/lib/Text/Tradition/Collation/RelationshipStore.pm
index a5a9529..4342bd9 100644
--- a/lib/Text/Tradition/Collation/RelationshipStore.pm
+++ b/lib/Text/Tradition/Collation/RelationshipStore.pm
@@ -232,8 +232,12 @@ between the two reading strings. Returns undef if there is no general relationsh
 sub scoped_relationship {
 	my( $self, $rdga, $rdgb ) = @_;
 	my( $first, $second ) = sort( $rdga, $rdgb );
+	my( $lcfirst, $lcsecond ) = sort( lc( $rdga ), lc( $rdgb ) );
 	if( exists $self->scopedrels->{$first}->{$second} ) {
 		return $self->scopedrels->{$first}->{$second};
+	} elsif( exists $self->scopedrels->{$lcfirst}->{$lcsecond} ) {
+		my $rel = $self->scopedrels->{$lcfirst}->{$lcsecond};
+		return $rel->type ne 'orthographic' ? $rel : undef;
 	} else {
 		return undef;
 	}
@@ -419,10 +423,8 @@ sub add_relationship {
     	if( $options->{'scope'} ne 'local' ) {
 			# Is there a relationship with this a & b already?
 			# Case-insensitive for non-orthographics.
-			my $rdga = $options->{'type'} eq 'orthographic' 
-				? $options->{'reading_a'} : lc( $options->{'reading_a'} );
-			my $rdgb = $options->{'type'} eq 'orthographic' 
-				? $options->{'reading_b'} : lc( $options->{'reading_b'} );
+			my $rdga = $options->{'reading_a'};
+			my $rdgb = $options->{'reading_b'};
 			my $otherrel = $self->scoped_relationship( $rdga, $rdgb );
 			if( $otherrel && $otherrel->type eq $options->{type}
 				&& $otherrel->scope eq $options->{scope} ) {
@@ -436,12 +438,6 @@ sub add_relationship {
     }
 
 
-	# Find all the pairs for which we need to set the relationship.
-	my @vectors;
-    if( $relationship->colocated && $relationship->nonlocal && !$thispaironly ) {
-    	push( @vectors, $self->_find_applicable( $relationship ) );
-    }
-        
     # Now set the relationship(s).
     my @pairs_set;
 	my $rel = $self->get_relationship( $source, $target );
@@ -464,19 +460,58 @@ sub add_relationship {
 	$self->_set_relationship( $relationship, $source, $target ) unless $skip;
 	push( @pairs_set, [ $source, $target ] );
     
-    # Set any additional relationships that might be in @vectors.
-    foreach my $v ( @vectors ) {
-    	next if $v->[0] eq $source && $v->[1] eq $target;
-    	next if $v->[1] eq $source && $v->[0] eq $target;
-    	my @added = $self->add_relationship( @$v, $relationship );
-    	push( @pairs_set, @added );
+	# Find all the pairs for which we need to set the relationship.
+    if( $relationship->colocated && $relationship->nonlocal && !$thispaironly ) {
+		push( @pairs_set, $self->add_global_relationship( $relationship ) );
     }
-    
     # Finally, restore whatever collations we can, and return.
     $self->_restore_collations( @$droppedcolls );
     return @pairs_set;
 }
 
+=head2 add_global_relationship( $options, $skipvector )
+
+Adds the relationship specified wherever the relevant readings appear together 
+in the graph.  Options as in add_relationship above. 
+
+=cut
+
+sub add_global_relationship {
+	my( $self, $options ) = @_;
+	# First see if we are dealing with a relationship object already
+	my $relationship;
+	if( ref( $options ) eq 'Text::Tradition::Collation::Relationship' ) {
+		$relationship = $options;
+	} else {
+		# Then see if a scoped relationship already applies for the words.
+		my $scopedrel = $self->scoped_relationship( 
+			$options->{reading_a}, $options->{reading_b} );
+		$relationship = $scopedrel ? $scopedrel 
+			: $self->create( $options );
+	}
+	# Sanity checking
+	throw( "Relationship passed to add_global is not global" )
+		unless $relationship->nonlocal;
+	throw( "Relationship passed to add_global is not a valid global type" )
+		unless $relationship->colocated && $relationship->type ne 'collated';
+		
+	# Apply the relationship wherever it is valid
+	my @pairs_set;
+    foreach my $v ( $self->_find_applicable( $relationship ) ) {
+    	my $exists = $self->get_relationship( @$v );
+    	if( $exists && $exists->type ne 'collated' ) {
+    		throw( "Found conflicting relationship at @$v" )
+    			unless $exists->type eq $relationship->type
+    				&& $exists->scope eq $relationship->scope;
+    	} else {
+	    	my @added = $self->add_relationship( @$v, $relationship );
+    		push( @pairs_set, @added );
+    	}
+    }
+	return @pairs_set;	
+}
+
+
 =head2 del_scoped_relationship( $reading_a, $reading_b )
 
 Returns the general (document-level or global) relationship that has been defined 
diff --git a/lib/Text/Tradition/Stemma.pm b/lib/Text/Tradition/Stemma.pm
index d6f9646..661c205 100644
--- a/lib/Text/Tradition/Stemma.pm
+++ b/lib/Text/Tradition/Stemma.pm
@@ -8,7 +8,6 @@ use Graph::Reader::Dot;
 use IPC::Run qw/ run binary /;
 use Text::Tradition::Error;
 use Text::Tradition::StemmaUtil qw/ character_input phylip_pars parse_newick /;
-use XML::LibXML;
 use Moose;
 
 =head1 NAME
@@ -347,9 +346,10 @@ sub as_svg {
     run( \@cmd, ">", binary(), \$svg );
     # HACK: Parse the SVG and change the dimensions.
     # Get rid of width and height attributes to allow scaling.
-    my $parser = XML::LibXML->new();
-    my $svgdoc = $parser->parse_string( decode_utf8( $svg ) );
     if( $opts->{'size'} ) {
+    	require XML::LibXML;
+		my $parser = XML::LibXML->new();
+		my $svgdoc = $parser->parse_string( decode_utf8( $svg ) );
     	my( $ew, $eh ) = @{$opts->{'size'}};
     	# If the graph is wider than it is tall, set width to ew and remove height.
     	# Otherwise set height to eh and remove width.
@@ -369,9 +369,10 @@ sub as_svg {
 		}
 		$svgdoc->documentElement->removeAttribute( $remove );
 		$svgdoc->documentElement->setAttribute( $keep, $val );
+		$svg = $svgdoc->toString();
 	}
     # Return the result
-    return decode_utf8( $svgdoc->toString );
+    return decode_utf8( $svg );
 }
 
 =head2 witnesses
diff --git a/lib/Text/Tradition/Witness.pm b/lib/Text/Tradition/Witness.pm
index 3113e54..ab5b2f4 100644
--- a/lib/Text/Tradition/Witness.pm
+++ b/lib/Text/Tradition/Witness.pm
@@ -348,6 +348,7 @@ sub _init_from_xmldesc {
 		}
 		$xmlobj = $self->object;
 	} else {
+		require XML::LibXML;
 		my $parser = XML::LibXML->new();
 		my $parsersub = $self->has_file ? 'parse_file' : 'parse_string';
 		try {
diff --git a/script/join_readings.pl b/script/join_readings.pl
index a5ef593..31f780a 100755
--- a/script/join_readings.pl
+++ b/script/join_readings.pl
@@ -32,8 +32,10 @@ my $dir = Text::Tradition::Directory->new( $dbopts );
 my $scope = $dir->new_scope();
 my $lookfor = $ARGV[0] || '';
 foreach my $tinfo ( $dir->traditionlist() ) {
+	next if $tinfo->{'name'} eq 'xxxxx';
 	next unless $tinfo->{'name'} =~ /$lookfor/ || $tinfo->{'id'} eq $lookfor;
 	my $tradition = $dir->lookup( $tinfo->{'id'} );
+	say "Looking at tradition " . $tradition->name;
 	my $c = $tradition->collation;
 
 	# Anywhere in the graph that there is a reading that joins only to a single
diff --git a/script/propagate_transitive.pl b/script/propagate_transitive.pl
index 2d49467..22cfee8 100755
--- a/script/propagate_transitive.pl
+++ b/script/propagate_transitive.pl
@@ -40,8 +40,10 @@ my $dir = Text::Tradition::Directory->new( $dbopts );
 my $scope = $dir->new_scope();
 my $lookfor = $ARGV[0] || '';
 foreach my $tinfo ( $dir->traditionlist() ) {
+	next if $tinfo->{'name'} eq 'xxxxx';
 	next unless $tinfo->{'name'} =~ /$lookfor/ || $tinfo->{'id'} eq $lookfor;
 	my $tradition = $dir->lookup( $tinfo->{'id'} );
+	say "Looking at tradition " . $tradition->name;
 	my $c = $tradition->collation;
 
 	my $represented_by = {};
@@ -58,16 +60,22 @@ foreach my $tinfo ( $dir->traditionlist() ) {
 	# the same word.
 	foreach my $rel ( $c->relationships ) {
 		my $relobj = $c->get_relationship( $rel );
-		next unless $relobj->type =~ /^(grammatical|lexical)$/;
-		my $r1pool = $represented_by->{$representative->{$rel->[0]}};
-		my $r2pool = $represented_by->{$representative->{$rel->[1]}};
-		# Error check
-		if( check_distinct( $r1pool, $r2pool ) ) {
-			map { propagate_rel( $c, $relobj->type, $_, @$r2pool ) } @$r1pool;
-		} else {
-			warn "Pools not distinct for " . join( ' and ', @$rel );
+		if( $relobj->type =~ /^(grammatical|lexical)$/ ) {
+			my $r1pool = $represented_by->{$representative->{$rel->[0]}};
+			my $r2pool = $represented_by->{$representative->{$rel->[1]}};
+			# Error check
+			if( check_distinct( $r1pool, $r2pool ) ) {
+				map { propagate_rel( $c, $relobj->type, $_, @$r2pool ) } @$r1pool;
+			} else {
+				warn "Pools not distinct for " . join( ' and ', @$rel );
+			}
+		} elsif( $relobj->type eq 'transposition' ) {
+			# We also need to propagate transposition links, but rather more strictly.
+			propagate_rel( $c, 'transposition', map { $c->reading( $_ ) } @$rel );
 		}
 	}
+	
+	
 	$dir->save( $tradition ) unless $testrun;
 }
 
@@ -77,14 +85,28 @@ sub propagate_rel {
 	while( @list ) {
 		foreach my $r ( @list ) {
 			next if $curr eq $r;
-			my $hasrel = $c->get_relationship( $curr, $r );
-			if( !$hasrel || $hasrel->type eq 'collated' ) {
-				say STDERR "Propagating $type relationship $curr -> $r";
-				$c->add_relationship( $curr, $r, { type => $type } );
-			} elsif( $hasrel->type ne $type ) {
-				warn "Found relationship conflict at $curr / $r: "
-					. $hasrel->type . " instead of $type"
-					unless( $TYPEVALUES{$hasrel->type} < $TYPEVALUES{$type} );
+			# Check that the given relationship type exists between $curr and $r.
+			# Also check that the given relationship type exists between $curr and
+			# the same-type relationships of $r.
+			my @candidates = ( $r );
+			foreach my $rrel ( $r->related_readings() ) {
+				next if $curr eq $rrel;
+				my $rrelobj = $c->get_relationship( $r, $rrel );
+				if( $rrelobj && $rrelobj->type eq $type ) {
+					my $hasrrel = $c->get_relationship( $curr, $rrel );
+					push( @candidates, $rrel ) unless $hasrrel;
+				}
+			}
+			foreach my $cand ( @candidates ) {
+				my $hasrel = $c->get_relationship( $curr, $cand );
+				if( !$hasrel || $hasrel->type eq 'collated' ) {
+					say STDERR "Propagating $type relationship $curr -> $cand";
+					$c->add_relationship( $curr, $cand, { type => $type } );
+				} elsif( $hasrel->type ne $type ) {
+					warn "Found relationship conflict at $curr / $cand: "
+						. $hasrel->type . " instead of $type"
+						unless( $TYPEVALUES{$hasrel->type} < $TYPEVALUES{$type} );
+				}
 			}
 		}
 		$curr = shift @list;
diff --git a/t/analysis.t b/t/analysis.t
index 30c3852..b9e9d83 100755
--- a/t/analysis.t
+++ b/t/analysis.t
@@ -130,15 +130,15 @@ foreach my $row ( @{$results->{'variants'}} ) {
 			my %is_parent;
 			my @has_no_parent;
 			foreach my $rdg ( @{$row->{'readings'}} ) {
-				my $parents = $rdg->{'reading_parents'} || [];
-				foreach my $p ( @$parents ) {
+				my $parents = $rdg->{'reading_parents'} || {};
+				foreach my $p ( keys %$parents ) {
 					push( @{$is_parent{$p}}, $rdg->{'readingid'} );
 				}
-				push( @has_no_parent, $rdg->{'readingid'} ) unless @$parents;
+				push( @has_no_parent, $rdg->{'readingid'} ) unless keys %$parents;
 			}
 			# Test some stuff
 			foreach my $rdg ( @{$row->{'readings'}} ) {
-				is( $rdg->{'independent_occurrence'}, 1, 
+				is( @{$rdg->{'independent_occurrence'}}, 1, 
 					"Genealogical reading originates exactly once" );
 			}
 			is( @has_no_parent, 1, "Only one genealogical reading lacks a parent" );