load XML::LibXML only when required; handle global relationships more correctly;...
Tara L Andrews [Sun, 15 Jul 2012 10:06:32 +0000 (12:06 +0200)]
Makefile.PL
lib/Text/Tradition/Analysis.pm
lib/Text/Tradition/Collation.pm
lib/Text/Tradition/Collation/RelationshipStore.pm
lib/Text/Tradition/Stemma.pm
lib/Text/Tradition/Witness.pm
script/join_readings.pl
script/propagate_transitive.pl
t/analysis.t

index 8d23a73..33b8642 100644 (file)
@@ -19,6 +19,7 @@ requires( 'KiokuDB::GC::Naive' );
 requires( 'KiokuDB::TypeMap' );
 requires( 'KiokuDB::TypeMap::Entry::Naive' );
 requires( 'KiokuX::Model' );
+requires( 'KiokuX::User::Util' );
 requires( 'Module::Load' );
 requires( 'Moose' );
 requires( 'Moose::Util::TypeConstraints' );
index c51f3fd..1899952 100644 (file)
@@ -2,13 +2,13 @@ package Text::Tradition::Analysis;
 
 use strict;
 use warnings;
+use Algorithm::Diff;  # for word similarity measure
 use Benchmark;
 use Encode qw/ encode_utf8 /;
 use Exporter 'import';
 use Graph;
 use JSON qw/ encode_json decode_json /;
 use LWP::UserAgent;
-use Text::LevenshteinXS qw/ distance /;
 use Text::Tradition;
 use Text::Tradition::Stemma;
 use TryCatch;
@@ -202,6 +202,8 @@ sub run_analysis {
                $location->{'missing'} = [ keys %lmiss ];
                
                # Run the extra analysis we need.
+               ## TODO We run through all the variants in this call, so
+               ## why not add the reading data there instead of here below?
                analyze_location( $tradition, $stemma, $location, \%lmiss );
 
                my @layerwits;
@@ -680,7 +682,6 @@ sub analyze_location {
            }
        }
        }
-       
        # Get the actual graph we should work with
        my $graph;
        try {
@@ -715,6 +716,7 @@ sub analyze_location {
     # reading's evident parent(s).
     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
         my $rid = $rdghash->{'readingid'};
+        my $rdg = $c->reading( $rid );
         # Get the subgraph
         my $part = $subgraph->{$rid};
         
@@ -732,7 +734,11 @@ sub analyze_location {
                                my @next;
                                foreach my $wparent( @check ) {
                                        my $preading = $contig->{$wparent};
-                                       if( $preading ) {
+                                       # IDP assigns all nodes, hypothetical included, to a reading
+                                       # in the case of genealogical sets. We prune non-necessary
+                                       # hypothetical readings, but they are still in $contig, so
+                                       # we account for that here.
+                                       if( $preading && $preading ne $rid ) {
                                                $rdgparents->{$preading} = 1;
                                        } else {
                                                push( @next, $graph->predecessors( $wparent ) );
@@ -750,22 +756,40 @@ sub analyze_location {
                        if( $pobj ) {
                                my $rel = $c->get_relationship( $p, $rdghash->{readingid} );
                                if( $rel ) {
-                                       $phash->{relation} = { type => $rel->type };
-                                       if( $rel->has_annotation ) {
-                                               $phash->{relation}->{'annotation'} = $rel->annotation;
+                                       _add_to_hash( $rel, $phash );
+                               } elsif( $rdg ) {
+                                       # First check for a transposed relationship
+                                       if( $rdg->rank != $pobj->rank ) {
+                                               foreach my $ti ( $rdg->related_readings( 'transposition' ) ) {
+                                                       next unless $ti->text eq $rdg->text;
+                                                       $rel = $c->get_relationship( $ti, $pobj );
+                                                       if( $rel ) {
+                                                               _add_to_hash( $rel, $phash, 1 );
+                                                               last;
+                                                       }
+                                               }
+                                               unless( $rel ) {
+                                                       foreach my $ti ( $pobj->related_readings( 'transposition' ) ) {
+                                                               next unless $ti->text eq $pobj->text;
+                                                               $rel = $c->get_relationship( $ti, $rdg );
+                                                               if( $rel ) {
+                                                                       _add_to_hash( $rel, $phash, 1 );
+                                                                       last;
+                                                               }
+                                                       }
+                                               }
                                        }
-                               } elsif( $rdghash->{readingid} eq '(omitted)' ) {
-                                       $phash->{relation} = { type => 'deletion' };
-                               } elsif( $rdghash->{text} ) {
-                                       # Check for sheer word similarity.
-                                       my $rtext = $rdghash->{text};
-                                       my $ptext = $pobj->text;
-                                       my $min = length( $rtext ) > length( $ptext )
-                                               ? length( $ptext ) : length( $rtext );
-                                       my $distance = distance( $rtext, $ptext );
-                                       if( $distance < $min ) {
-                                               $phash->{relation} = { type => 'wordsimilar' };
+                                       unless( $rel ) {
+                                               # and then check for sheer word similarity.
+                                               my $rtext = $rdg->text;
+                                               my $ptext = $pobj->text;
+                                               if( similar( $rtext, $ptext ) ) {
+                                                       # say STDERR "Words $rtext and $ptext judged similar";
+                                                       $phash->{relation} = { type => 'wordsimilar' };
+                                               } 
                                        }
+                               } else {
+                                       $phash->{relation} = { type => 'deletion' };
                                }
                                # Get the attributes of the parent object while we are here
                                $phash->{'text'} = $pobj->text if $pobj;
@@ -807,6 +831,51 @@ sub analyze_location {
     }
 }
 
+sub _add_to_hash {
+       my( $rel, $phash, $is_transposed ) = @_;
+       $phash->{relation} = { type => $rel->type };
+       $phash->{relation}->{transposed} = 1 if $is_transposed;
+       $phash->{relation}->{annotation} = $rel->annotation
+               if $rel->has_annotation;
+}
+
+=head2 similar( $word1, $word2 )
+
+Use Algorithm::Diff to get a sense of how close the words are to each other.
+This will hopefully handle substitutions a bit more nicely than Levenshtein.
+
+=cut
+
+#!/usr/bin/env perl
+
+sub similar {
+       my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_;
+       my @let1 = split( '', lc( $word1 ) );
+       my @let2 = split( '', lc( $word2 ) );
+       my $diff = Algorithm::Diff->new( \@let1, \@let2 );
+       my $mag = 0;
+       while( $diff->Next ) {
+               if( $diff->Same ) {
+                       # Take off points for longer strings
+                       my $cs = $diff->Range(1) - 2;
+                       $cs = 0 if $cs < 0;
+                       $mag -= $cs;
+               } elsif( !$diff->Items(1) ) {
+                       $mag += $diff->Range(2);
+               } elsif( !$diff->Items(2) ) {
+                       $mag += $diff->Range(1);
+               } else {
+                       # Split the difference for substitutions
+                       my $c1 = $diff->Range(1) || 1;
+                       my $c2 = $diff->Range(2) || 1;
+                       my $cd = ( $c1 + $c2 ) / 2;
+                       $mag += $cd;
+               }
+       }
+       return ( $mag <= length( $word1 ) / 2 );
+}
+
+
 
 =head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
 
index 5b3cd96..dda8485 100644 (file)
@@ -11,8 +11,6 @@ use Text::Tradition::Collation::Reading;
 use Text::Tradition::Collation::RelationshipStore;
 use Text::Tradition::Error;
 use XML::Easy::Syntax qw( $xml10_namestartchar_rx $xml10_namechar_rx );
-use XML::LibXML;
-use XML::LibXML::XPathContext;
 use Moose;
 
 has 'sequence' => (
@@ -448,6 +446,15 @@ sub compress_readings {
        # readings.
        my %gobbled;
        foreach my $rdg ( sort { $a->rank <=> $b->rank } $self->readings ) {
+               # While we are here, get rid of any extra wordforms from a disambiguated
+               # reading.
+               if( $rdg->disambiguated ) {
+                       foreach my $lex ( $rdg->lexemes ) {
+                               $lex->clear_matching_forms();
+                               $lex->add_matching_form( $lex->form );
+                       }
+               }
+               # Now look for readings that can be joined to their successors.
                next if $rdg->is_meta;
                next if $gobbled{$rdg->id};
                next if $rdg->grammar_invalid || $rdg->is_nonsense;
@@ -1021,6 +1028,7 @@ sub as_graphml {
         'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
 
     # Create the document and root node
+    require XML::LibXML;
     my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
     my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
     $graphml->setDocumentElement( $root );
index a5a9529..4342bd9 100644 (file)
@@ -232,8 +232,12 @@ between the two reading strings. Returns undef if there is no general relationsh
 sub scoped_relationship {
        my( $self, $rdga, $rdgb ) = @_;
        my( $first, $second ) = sort( $rdga, $rdgb );
+       my( $lcfirst, $lcsecond ) = sort( lc( $rdga ), lc( $rdgb ) );
        if( exists $self->scopedrels->{$first}->{$second} ) {
                return $self->scopedrels->{$first}->{$second};
+       } elsif( exists $self->scopedrels->{$lcfirst}->{$lcsecond} ) {
+               my $rel = $self->scopedrels->{$lcfirst}->{$lcsecond};
+               return $rel->type ne 'orthographic' ? $rel : undef;
        } else {
                return undef;
        }
@@ -419,10 +423,8 @@ sub add_relationship {
        if( $options->{'scope'} ne 'local' ) {
                        # Is there a relationship with this a & b already?
                        # Case-insensitive for non-orthographics.
-                       my $rdga = $options->{'type'} eq 'orthographic' 
-                               ? $options->{'reading_a'} : lc( $options->{'reading_a'} );
-                       my $rdgb = $options->{'type'} eq 'orthographic' 
-                               ? $options->{'reading_b'} : lc( $options->{'reading_b'} );
+                       my $rdga = $options->{'reading_a'};
+                       my $rdgb = $options->{'reading_b'};
                        my $otherrel = $self->scoped_relationship( $rdga, $rdgb );
                        if( $otherrel && $otherrel->type eq $options->{type}
                                && $otherrel->scope eq $options->{scope} ) {
@@ -436,12 +438,6 @@ sub add_relationship {
     }
 
 
-       # Find all the pairs for which we need to set the relationship.
-       my @vectors;
-    if( $relationship->colocated && $relationship->nonlocal && !$thispaironly ) {
-       push( @vectors, $self->_find_applicable( $relationship ) );
-    }
-        
     # Now set the relationship(s).
     my @pairs_set;
        my $rel = $self->get_relationship( $source, $target );
@@ -464,19 +460,58 @@ sub add_relationship {
        $self->_set_relationship( $relationship, $source, $target ) unless $skip;
        push( @pairs_set, [ $source, $target ] );
     
-    # Set any additional relationships that might be in @vectors.
-    foreach my $v ( @vectors ) {
-       next if $v->[0] eq $source && $v->[1] eq $target;
-       next if $v->[1] eq $source && $v->[0] eq $target;
-       my @added = $self->add_relationship( @$v, $relationship );
-       push( @pairs_set, @added );
+       # Find all the pairs for which we need to set the relationship.
+    if( $relationship->colocated && $relationship->nonlocal && !$thispaironly ) {
+               push( @pairs_set, $self->add_global_relationship( $relationship ) );
     }
-    
     # Finally, restore whatever collations we can, and return.
     $self->_restore_collations( @$droppedcolls );
     return @pairs_set;
 }
 
+=head2 add_global_relationship( $options, $skipvector )
+
+Adds the relationship specified wherever the relevant readings appear together 
+in the graph.  Options as in add_relationship above. 
+
+=cut
+
+sub add_global_relationship {
+       my( $self, $options ) = @_;
+       # First see if we are dealing with a relationship object already
+       my $relationship;
+       if( ref( $options ) eq 'Text::Tradition::Collation::Relationship' ) {
+               $relationship = $options;
+       } else {
+               # Then see if a scoped relationship already applies for the words.
+               my $scopedrel = $self->scoped_relationship( 
+                       $options->{reading_a}, $options->{reading_b} );
+               $relationship = $scopedrel ? $scopedrel 
+                       : $self->create( $options );
+       }
+       # Sanity checking
+       throw( "Relationship passed to add_global is not global" )
+               unless $relationship->nonlocal;
+       throw( "Relationship passed to add_global is not a valid global type" )
+               unless $relationship->colocated && $relationship->type ne 'collated';
+               
+       # Apply the relationship wherever it is valid
+       my @pairs_set;
+    foreach my $v ( $self->_find_applicable( $relationship ) ) {
+       my $exists = $self->get_relationship( @$v );
+       if( $exists && $exists->type ne 'collated' ) {
+               throw( "Found conflicting relationship at @$v" )
+                       unless $exists->type eq $relationship->type
+                               && $exists->scope eq $relationship->scope;
+       } else {
+               my @added = $self->add_relationship( @$v, $relationship );
+               push( @pairs_set, @added );
+       }
+    }
+       return @pairs_set;      
+}
+
+
 =head2 del_scoped_relationship( $reading_a, $reading_b )
 
 Returns the general (document-level or global) relationship that has been defined 
index d6f9646..661c205 100644 (file)
@@ -8,7 +8,6 @@ use Graph::Reader::Dot;
 use IPC::Run qw/ run binary /;
 use Text::Tradition::Error;
 use Text::Tradition::StemmaUtil qw/ character_input phylip_pars parse_newick /;
-use XML::LibXML;
 use Moose;
 
 =head1 NAME
@@ -347,9 +346,10 @@ sub as_svg {
     run( \@cmd, ">", binary(), \$svg );
     # HACK: Parse the SVG and change the dimensions.
     # Get rid of width and height attributes to allow scaling.
-    my $parser = XML::LibXML->new();
-    my $svgdoc = $parser->parse_string( decode_utf8( $svg ) );
     if( $opts->{'size'} ) {
+       require XML::LibXML;
+               my $parser = XML::LibXML->new();
+               my $svgdoc = $parser->parse_string( decode_utf8( $svg ) );
        my( $ew, $eh ) = @{$opts->{'size'}};
        # If the graph is wider than it is tall, set width to ew and remove height.
        # Otherwise set height to eh and remove width.
@@ -369,9 +369,10 @@ sub as_svg {
                }
                $svgdoc->documentElement->removeAttribute( $remove );
                $svgdoc->documentElement->setAttribute( $keep, $val );
+               $svg = $svgdoc->toString();
        }
     # Return the result
-    return decode_utf8( $svgdoc->toString );
+    return decode_utf8( $svg );
 }
 
 =head2 witnesses
index 3113e54..ab5b2f4 100644 (file)
@@ -348,6 +348,7 @@ sub _init_from_xmldesc {
                }
                $xmlobj = $self->object;
        } else {
+               require XML::LibXML;
                my $parser = XML::LibXML->new();
                my $parsersub = $self->has_file ? 'parse_file' : 'parse_string';
                try {
index a5ef593..31f780a 100755 (executable)
@@ -32,8 +32,10 @@ my $dir = Text::Tradition::Directory->new( $dbopts );
 my $scope = $dir->new_scope();
 my $lookfor = $ARGV[0] || '';
 foreach my $tinfo ( $dir->traditionlist() ) {
+       next if $tinfo->{'name'} eq 'xxxxx';
        next unless $tinfo->{'name'} =~ /$lookfor/ || $tinfo->{'id'} eq $lookfor;
        my $tradition = $dir->lookup( $tinfo->{'id'} );
+       say "Looking at tradition " . $tradition->name;
        my $c = $tradition->collation;
 
        # Anywhere in the graph that there is a reading that joins only to a single
index 2d49467..22cfee8 100755 (executable)
@@ -40,8 +40,10 @@ my $dir = Text::Tradition::Directory->new( $dbopts );
 my $scope = $dir->new_scope();
 my $lookfor = $ARGV[0] || '';
 foreach my $tinfo ( $dir->traditionlist() ) {
+       next if $tinfo->{'name'} eq 'xxxxx';
        next unless $tinfo->{'name'} =~ /$lookfor/ || $tinfo->{'id'} eq $lookfor;
        my $tradition = $dir->lookup( $tinfo->{'id'} );
+       say "Looking at tradition " . $tradition->name;
        my $c = $tradition->collation;
 
        my $represented_by = {};
@@ -58,16 +60,22 @@ foreach my $tinfo ( $dir->traditionlist() ) {
        # the same word.
        foreach my $rel ( $c->relationships ) {
                my $relobj = $c->get_relationship( $rel );
-               next unless $relobj->type =~ /^(grammatical|lexical)$/;
-               my $r1pool = $represented_by->{$representative->{$rel->[0]}};
-               my $r2pool = $represented_by->{$representative->{$rel->[1]}};
-               # Error check
-               if( check_distinct( $r1pool, $r2pool ) ) {
-                       map { propagate_rel( $c, $relobj->type, $_, @$r2pool ) } @$r1pool;
-               } else {
-                       warn "Pools not distinct for " . join( ' and ', @$rel );
+               if( $relobj->type =~ /^(grammatical|lexical)$/ ) {
+                       my $r1pool = $represented_by->{$representative->{$rel->[0]}};
+                       my $r2pool = $represented_by->{$representative->{$rel->[1]}};
+                       # Error check
+                       if( check_distinct( $r1pool, $r2pool ) ) {
+                               map { propagate_rel( $c, $relobj->type, $_, @$r2pool ) } @$r1pool;
+                       } else {
+                               warn "Pools not distinct for " . join( ' and ', @$rel );
+                       }
+               } elsif( $relobj->type eq 'transposition' ) {
+                       # We also need to propagate transposition links, but rather more strictly.
+                       propagate_rel( $c, 'transposition', map { $c->reading( $_ ) } @$rel );
                }
        }
+       
+       
        $dir->save( $tradition ) unless $testrun;
 }
 
@@ -77,14 +85,28 @@ sub propagate_rel {
        while( @list ) {
                foreach my $r ( @list ) {
                        next if $curr eq $r;
-                       my $hasrel = $c->get_relationship( $curr, $r );
-                       if( !$hasrel || $hasrel->type eq 'collated' ) {
-                               say STDERR "Propagating $type relationship $curr -> $r";
-                               $c->add_relationship( $curr, $r, { type => $type } );
-                       } elsif( $hasrel->type ne $type ) {
-                               warn "Found relationship conflict at $curr / $r: "
-                                       . $hasrel->type . " instead of $type"
-                                       unless( $TYPEVALUES{$hasrel->type} < $TYPEVALUES{$type} );
+                       # Check that the given relationship type exists between $curr and $r.
+                       # Also check that the given relationship type exists between $curr and
+                       # the same-type relationships of $r.
+                       my @candidates = ( $r );
+                       foreach my $rrel ( $r->related_readings() ) {
+                               next if $curr eq $rrel;
+                               my $rrelobj = $c->get_relationship( $r, $rrel );
+                               if( $rrelobj && $rrelobj->type eq $type ) {
+                                       my $hasrrel = $c->get_relationship( $curr, $rrel );
+                                       push( @candidates, $rrel ) unless $hasrrel;
+                               }
+                       }
+                       foreach my $cand ( @candidates ) {
+                               my $hasrel = $c->get_relationship( $curr, $cand );
+                               if( !$hasrel || $hasrel->type eq 'collated' ) {
+                                       say STDERR "Propagating $type relationship $curr -> $cand";
+                                       $c->add_relationship( $curr, $cand, { type => $type } );
+                               } elsif( $hasrel->type ne $type ) {
+                                       warn "Found relationship conflict at $curr / $cand: "
+                                               . $hasrel->type . " instead of $type"
+                                               unless( $TYPEVALUES{$hasrel->type} < $TYPEVALUES{$type} );
+                               }
                        }
                }
                $curr = shift @list;
index 30c3852..b9e9d83 100755 (executable)
@@ -130,15 +130,15 @@ foreach my $row ( @{$results->{'variants'}} ) {
                        my %is_parent;
                        my @has_no_parent;
                        foreach my $rdg ( @{$row->{'readings'}} ) {
-                               my $parents = $rdg->{'reading_parents'} || [];
-                               foreach my $p ( @$parents ) {
+                               my $parents = $rdg->{'reading_parents'} || {};
+                               foreach my $p ( keys %$parents ) {
                                        push( @{$is_parent{$p}}, $rdg->{'readingid'} );
                                }
-                               push( @has_no_parent, $rdg->{'readingid'} ) unless @$parents;
+                               push( @has_no_parent, $rdg->{'readingid'} ) unless keys %$parents;
                        }
                        # Test some stuff
                        foreach my $rdg ( @{$row->{'readings'}} ) {
-                               is( $rdg->{'independent_occurrence'}, 1, 
+                               is( @{$rdg->{'independent_occurrence'}}, 1, 
                                        "Genealogical reading originates exactly once" );
                        }
                        is( @has_no_parent, 1, "Only one genealogical reading lacks a parent" );