load XML::LibXML only when required; handle global relationships more correctly;...

[scpubgit/stemmatology.git] / lib / Text / Tradition / Analysis.pm
diff --git a/lib/Text/Tradition/Analysis.pm b/lib/Text/Tradition/Analysis.pm

index 9cb4f39..1899952 100644 (file)
--- a/lib/Text/Tradition/Analysis.pm
+++ b/lib/Text/Tradition/Analysis.pm
@@ -2,6 +2,7 @@ package Text::Tradition::Analysis;
 
 use strict;
 use warnings;
+use Algorithm::Diff;  # for word similarity measure
 use Benchmark;
 use Encode qw/ encode_utf8 /;
 use Exporter 'import';
@@ -201,6 +202,8 @@ sub run_analysis {
                $location->{'missing'} = [ keys %lmiss ];
                
                # Run the extra analysis we need.
+               ## TODO We run through all the variants in this call, so
+               ## why not add the reading data there instead of here below?
                analyze_location( $tradition, $stemma, $location, \%lmiss );
 
                my @layerwits;
@@ -213,6 +216,8 @@ sub run_analysis {
                        if( $rdg ) {
                                $rdghash->{'text'} = $rdg->text . 
                                        ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
+                               $rdghash->{'is_ungrammatical'} = $rdg->grammar_invalid;
+                               $rdghash->{'is_nonsense'} = $rdg->is_nonsense;
                        }
                        # Remove lacunose witnesses from this reading's list now that the
                        # analysis is done 
@@ -326,9 +331,9 @@ sub group_variants {
                
        # If something was transposed, check the groups for doubled-up readings
        if( $has_transposition ) {
-               print STDERR "Group for rank $rank:\n";
-               map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" } 
-                       keys %$grouped_readings;
+               # print STDERR "Group for rank $rank:\n";
+               # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" } 
+               #       keys %$grouped_readings;
                _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings );
        }
        
@@ -401,7 +406,7 @@ sub _check_transposed_consistency {
                                delete $groupings->{$rdg};
                                # If we found a group match, assume there is a symmetry happening.
                                # TODO think more about this
-                               print STDERR "*** Deleting symmetric reading $rdg\n";
+                               # print STDERR "*** Deleting symmetric reading $rdg\n";
                                unless( $matched ) {
                                        delete $transposed->{$rdg};
                                        warn "Found problem in evident symmetry with reading $rdg";
@@ -414,7 +419,7 @@ sub _check_transposed_consistency {
                        foreach my $rdg ( @{$seen_wits{$dup}} ) {
                                next if $thisrank{$rdg};
                                next unless exists $groupings->{$rdg};
-                               print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
+                               # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
                                delete $groupings->{$rdg};
                                delete $transposed->{$rdg};
                        }
@@ -667,7 +672,6 @@ sub analyze_location {
        my $subgraph = {};
        my $acstr = $c->ac_label;
        my @acwits;
-       $DB::single = 1 if $variant_row->{id} == 87;
        # Note which witnesses positively belong to which group
     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
        my $rid = $rdghash->{'readingid'};
@@ -678,7 +682,6 @@ sub analyze_location {
            }
        }
        }
-       
        # Get the actual graph we should work with
        my $graph;
        try {
@@ -713,6 +716,7 @@ sub analyze_location {
     # reading's evident parent(s).
     foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
         my $rid = $rdghash->{'readingid'};
+        my $rdg = $c->reading( $rid );
         # Get the subgraph
         my $part = $subgraph->{$rid};
         
@@ -730,7 +734,11 @@ sub analyze_location {
                                my @next;
                                foreach my $wparent( @check ) {
                                        my $preading = $contig->{$wparent};
-                                       if( $preading ) {
+                                       # IDP assigns all nodes, hypothetical included, to a reading
+                                       # in the case of genealogical sets. We prune non-necessary
+                                       # hypothetical readings, but they are still in $contig, so
+                                       # we account for that here.
+                                       if( $preading && $preading ne $rid ) {
                                                $rdgparents->{$preading} = 1;
                                        } else {
                                                push( @next, $graph->predecessors( $wparent ) );
@@ -743,18 +751,55 @@ sub analyze_location {
                        # Resolve the relationship of the parent to the reading, and
                        # save it in our hash.
                        my $pobj = $c->reading( $p );
-                       my $relation;
                        my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
+                       my $phash = { 'label' => $prep };
                        if( $pobj ) {
                                my $rel = $c->get_relationship( $p, $rdghash->{readingid} );
                                if( $rel ) {
-                                       $relation = { type => $rel->type };
-                                       if( $rel->has_annotation ) {
-                                               $relation->{'annotation'} = $rel->annotation;
+                                       _add_to_hash( $rel, $phash );
+                               } elsif( $rdg ) {
+                                       # First check for a transposed relationship
+                                       if( $rdg->rank != $pobj->rank ) {
+                                               foreach my $ti ( $rdg->related_readings( 'transposition' ) ) {
+                                                       next unless $ti->text eq $rdg->text;
+                                                       $rel = $c->get_relationship( $ti, $pobj );
+                                                       if( $rel ) {
+                                                               _add_to_hash( $rel, $phash, 1 );
+                                                               last;
+                                                       }
+                                               }
+                                               unless( $rel ) {
+                                                       foreach my $ti ( $pobj->related_readings( 'transposition' ) ) {
+                                                               next unless $ti->text eq $pobj->text;
+                                                               $rel = $c->get_relationship( $ti, $rdg );
+                                                               if( $rel ) {
+                                                                       _add_to_hash( $rel, $phash, 1 );
+                                                                       last;
+                                                               }
+                                                       }
+                                               }
                                        }
+                                       unless( $rel ) {
+                                               # and then check for sheer word similarity.
+                                               my $rtext = $rdg->text;
+                                               my $ptext = $pobj->text;
+                                               if( similar( $rtext, $ptext ) ) {
+                                                       # say STDERR "Words $rtext and $ptext judged similar";
+                                                       $phash->{relation} = { type => 'wordsimilar' };
+                                               } 
+                                       }
+                               } else {
+                                       $phash->{relation} = { type => 'deletion' };
                                }
-                       }       
-                       $rdgparents->{$p} = { 'label' => $prep, 'relation' => $relation };
+                               # Get the attributes of the parent object while we are here
+                               $phash->{'text'} = $pobj->text if $pobj;
+                               $phash->{'is_nonsense'} = $pobj->is_nonsense;
+                               $phash->{'is_ungrammatical'} = $pobj->grammar_invalid;
+                       } elsif( $p eq '(omitted)' ) {
+                               $phash->{relation} = { type => 'addition' };
+                       }
+                       # Save it
+                       $rdgparents->{$p} = $phash;
                }
                        
                $rdghash->{'reading_parents'} = $rdgparents;
@@ -786,6 +831,51 @@ sub analyze_location {
     }
 }
 
+sub _add_to_hash {
+       my( $rel, $phash, $is_transposed ) = @_;
+       $phash->{relation} = { type => $rel->type };
+       $phash->{relation}->{transposed} = 1 if $is_transposed;
+       $phash->{relation}->{annotation} = $rel->annotation
+               if $rel->has_annotation;
+}
+
+=head2 similar( $word1, $word2 )
+
+Use Algorithm::Diff to get a sense of how close the words are to each other.
+This will hopefully handle substitutions a bit more nicely than Levenshtein.
+
+=cut
+
+#!/usr/bin/env perl
+
+sub similar {
+       my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_;
+       my @let1 = split( '', lc( $word1 ) );
+       my @let2 = split( '', lc( $word2 ) );
+       my $diff = Algorithm::Diff->new( \@let1, \@let2 );
+       my $mag = 0;
+       while( $diff->Next ) {
+               if( $diff->Same ) {
+                       # Take off points for longer strings
+                       my $cs = $diff->Range(1) - 2;
+                       $cs = 0 if $cs < 0;
+                       $mag -= $cs;
+               } elsif( !$diff->Items(1) ) {
+                       $mag += $diff->Range(2);
+               } elsif( !$diff->Items(2) ) {
+                       $mag += $diff->Range(1);
+               } else {
+                       # Split the difference for substitutions
+                       my $c1 = $diff->Range(1) || 1;
+                       my $c2 = $diff->Range(2) || 1;
+                       my $cd = ( $c1 + $c2 ) / 2;
+                       $mag += $cd;
+               }
+       }
+       return ( $mag <= length( $word1 ) / 2 );
+}
+
+
 
 =head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )