let stexaminer run without IDP server reachable
[scpubgit/stemmatology.git] / lib / Text / Tradition / Analysis.pm
index feb4675..d9612d7 100644 (file)
@@ -3,14 +3,12 @@ package Text::Tradition::Analysis;
 use strict;
 use warnings;
 use Algorithm::Diff;  # for word similarity measure
-use Benchmark;
-use Digest::MD5 qw/ md5_hex /;
-use Encode qw/ encode_utf8 /;
+use Encode qw/ decode_utf8 encode_utf8 /;
 use Exporter 'import';
 use Graph;
-use JSON qw/ to_json /;
+use JSON qw/ to_json decode_json /;
+use LWP::UserAgent;
 use Set::Scalar;
-use Text::Tradition;
 use Text::Tradition::Analysis::Result;
 use Text::Tradition::Directory;
 use Text::Tradition::Stemma;
@@ -19,6 +17,7 @@ use TryCatch;
 use vars qw/ @EXPORT_OK /;
 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
 
+my $SOLVER_URL = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
 my $unsolved_problems = {};
 
 =head1 NAME
@@ -157,8 +156,12 @@ sub run_analysis {
        
        # Make sure we have a lookup DB for graph calculation results; this will die
        # if calcdir or calcdsn isn't passed.
-       my $dir = $opts{'calcdir'} ? delete $opts{'calcdir'}
-               : Text::Tradition::Directory->new( dsn => $opts{'calcdsn'} );
+       my $dir;
+       if( exists $opts{'calcdir'} ) {
+               $dir = delete $opts{'calcdir'}
+       } elsif ( exists $opts{'calcdsn'} ) {
+               $dir = Text::Tradition::Directory->new( dsn => $opts{'calcdsn'} );
+       }
 
        # Get the stemma        
        my $stemma = $tradition->stemma( $stemma_id );
@@ -203,7 +206,20 @@ sub run_analysis {
                $lacunae{$rank} = $missing;
        }
        # Run the solver
-       my $answer = solve_variants( $dir, @groups );
+       my $answer;
+       try {
+               $answer = solve_variants( $dir, @groups );
+       } catch ( Text::Tradition::Error $e ) {
+               if( $e->message =~ /IDP/ ) {
+                       # Something is wrong with the solver; make the variants table anyway
+                       $answer->{'variants'} = [];
+                       map { push( @{$answer->{'variants'}}, _init_unsolved( $_, 'IDP error' ) ) }
+                               @groups;
+               } else {
+                       # Something else is wrong; error out.
+                       $e->throw;
+               }
+       }
 
        # Do further analysis on the answer
        my $conflict_count = 0;
@@ -478,10 +494,10 @@ sub _graph_for_grouping {
                # needed to make up the groups.
                $graph = $stemma->situation_graph( $extant, $acwits );
        } catch ( Text::Tradition::Error $e ) {
-               die "Could not extend graph with given extant and a.c. witnesses: "
-                       . $e->message;
+               throw( "Could not extend graph with given extant and a.c. witnesses: "
+                       . $e->message );
        } catch {
-               die "Could not extend graph with a.c. witnesses @$acwits";
+               throw( "Could not extend graph with a.c. witnesses @$acwits" );
        }
        return $graph;
 }
@@ -491,10 +507,6 @@ sub _graph_for_grouping {
 Looks up the set of groups in the answers provided by the external graph solver 
 service and returns a cleaned-up answer, adding the rank IDs back where they belong.
 
-The JSON has the form 
-  { "graph": [ stemmagraph DOT string without newlines ],
-    "groupings": [ array of arrays of groups, one per rank ] }
-    
 The answer has the form 
   { "variants" => [ array of variant location structures ],
     "variant_count" => total,
@@ -504,91 +516,112 @@ The answer has the form
 =cut
 
 sub solve_variants {
-       my( $dir, @groups ) = @_;
+       my( @groups ) = @_;
+       
+       # Are we using a local result directory, or did we pass an empty value
+       # for the directory?
+       my $dir;
+       unless( ref( $groups[0] ) eq 'HASH' ) {
+               $dir = shift @groups;
+       }
 
-       ## For each graph/group combo, look it up in the DB.
-       ## Witness map is a HACK to get around limitations in node names from IDP
-       my $witness_map = {};
-       ## Variables to store answers as they come back
+       ## For each graph/group combo, make a Text::Tradition::Analysis::Result
+       ## object so that we can send it off for IDP lookup.
        my $variants = [];
        my $genealogical = 0; # counter
+       # TODO Optimize for unique graph problems
+       my %problems;
        foreach my $graphproblem ( @groups ) {
-               # Initialize the result structure for this graph problem
-               my $vstruct = { readings => [] };
-               push( @$variants, $vstruct );
-               
                # Construct the calc result key and look up its answer
-               my $reqkey = _get_calc_key( $graphproblem );
+               my $problem = Text::Tradition::Analysis::Result->new(
+                       graph => $graphproblem->{'graph'},
+                       setlist => [ values %{$graphproblem->{'grouping'}} ] );
+               if( exists $problems{$problem->object_key} ) {
+                       $problem = $problems{$problem->object_key};
+               } else {
+                       $problems{$problem->object_key} = $problem;
+               }
+               $graphproblem->{'object'} = $problem;
+       }
+       
+       my %results;
+       if( $dir ) {
                my $scope = $dir->new_scope;
-               my $answer = $dir->lookup( $reqkey );
-               unless( $answer ) {
-                       #warn "No answer found for graph problem $reqkey, moving on";
-                       # Record the unsolved problem so that we can go get a solution
-                       _save_problem( $graphproblem );
-                       # Put just the request, with no real result, into vstruct
-                       foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
-                               push( @{$vstruct->{readings}}, { readingid => $rid, 
-                                       group => [ $graphproblem->{grouping}->{$rid}->members ] } );
-                       }
-                       next;
+               map { $results{$_} = $dir->lookup( $_ ) || $problems{$_} } keys %problems;
+       } else {
+               my $json = JSON->new->allow_blessed->convert_blessed->utf8->encode( 
+                       [ values %problems ] );
+               # Send it off and get the result
+               # print STDERR "Sending request: " . decode_utf8( $json ) . "\n";
+               my $ua = LWP::UserAgent->new();
+               my $resp = $ua->post( $SOLVER_URL, 'Content-Type' => 'application/json', 
+                                                         'Content' => $json ); 
+               my $answer;     
+               if( $resp->is_success ) {
+                       $answer = decode_json( $resp->content );
+                       throw( "Unexpected answer from IDP: $answer" ) unless ref( $answer ) eq 'ARRAY';
+               } else {
+                       throw( "IDP solver returned " . $resp->status_line . " / " . $resp->content
+                               . "; cannot run graph analysis" );
                }
-
-               ## The answer is a Text::Tradition::Analysis::Result containing a bunch
-               ## of information about this variant set. Record the information therein.
+               # One more sanity check
+               throw( "Something went wrong with answer symmetricity" )
+                       unless keys( %problems ) == @$answer;
+               # Convert the results
+               foreach my $a ( @$answer ) {
+                       my $r = Text::Tradition::Analysis::Result->new( $a );
+                       $results{$r->object_key} = $r;
+               }
+       }
+       
+       # We now have a single JSON-encoded Result object per problem sent. Fold its
+       # answers into our variant info structure.
+       foreach my $graphproblem ( @groups ) {
+               my $result = $results{$graphproblem->{'object'}->object_key}
+                       || $graphproblem->{'object'};
                
+               # Initialize the result structure for this graph problem
+               my $vstruct;
+               if( $result->status eq 'OK' ) {
+                       $vstruct = { readings => [] };
+                       push( @$variants, $vstruct );
+               } else {
+                       push( @$variants, _init_unsolved( $graphproblem, $result->status ) );
+                       next;
+               }
+                               
                # 1. Did the group evaluate as genealogical?
-               $vstruct->{genealogical} = $answer->is_genealogical;
-               $genealogical++ if $answer->is_genealogical;
+               $vstruct->{genealogical} = $result->is_genealogical;
+               $genealogical++ if $result->is_genealogical;
                
                # 2. What are the calculated minimum groupings for each variant loc?
                foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
                        my $inputset = $graphproblem->{grouping}->{$rid};
-                       my $minset = $answer->minimum_grouping_for( $inputset );
+                       my $minset = $result->minimum_grouping_for( $inputset );
                        push( @{$vstruct->{readings}}, { readingid => $rid, group => $minset } );
                }
                
                # 3. What are the sources and classes calculated for each witness?
-               $vstruct->{witcopy_types} = { $answer->classes };
+               $vstruct->{witcopy_types} = { $result->classes };
                $vstruct->{reading_roots} = {};
-               map { $vstruct->{reading_roots}->{$_} = 1 } $answer->sources;
+               map { $vstruct->{reading_roots}->{$_} = 1 } $result->sources;
                
        }
        
-       # Spit out any unsolved problems we encountered
-       # _list_unsolved();
-       
        return { 'variants' => $variants, 
                         'variant_count' => scalar @$variants,
                         'genealogical_count' => $genealogical };
 }
 
-sub _get_calc_key {
-       my( $graphproblem ) = @_;
-       my $graph = $graphproblem->{graph};
-       my $grouping = [ values %{$graphproblem->{grouping}} ];
-       my $key = Text::Tradition::Analysis::Result::string_from_graph_problem( 
-               $graph, $grouping );
-       return md5_hex( encode_utf8( $key ) );
-}
-
-sub _save_problem {
-       my( $graphproblem ) = @_;
-       my $problem = Text::Tradition::Analysis::Result->new(
-               graph => $graphproblem->{graph},
-               setlist => [ values %{$graphproblem->{grouping}} ]
-       );
-       my $key = _get_calc_key( $graphproblem );
-       my( $str ) = $problem->problem_json;
-       say STDERR "Stashing unsolved problem $str at key $key";
-       $unsolved_problems->{$key} = $problem;
-}
-
-sub _list_unsolved {
-       #say STDERR "Problems needing a solution:";
-       my @problems = values %$unsolved_problems;
-       return unless @problems;
-       my $first = shift @problems;
-       map { say STDERR $_ } $first->problem_json( @problems );
+sub _init_unsolved {
+       my( $graphproblem, $status ) = @_;
+       my $vstruct = { 'readings' => [] };
+       $vstruct->{'unsolved'} = $status;
+       foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
+               push( @{$vstruct->{readings}}, { readingid => $rid, 
+                       group => [ $graphproblem->{grouping}->{$rid}->members ] } );
+       }
+       return $vstruct;
 }
 
 =head2 analyze_location ( $tradition, $graph, $location_hash )
@@ -604,22 +637,18 @@ sub analyze_location {
        my ( $tradition, $graph, $variant_row, $lacunose ) = @_;
        my $c = $tradition->collation;
        
+       if( exists $variant_row->{'unsolved'} ) {
+               return;
+       }
+       my $reading_roots = delete $variant_row->{'reading_roots'};
+       my $classinfo = delete $variant_row->{'witcopy_types'};
+       
        # Make a hash of all known node memberships, and make the subgraphs.
        my $contig = {};
-       my $reading_roots = {};
        my $subgraph = {};
        my $acstr = $c->ac_label;
        my @acwits;
        
-       my $NO_IDP;
-       if( exists $variant_row->{'reading_roots'} ) {
-               $reading_roots = delete $variant_row->{'reading_roots'};
-       } else {
-               warn "No reading source information from IDP - proceed at your own risk";
-               $NO_IDP = 1;
-       }
-       my $classinfo = delete $variant_row->{'witcopy_types'};
-       
        # Note which witnesses positively belong to which group. This information
        # comes ultimately from the IDP solver.
        # Also make a note of the reading's roots.
@@ -649,6 +678,7 @@ sub analyze_location {
         if( $classinfo ) {
                @reversions = grep { $classinfo->{$_} eq 'revert' } 
                        $rdghash->{'group'}->members;
+               $rdghash->{'reversions'} = \@reversions;
         }
         my @group = @{$rdghash->{'group'}};
         
@@ -659,7 +689,7 @@ sub analyze_location {
         my $sourceparents = _find_reading_parents( $rid, $graph, $contig, @roots );
                # Work out relationships between readings and their non-followed parent.
                _resolve_parent_relationships( $c, $rid, $rdg, $sourceparents );
-               $rdghash->{'reading_parents'} = $sourceparents;
+               $rdghash->{'source_parents'} = $sourceparents;
 
                if( @reversions ) {
                        my $revparents = _find_reading_parents( $rid, $graph, $contig, @reversions );
@@ -821,52 +851,6 @@ sub similar {
        return ( $mag <= length( $word1 ) / 2 );
 }
 
-sub _prune_group {
-       my( $group, $graph ) = @_;
-       my $relevant = {};
-       # Record the existence of the vertices in the group
-       map { $relevant->{$_} = 1 } @$group;
-       # Make our subgraph
-       my $subgraph = $graph->deep_copy;
-       map { $subgraph->delete_vertex( $_ ) unless $relevant->{$_} }
-               $subgraph->vertices;
-       # Now prune and return the remaining vertices.
-       _prune_subtree( $subgraph );
-       # Return the list of vertices and the list of roots.
-       my $pruned_group = [ sort $subgraph->vertices ];
-       my $pruned_roots = [ $subgraph->predecessorless_vertices ];
-       return( $pruned_group, $pruned_roots );
-}
-
-sub _prune_subtree {
-       my( $tree ) = @_;
-       
-       # Delete lacunose witnesses that have no successors
-       my @orphan_hypotheticals;
-       my $ctr = 0;
-       do {
-               die "Infinite loop on leaves" if $ctr > 100;
-               @orphan_hypotheticals = 
-                       grep { $tree->get_vertex_attribute( $_, 'class' ) eq 'hypothetical' } 
-                               $tree->successorless_vertices;
-               $tree->delete_vertices( @orphan_hypotheticals );
-               $ctr++;
-       } while( @orphan_hypotheticals );
-       
-       # Delete lacunose roots that have a single successor
-       my @redundant_root;
-       $ctr = 0;
-       do {
-               die "Infinite loop on roots" if $ctr > 100;
-               @redundant_root = 
-                       grep { $tree->get_vertex_attribute( $_, 'class' ) eq 'hypothetical' 
-                                  && $tree->successors( $_ ) == 1 } 
-                               $tree->predecessorless_vertices;
-               $tree->delete_vertices( @redundant_root );
-               $ctr++;
-       } while( @redundant_root );
-}
-
 sub _useful_variant {
        my( $rankgroup, $rankgraph, $acstr ) = @_;
 
@@ -908,6 +892,13 @@ sub wit_stringify {
 
 1;
 
+sub throw {
+       Text::Tradition::Error->throw( 
+               'ident' => 'Analysis error',
+               'message' => $_[0],
+       );
+}
+
 =head1 LICENSE
 
 This package is free software and is provided "as is" without express