use strict;
use warnings;
+use Algorithm::Diff; # for word similarity measure
use Benchmark;
use Encode qw/ encode_utf8 /;
use Exporter 'import';
$location->{'missing'} = [ keys %lmiss ];
# Run the extra analysis we need.
+ ## TODO We run through all the variants in this call, so
+ ## why not add the reading data there instead of here below?
analyze_location( $tradition, $stemma, $location, \%lmiss );
my @layerwits;
if( $rdg ) {
$rdghash->{'text'} = $rdg->text .
( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
+ $rdghash->{'is_ungrammatical'} = $rdg->grammar_invalid;
+ $rdghash->{'is_nonsense'} = $rdg->is_nonsense;
}
# Remove lacunose witnesses from this reading's list now that the
# analysis is done
# If something was transposed, check the groups for doubled-up readings
if( $has_transposition ) {
- print STDERR "Group for rank $rank:\n";
- map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" }
- keys %$grouped_readings;
+ # print STDERR "Group for rank $rank:\n";
+ # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" }
+ # keys %$grouped_readings;
_check_transposed_consistency( $c, $rank, $transposed, $grouped_readings );
}
delete $groupings->{$rdg};
# If we found a group match, assume there is a symmetry happening.
# TODO think more about this
- print STDERR "*** Deleting symmetric reading $rdg\n";
+ # print STDERR "*** Deleting symmetric reading $rdg\n";
unless( $matched ) {
delete $transposed->{$rdg};
warn "Found problem in evident symmetry with reading $rdg";
foreach my $rdg ( @{$seen_wits{$dup}} ) {
next if $thisrank{$rdg};
next unless exists $groupings->{$rdg};
- print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
+ # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
delete $groupings->{$rdg};
delete $transposed->{$rdg};
}
my $subgraph = {};
my $acstr = $c->ac_label;
my @acwits;
- $DB::single = 1 if $variant_row->{id} == 87;
# Note which witnesses positively belong to which group
foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
my $rid = $rdghash->{'readingid'};
}
}
}
-
# Get the actual graph we should work with
my $graph;
try {
# reading's evident parent(s).
foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
my $rid = $rdghash->{'readingid'};
+ my $rdg = $c->reading( $rid );
# Get the subgraph
my $part = $subgraph->{$rid};
my @next;
foreach my $wparent( @check ) {
my $preading = $contig->{$wparent};
- if( $preading ) {
+ # IDP assigns all nodes, hypothetical included, to a reading
+ # in the case of genealogical sets. We prune non-necessary
+ # hypothetical readings, but they are still in $contig, so
+ # we account for that here.
+ if( $preading && $preading ne $rid ) {
$rdgparents->{$preading} = 1;
} else {
push( @next, $graph->predecessors( $wparent ) );
# Resolve the relationship of the parent to the reading, and
# save it in our hash.
my $pobj = $c->reading( $p );
- my $relation;
my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
+ my $phash = { 'label' => $prep };
if( $pobj ) {
my $rel = $c->get_relationship( $p, $rdghash->{readingid} );
if( $rel ) {
- $relation = { type => $rel->type };
- if( $rel->has_annotation ) {
- $relation->{'annotation'} = $rel->annotation;
+ _add_to_hash( $rel, $phash );
+ } elsif( $rdg ) {
+ # First check for a transposed relationship
+ if( $rdg->rank != $pobj->rank ) {
+ foreach my $ti ( $rdg->related_readings( 'transposition' ) ) {
+ next unless $ti->text eq $rdg->text;
+ $rel = $c->get_relationship( $ti, $pobj );
+ if( $rel ) {
+ _add_to_hash( $rel, $phash, 1 );
+ last;
+ }
+ }
+ unless( $rel ) {
+ foreach my $ti ( $pobj->related_readings( 'transposition' ) ) {
+ next unless $ti->text eq $pobj->text;
+ $rel = $c->get_relationship( $ti, $rdg );
+ if( $rel ) {
+ _add_to_hash( $rel, $phash, 1 );
+ last;
+ }
+ }
+ }
}
+ unless( $rel ) {
+ # and then check for sheer word similarity.
+ my $rtext = $rdg->text;
+ my $ptext = $pobj->text;
+ if( similar( $rtext, $ptext ) ) {
+ # say STDERR "Words $rtext and $ptext judged similar";
+ $phash->{relation} = { type => 'wordsimilar' };
+ }
+ }
+ } else {
+ $phash->{relation} = { type => 'deletion' };
}
- }
- $rdgparents->{$p} = { 'label' => $prep, 'relation' => $relation };
+ # Get the attributes of the parent object while we are here
+ $phash->{'text'} = $pobj->text if $pobj;
+ $phash->{'is_nonsense'} = $pobj->is_nonsense;
+ $phash->{'is_ungrammatical'} = $pobj->grammar_invalid;
+ } elsif( $p eq '(omitted)' ) {
+ $phash->{relation} = { type => 'addition' };
+ }
+ # Save it
+ $rdgparents->{$p} = $phash;
}
$rdghash->{'reading_parents'} = $rdgparents;
}
}
+sub _add_to_hash {
+ my( $rel, $phash, $is_transposed ) = @_;
+ $phash->{relation} = { type => $rel->type };
+ $phash->{relation}->{transposed} = 1 if $is_transposed;
+ $phash->{relation}->{annotation} = $rel->annotation
+ if $rel->has_annotation;
+}
+
+=head2 similar( $word1, $word2 )
+
+Use Algorithm::Diff to get a sense of how close the words are to each other.
+This will hopefully handle substitutions a bit more nicely than Levenshtein.
+
+=cut
+
+#!/usr/bin/env perl
+
+sub similar {
+ my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_;
+ my @let1 = split( '', lc( $word1 ) );
+ my @let2 = split( '', lc( $word2 ) );
+ my $diff = Algorithm::Diff->new( \@let1, \@let2 );
+ my $mag = 0;
+ while( $diff->Next ) {
+ if( $diff->Same ) {
+ # Take off points for longer strings
+ my $cs = $diff->Range(1) - 2;
+ $cs = 0 if $cs < 0;
+ $mag -= $cs;
+ } elsif( !$diff->Items(1) ) {
+ $mag += $diff->Range(2);
+ } elsif( !$diff->Items(2) ) {
+ $mag += $diff->Range(1);
+ } else {
+ # Split the difference for substitutions
+ my $c1 = $diff->Range(1) || 1;
+ my $c2 = $diff->Range(2) || 1;
+ my $cd = ( $c1 + $c2 ) / 2;
+ $mag += $cd;
+ }
+ }
+ return ( $mag <= length( $word1 ) / 2 );
+}
+
+
=head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )