X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2FText%2FTradition%2FAnalysis.pm;h=18999526e0afa3c55076c19d7c19713ce8785187;hb=428bcf0bc79f77a7857b21ef881708faa792e33a;hp=9cb4f394b02f8197b295013b0a8b0f058fff1d7e;hpb=d120c995b9d6154a340858d90b46436af3758d90;p=scpubgit%2Fstemmatology.git diff --git a/lib/Text/Tradition/Analysis.pm b/lib/Text/Tradition/Analysis.pm index 9cb4f39..1899952 100644 --- a/lib/Text/Tradition/Analysis.pm +++ b/lib/Text/Tradition/Analysis.pm @@ -2,6 +2,7 @@ package Text::Tradition::Analysis; use strict; use warnings; +use Algorithm::Diff; # for word similarity measure use Benchmark; use Encode qw/ encode_utf8 /; use Exporter 'import'; @@ -201,6 +202,8 @@ sub run_analysis { $location->{'missing'} = [ keys %lmiss ]; # Run the extra analysis we need. + ## TODO We run through all the variants in this call, so + ## why not add the reading data there instead of here below? analyze_location( $tradition, $stemma, $location, \%lmiss ); my @layerwits; @@ -213,6 +216,8 @@ sub run_analysis { if( $rdg ) { $rdghash->{'text'} = $rdg->text . ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' ); + $rdghash->{'is_ungrammatical'} = $rdg->grammar_invalid; + $rdghash->{'is_nonsense'} = $rdg->is_nonsense; } # Remove lacunose witnesses from this reading's list now that the # analysis is done @@ -326,9 +331,9 @@ sub group_variants { # If something was transposed, check the groups for doubled-up readings if( $has_transposition ) { - print STDERR "Group for rank $rank:\n"; - map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" } - keys %$grouped_readings; + # print STDERR "Group for rank $rank:\n"; + # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" } + # keys %$grouped_readings; _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings ); } @@ -401,7 +406,7 @@ sub _check_transposed_consistency { delete $groupings->{$rdg}; # If we found a group match, assume there is a symmetry happening. # TODO think more about this - print STDERR "*** Deleting symmetric reading $rdg\n"; + # print STDERR "*** Deleting symmetric reading $rdg\n"; unless( $matched ) { delete $transposed->{$rdg}; warn "Found problem in evident symmetry with reading $rdg"; @@ -414,7 +419,7 @@ sub _check_transposed_consistency { foreach my $rdg ( @{$seen_wits{$dup}} ) { next if $thisrank{$rdg}; next unless exists $groupings->{$rdg}; - print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n"; + # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n"; delete $groupings->{$rdg}; delete $transposed->{$rdg}; } @@ -667,7 +672,6 @@ sub analyze_location { my $subgraph = {}; my $acstr = $c->ac_label; my @acwits; - $DB::single = 1 if $variant_row->{id} == 87; # Note which witnesses positively belong to which group foreach my $rdghash ( @{$variant_row->{'readings'}} ) { my $rid = $rdghash->{'readingid'}; @@ -678,7 +682,6 @@ sub analyze_location { } } } - # Get the actual graph we should work with my $graph; try { @@ -713,6 +716,7 @@ sub analyze_location { # reading's evident parent(s). foreach my $rdghash ( @{$variant_row->{'readings'}} ) { my $rid = $rdghash->{'readingid'}; + my $rdg = $c->reading( $rid ); # Get the subgraph my $part = $subgraph->{$rid}; @@ -730,7 +734,11 @@ sub analyze_location { my @next; foreach my $wparent( @check ) { my $preading = $contig->{$wparent}; - if( $preading ) { + # IDP assigns all nodes, hypothetical included, to a reading + # in the case of genealogical sets. We prune non-necessary + # hypothetical readings, but they are still in $contig, so + # we account for that here. + if( $preading && $preading ne $rid ) { $rdgparents->{$preading} = 1; } else { push( @next, $graph->predecessors( $wparent ) ); @@ -743,18 +751,55 @@ sub analyze_location { # Resolve the relationship of the parent to the reading, and # save it in our hash. my $pobj = $c->reading( $p ); - my $relation; my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p; + my $phash = { 'label' => $prep }; if( $pobj ) { my $rel = $c->get_relationship( $p, $rdghash->{readingid} ); if( $rel ) { - $relation = { type => $rel->type }; - if( $rel->has_annotation ) { - $relation->{'annotation'} = $rel->annotation; + _add_to_hash( $rel, $phash ); + } elsif( $rdg ) { + # First check for a transposed relationship + if( $rdg->rank != $pobj->rank ) { + foreach my $ti ( $rdg->related_readings( 'transposition' ) ) { + next unless $ti->text eq $rdg->text; + $rel = $c->get_relationship( $ti, $pobj ); + if( $rel ) { + _add_to_hash( $rel, $phash, 1 ); + last; + } + } + unless( $rel ) { + foreach my $ti ( $pobj->related_readings( 'transposition' ) ) { + next unless $ti->text eq $pobj->text; + $rel = $c->get_relationship( $ti, $rdg ); + if( $rel ) { + _add_to_hash( $rel, $phash, 1 ); + last; + } + } + } } + unless( $rel ) { + # and then check for sheer word similarity. + my $rtext = $rdg->text; + my $ptext = $pobj->text; + if( similar( $rtext, $ptext ) ) { + # say STDERR "Words $rtext and $ptext judged similar"; + $phash->{relation} = { type => 'wordsimilar' }; + } + } + } else { + $phash->{relation} = { type => 'deletion' }; } - } - $rdgparents->{$p} = { 'label' => $prep, 'relation' => $relation }; + # Get the attributes of the parent object while we are here + $phash->{'text'} = $pobj->text if $pobj; + $phash->{'is_nonsense'} = $pobj->is_nonsense; + $phash->{'is_ungrammatical'} = $pobj->grammar_invalid; + } elsif( $p eq '(omitted)' ) { + $phash->{relation} = { type => 'addition' }; + } + # Save it + $rdgparents->{$p} = $phash; } $rdghash->{'reading_parents'} = $rdgparents; @@ -786,6 +831,51 @@ sub analyze_location { } } +sub _add_to_hash { + my( $rel, $phash, $is_transposed ) = @_; + $phash->{relation} = { type => $rel->type }; + $phash->{relation}->{transposed} = 1 if $is_transposed; + $phash->{relation}->{annotation} = $rel->annotation + if $rel->has_annotation; +} + +=head2 similar( $word1, $word2 ) + +Use Algorithm::Diff to get a sense of how close the words are to each other. +This will hopefully handle substitutions a bit more nicely than Levenshtein. + +=cut + +#!/usr/bin/env perl + +sub similar { + my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_; + my @let1 = split( '', lc( $word1 ) ); + my @let2 = split( '', lc( $word2 ) ); + my $diff = Algorithm::Diff->new( \@let1, \@let2 ); + my $mag = 0; + while( $diff->Next ) { + if( $diff->Same ) { + # Take off points for longer strings + my $cs = $diff->Range(1) - 2; + $cs = 0 if $cs < 0; + $mag -= $cs; + } elsif( !$diff->Items(1) ) { + $mag += $diff->Range(2); + } elsif( !$diff->Items(2) ) { + $mag += $diff->Range(1); + } else { + # Split the difference for substitutions + my $c1 = $diff->Range(1) || 1; + my $c2 = $diff->Range(2) || 1; + my $cd = ( $c1 + $c2 ) / 2; + $mag += $cd; + } + } + return ( $mag <= length( $word1 ) / 2 ); +} + + =head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )