X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2FText%2FTradition%2FAnalysis.pm;h=18999526e0afa3c55076c19d7c19713ce8785187;hb=428bcf0bc79f77a7857b21ef881708faa792e33a;hp=bcede0821b7f24378c2723a15512055b05c27e97;hpb=94654e27c9b76e28a7fb0f5d12bb4eac45cfc4f7;p=scpubgit%2Fstemmatology.git diff --git a/lib/Text/Tradition/Analysis.pm b/lib/Text/Tradition/Analysis.pm index bcede08..1899952 100644 --- a/lib/Text/Tradition/Analysis.pm +++ b/lib/Text/Tradition/Analysis.pm @@ -2,6 +2,7 @@ package Text::Tradition::Analysis; use strict; use warnings; +use Algorithm::Diff; # for word similarity measure use Benchmark; use Encode qw/ encode_utf8 /; use Exporter 'import'; @@ -10,6 +11,7 @@ use JSON qw/ encode_json decode_json /; use LWP::UserAgent; use Text::Tradition; use Text::Tradition::Stemma; +use TryCatch; use vars qw/ @EXPORT_OK /; @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /; @@ -200,8 +202,11 @@ sub run_analysis { $location->{'missing'} = [ keys %lmiss ]; # Run the extra analysis we need. - analyze_location( $tradition, $stemma->graph, $location, \%lmiss ); + ## TODO We run through all the variants in this call, so + ## why not add the reading data there instead of here below? + analyze_location( $tradition, $stemma, $location, \%lmiss ); + my @layerwits; # Do the final post-analysis tidying up of the data. foreach my $rdghash ( @{$location->{'readings'}} ) { $conflict_count++ @@ -211,15 +216,22 @@ sub run_analysis { if( $rdg ) { $rdghash->{'text'} = $rdg->text . ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' ); + $rdghash->{'is_ungrammatical'} = $rdg->grammar_invalid; + $rdghash->{'is_nonsense'} = $rdg->is_nonsense; } # Remove lacunose witnesses from this reading's list now that the # analysis is done my @realgroup; map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}}; $rdghash->{'group'} = \@realgroup; - # TODO Record hypotheticals used to create group, if we end up - # needing it + # Note any layered witnesses that appear in this group + foreach( @realgroup ) { + if( $_ =~ /^(.*)\Q$aclabel\E$/ ) { + push( @layerwits, $1 ); + } + } } + $location->{'layerwits'} = \@layerwits if @layerwits; } $answer->{'conflict_count'} = $conflict_count; @@ -244,13 +256,15 @@ sub group_variants { my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_; my $c = $tradition->collation; my $aclabel = $c->ac_label; + my $table = $c->alignment_table; # Get the alignment table readings my %readings_at_rank; - my %is_lacunose; # lookup table for $lacunose - map { $is_lacunose{$_} = 1 } @$lacunose; + my %is_lacunose; # lookup table for witnesses not in stemma + map { $is_lacunose{$_} = 1; $is_lacunose{$_.$aclabel} = 1 } @$lacunose; my @check_for_gaps; my %moved_wits; - foreach my $tablewit ( @{$c->alignment_table->{'alignment'}} ) { + my $has_transposition; + foreach my $tablewit ( @{$table->{'alignment'}} ) { my $rdg = $tablewit->{'tokens'}->[$rank-1]; my $wit = $tablewit->{'witness'}; # Exclude the witness if it is "lacunose" which if we got here @@ -263,7 +277,7 @@ sub group_variants { } elsif( $rdg ) { # If the reading has been counted elsewhere as a transposition, ignore it. if( $transposed->{$rdg->{'t'}->id} ) { - # TODO This doesn't cope with three-way transpositions + # TODO Does this cope with three-way transpositions? map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}}; next; } @@ -272,51 +286,76 @@ sub group_variants { # ...and grab any transpositions, and their relations. my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings(); foreach my $trdg ( @transp ) { - map { $moved_wits{$_} = 1 } $trdg->witnesses; - $transposed->{$trdg->id} = [ $rdg->{'t'}->witnesses ]; + next if exists $readings_at_rank{$trdg->id}; + $has_transposition = 1; + my @affected_wits = _table_witnesses( + $table, $trdg, \%is_lacunose, $aclabel ); + next unless @affected_wits; + map { $moved_wits{$_} = 1 } @affected_wits; + $transposed->{$trdg->id} = + [ _table_witnesses( $table, $rdg->{'t'}, \%is_lacunose, $aclabel ) ]; $readings_at_rank{$trdg->id} = $trdg; } # ...or it is empty, ergo a gap. } else { - push( @check_for_gaps, $wit ); + _add_to_witlist( $wit, \@check_for_gaps, $aclabel ); } } my @gap_wits; map { _add_to_witlist( $_, \@gap_wits, $aclabel ) unless $moved_wits{$_} } @check_for_gaps; - # TODO check for, and break into a new row, any doubled-up witness readings - # after transposition... # Group the readings, collapsing groups by relationship if needed - my %grouped_readings; + my $grouped_readings = {}; foreach my $rdg ( values %readings_at_rank ) { # Skip readings that have been collapsed into others. - next if exists $grouped_readings{$rdg->id} && !$grouped_readings{$rdg->id}; + next if exists $grouped_readings->{$rdg->id} + && $grouped_readings->{$rdg->id} eq 'COLLAPSE'; # Get the witness list, including from readings collapsed into this one. - my @wits = $rdg->witnesses; - if( $collapse ) { + my @wits = _table_witnesses( $table, $rdg, \%is_lacunose, $aclabel ); + if( $collapse && @$collapse ) { my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; }; foreach my $other ( $rdg->related_readings( $filter ) ) { - my @otherwits = $other->witnesses; + my @otherwits = _table_witnesses( + $table, $other, \%is_lacunose, $aclabel ); push( @wits, @otherwits ); - $grouped_readings{$other->id} = 0; + $grouped_readings->{$other->id} = 'COLLAPSE'; } } - # Filter the group to those witnesses in the stemma - my @use_wits; - foreach my $wit ( @wits ) { - next if $is_lacunose{$wit}; - push( @use_wits, $wit ); - } - $grouped_readings{$rdg->id} = \@use_wits; + $grouped_readings->{$rdg->id} = \@wits; } - $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits; + $grouped_readings->{'(omitted)'} = \@gap_wits if @gap_wits; # Get rid of our collapsed readings - map { delete $grouped_readings{$_} unless $grouped_readings{$_} } - keys %grouped_readings + map { delete $grouped_readings->{$_} if $grouped_readings->{$_} eq 'COLLAPSE' } + keys %$grouped_readings if $collapse; + + # If something was transposed, check the groups for doubled-up readings + if( $has_transposition ) { + # print STDERR "Group for rank $rank:\n"; + # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" } + # keys %$grouped_readings; + _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings ); + } # Return the result - return \%grouped_readings; + return $grouped_readings; +} + +# Helper function to query the alignment table for all witnesses (a.c. included) +# that have a given reading at its rank. +sub _table_witnesses { + my( $table, $trdg, $lacunose, $aclabel ) = @_; + my $tableidx = $trdg->rank - 1; + my @has_reading; + foreach my $row ( @{$table->{'alignment'}} ) { + my $wit = $row->{'witness'}; + next if $lacunose->{$wit}; + my $rdg = $row->{'tokens'}->[$tableidx]; + next unless exists $rdg->{'t'} && defined $rdg->{'t'}; + _add_to_witlist( $wit, \@has_reading, $aclabel ) + if $rdg->{'t'}->id eq $trdg->id; + } + return @has_reading; } # Helper function to ensure that X and X a.c. never appear in the same list. @@ -341,6 +380,60 @@ sub _add_to_witlist { } } +sub _check_transposed_consistency { + my( $c, $rank, $transposed, $groupings ) = @_; + my %seen_wits; + my %thisrank; + # Note which readings are actually at this rank, and which witnesses + # belong to which reading. + foreach my $rdg ( keys %$groupings ) { + my $rdgobj = $c->reading( $rdg ); + # Count '(omitted)' as a reading at this rank + $thisrank{$rdg} = 1 if !$rdgobj || $rdgobj->rank == $rank; + map { push( @{$seen_wits{$_}}, $rdg ) } @{$groupings->{$rdg}}; + } + # Our work is done if we have no witness belonging to more than one + # reading. + my @doubled = grep { scalar @{$seen_wits{$_}} > 1 } keys %seen_wits; + return unless @doubled; + # If we have a symmetric related transposition, drop the non-rank readings. + if( @doubled == scalar keys %seen_wits ) { + foreach my $rdg ( keys %$groupings ) { + if( !$thisrank{$rdg} ) { + my $groupstr = wit_stringify( $groupings->{$rdg} ); + my ( $matched ) = grep { $groupstr eq wit_stringify( $groupings->{$_} ) } + keys %thisrank; + delete $groupings->{$rdg}; + # If we found a group match, assume there is a symmetry happening. + # TODO think more about this + # print STDERR "*** Deleting symmetric reading $rdg\n"; + unless( $matched ) { + delete $transposed->{$rdg}; + warn "Found problem in evident symmetry with reading $rdg"; + } + } + } + # Otherwise 'unhook' the transposed reading(s) that have duplicates. + } else { + foreach my $dup ( @doubled ) { + foreach my $rdg ( @{$seen_wits{$dup}} ) { + next if $thisrank{$rdg}; + next unless exists $groupings->{$rdg}; + # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n"; + delete $groupings->{$rdg}; + delete $transposed->{$rdg}; + } + } + # and put any now-orphaned readings into an 'omitted' reading. + foreach my $wit ( keys %seen_wits ) { + unless( grep { exists $groupings->{$_} } @{$seen_wits{$wit}} ) { + $groupings->{'(omitted)'} = [] unless exists $groupings->{'(omitted)'}; + _add_to_witlist( $wit, $groupings->{'(omitted)'}, $c->ac_label ); + } + } + } +} + =head2 solve_variants( $graph, @groups ) Sends the set of groups to the external graph solver service and returns @@ -392,7 +485,12 @@ sub solve_variants { # Finally, add the group to the list to be calculated for this graph. map { s/\Q$aclabel\E$// } @acwits; - my $graph = $stemma->extend_graph( \@acwits ); + my $graph; + try { + $graph = $stemma->extend_graph( \@acwits ); + } catch { + die "Unable to extend graph with @acwits"; + } unless( exists $graph_problems->{"$graph"} ) { $graph_problems->{"$graph"} = { 'object' => $graph, 'groups' => [] }; } @@ -565,18 +663,31 @@ conflict, reading_parents, independent_occurrence, followed, not_followed, and f =cut sub analyze_location { - my ( $tradition, $graph, $variant_row, $lacunose ) = @_; + my ( $tradition, $stemma, $variant_row, $lacunose ) = @_; my $c = $tradition->collation; # Make a hash of all known node memberships, and make the subgraphs. my $contig = {}; my $reading_roots = {}; my $subgraph = {}; - $DB::single = 1 if $variant_row->{id} == 6; + my $acstr = $c->ac_label; + my @acwits; # Note which witnesses positively belong to which group foreach my $rdghash ( @{$variant_row->{'readings'}} ) { my $rid = $rdghash->{'readingid'}; - map { $contig->{$_} = $rid } @{$rdghash->{'group'}}; + foreach my $wit ( @{$rdghash->{'group'}} ) { + $contig->{$wit} = $rid; + if( $wit =~ /^(.*)\Q$acstr\E$/ ) { + push( @acwits, $1 ); + } + } + } + # Get the actual graph we should work with + my $graph; + try { + $graph = @acwits ? $stemma->extend_graph( \@acwits ) : $stemma->graph; + } catch { + die "Could not extend graph with a.c. witnesses @acwits"; } # Now, armed with that knowledge, make a subgraph for each reading @@ -605,6 +716,7 @@ sub analyze_location { # reading's evident parent(s). foreach my $rdghash ( @{$variant_row->{'readings'}} ) { my $rid = $rdghash->{'readingid'}; + my $rdg = $c->reading( $rid ); # Get the subgraph my $part = $subgraph->{$rid}; @@ -622,7 +734,11 @@ sub analyze_location { my @next; foreach my $wparent( @check ) { my $preading = $contig->{$wparent}; - if( $preading ) { + # IDP assigns all nodes, hypothetical included, to a reading + # in the case of genealogical sets. We prune non-necessary + # hypothetical readings, but they are still in $contig, so + # we account for that here. + if( $preading && $preading ne $rid ) { $rdgparents->{$preading} = 1; } else { push( @next, $graph->predecessors( $wparent ) ); @@ -635,18 +751,55 @@ sub analyze_location { # Resolve the relationship of the parent to the reading, and # save it in our hash. my $pobj = $c->reading( $p ); - my $relation; my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p; + my $phash = { 'label' => $prep }; if( $pobj ) { my $rel = $c->get_relationship( $p, $rdghash->{readingid} ); if( $rel ) { - $relation = { type => $rel->type }; - if( $rel->has_annotation ) { - $relation->{'annotation'} = $rel->annotation; + _add_to_hash( $rel, $phash ); + } elsif( $rdg ) { + # First check for a transposed relationship + if( $rdg->rank != $pobj->rank ) { + foreach my $ti ( $rdg->related_readings( 'transposition' ) ) { + next unless $ti->text eq $rdg->text; + $rel = $c->get_relationship( $ti, $pobj ); + if( $rel ) { + _add_to_hash( $rel, $phash, 1 ); + last; + } + } + unless( $rel ) { + foreach my $ti ( $pobj->related_readings( 'transposition' ) ) { + next unless $ti->text eq $pobj->text; + $rel = $c->get_relationship( $ti, $rdg ); + if( $rel ) { + _add_to_hash( $rel, $phash, 1 ); + last; + } + } + } } + unless( $rel ) { + # and then check for sheer word similarity. + my $rtext = $rdg->text; + my $ptext = $pobj->text; + if( similar( $rtext, $ptext ) ) { + # say STDERR "Words $rtext and $ptext judged similar"; + $phash->{relation} = { type => 'wordsimilar' }; + } + } + } else { + $phash->{relation} = { type => 'deletion' }; } - } - $rdgparents->{$p} = { 'label' => $prep, 'relation' => $relation }; + # Get the attributes of the parent object while we are here + $phash->{'text'} = $pobj->text if $pobj; + $phash->{'is_nonsense'} = $pobj->is_nonsense; + $phash->{'is_ungrammatical'} = $pobj->grammar_invalid; + } elsif( $p eq '(omitted)' ) { + $phash->{relation} = { type => 'addition' }; + } + # Save it + $rdgparents->{$p} = $phash; } $rdghash->{'reading_parents'} = $rdgparents; @@ -678,6 +831,51 @@ sub analyze_location { } } +sub _add_to_hash { + my( $rel, $phash, $is_transposed ) = @_; + $phash->{relation} = { type => $rel->type }; + $phash->{relation}->{transposed} = 1 if $is_transposed; + $phash->{relation}->{annotation} = $rel->annotation + if $rel->has_annotation; +} + +=head2 similar( $word1, $word2 ) + +Use Algorithm::Diff to get a sense of how close the words are to each other. +This will hopefully handle substitutions a bit more nicely than Levenshtein. + +=cut + +#!/usr/bin/env perl + +sub similar { + my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_; + my @let1 = split( '', lc( $word1 ) ); + my @let2 = split( '', lc( $word2 ) ); + my $diff = Algorithm::Diff->new( \@let1, \@let2 ); + my $mag = 0; + while( $diff->Next ) { + if( $diff->Same ) { + # Take off points for longer strings + my $cs = $diff->Range(1) - 2; + $cs = 0 if $cs < 0; + $mag -= $cs; + } elsif( !$diff->Items(1) ) { + $mag += $diff->Range(2); + } elsif( !$diff->Items(2) ) { + $mag += $diff->Range(1); + } else { + # Split the difference for substitutions + my $c1 = $diff->Range(1) || 1; + my $c2 = $diff->Range(2) || 1; + my $cd = ( $c1 + $c2 ) / 2; + $mag += $cd; + } + } + return ( $mag <= length( $word1 ) / 2 ); +} + + =head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )