From: Tara L Andrews Date: Fri, 12 Jul 2013 12:39:55 +0000 (+0200) Subject: experimental script for vector cosine analysis X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=5412ad2e4e1fff25f9804a21d774c525cc89bf0a;p=scpubgit%2Fstemmatology.git experimental script for vector cosine analysis --- diff --git a/base/script/vectors.pl b/base/script/vectors.pl new file mode 100755 index 0000000..b3efd7b --- /dev/null +++ b/base/script/vectors.pl @@ -0,0 +1,185 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use feature 'say'; +use lib '/Users/tla/Projects/cpanmods/Text-SenseClusters-1.03/lib'; +use Text::SenseClusters::Simat; +use Text::Tradition; +use Text::WagnerFischer qw/distance/; + +binmode STDOUT, ':utf8'; +binmode STDERR, ':utf8'; + +# Get our arguments +my( $traditionfile, $threshold ) = @ARGV; +$threshold = 0.9 unless $threshold; + +# Load up a tradition +my $t; +my $m; + +$t = Text::Tradition->new( + file => $traditionfile, + input => 'Self' ); +say STDERR "Parsed tradition file"; +my $c = $t->collation; + +# Get the cosine similarity values +my ( $matrix ) = make_matrix( $t ); + +# For each relationship in the graph, see how it compares to other node pairs +# rated > .9 + +foreach my $pair ( $c->relationships ) { + my $rel = $c->get_relationship( $pair ); + my( $rx, $ry ) = map { $c->reading( $_ ) } sort @$pair; + next if $rx->rank > 100; + next if $ry->rank > 100; + + say STDERR "Checking relationship $rx -- $ry, of type " . $rel->type; + my $matches = 0; + my @matched_rels; + foreach my $val ( sort { $a<=>$b } keys %{$matrix->{"$rx"}->{"$ry"}} ) { + $matches++; + foreach my $mpair ( @{$matrix->{"$rx"}->{"$ry"}->{"$val"}} ) { + my $mrel = $c->get_relationship( $mpair ); + my $mreltype = $mrel ? $mrel->type : '(no relation)'; + my( $mx, $my ) = map { $c->reading( $_ ) } sort @$mpair; + push( @matched_rels, sprintf( "%f: %s (%s) -- %s (%s), type %s", + $val, $mx, $mx->text, $my, $my->text, $mreltype ) ); + } + } + say sprintf( "Matches for %s (%s) -- %s (%s), type %s", + $rx, $rx->text, $ry, $ry->text, $rel->type ); + foreach ( @matched_rels ) { + say "\t$_"; + } +} + + +sub make_matrix { + my @comm = $c->calculate_common_readings(); + my %nextcomm; + my $ri = 0; + foreach my $cr ( sort { $a->rank <=> $b->rank } @comm ) { + until( $cr->rank == $ri ) { + $nextcomm{$ri++} = $cr->rank; + } + } + until ( $ri == $c->end->rank ) { + $nextcomm{$ri++} = $c->end->rank; + } + + # Find all the relatable pairs + my $grid = {}; + my %analyzed; + my $rct = 0; + foreach my $rx ( $c->readings ) { + next if $rx->rank > 100; + next if $rx->is_meta(); + # Have to compare each reading with each other, so do this only once + $analyzed{"$rx"} = 1; + $rct++; + say STDERR "Looking at reading $rct ( $rx )"; + foreach my $ry ( $c->readings ) { + next if $ry->rank > 100; + next if $ry->is_meta(); + next if $analyzed{"$ry"}; + + # Get their textual and graph distances from each other. + my $vector = { + textdiff => distance( $rx->text, $ry->text ), + rankdiff => abs( $rx->rank - $ry->rank ) + }; + $grid->{"$rx"}->{"$ry"} = $vector; + + # Do they share one or more witnesses? + my %rxwits; + my $sharewit = 0; + map { $rxwits{$_} = 1 } $rx->witnesses; + foreach my $rywit ( $ry->witnesses ) { + if( $rxwits{$rywit} ) { + $sharewit++; + $vector->{'reachable'} = 1; + last; + } + } + $vector->{'share_witness'} = $sharewit; + # Enough with the analysis if they do share witnesses. + next if $sharewit; + + # Is one node reachable from the other, even if they don't share a + # witness? + $vector->{'reachable'} = $nextcomm{$rx->rank} == $nextcomm{$ry->rank} ? 0 : 1; + } + } + + # Construct the vector list + # Dimensions are textdiff, rankdiff, reachable, share_witness + + my( $i, $values ) = ( 0, 0 ); + my $vecindex = {}; + my @keys; + my @lines; + foreach my $rx ( keys %$grid ) { + foreach my $ry ( keys %{$grid->{$rx}} ) { + my $vec = $grid->{$rx}->{$ry}; + next if $vec->{'reachable'}; # Skip all but colocations + $vecindex->{++$i} = [ $rx, $ry ]; + # Note the number of keys from the first vector we look at. + unless( @keys ) { + @keys = sort keys( %$vec ); + } + # Construct the matrix. + my @fields; + foreach my $j ( 0 .. $#keys ) { + my $v = $vec->{$keys[$j]}; + if( $v ) { + push( @fields, $j+1, $v ); + $values++; + } + } + push( @lines, join( " ", @fields ) ); + } + } + + my $matrix .= "$i " . scalar( @keys ) . " $values\n"; + foreach( @lines ) { + $matrix .= "$_\n"; + } + + # Open a filehandle on the matrix string we get. + open( my $matrix_fh, '<', \$matrix ) or die "Could not open filehandle on string"; + + # Now send the remainder of the filehandle for calculation. + my $simmatrix = Text::SenseClusters::Simat::simat( $matrix_fh ); + + # Match the returned values with the actual node pair comparisons they refer to. + my $cosine_values = {}; + foreach my $x ( keys %$simmatrix ) { + foreach my $y ( keys %{$simmatrix->{$x}} ) { + next unless $x < $y; # skip self-comparison & duplicates + my $val = $simmatrix->{$x}->{$y}; + # If the value is below our threshold of interest, skip it. + next if $val < $threshold; + # Sort the lookup of a pair alphabetically by reading ID. + my ( $r1x, $r1y ) = sort @{$vecindex->{$x}}; + my ( $r2x, $r2y ) = sort @{$vecindex->{$y}}; + push( @{$cosine_values->{$r1x}->{$r1y}->{$val}}, [ $r2x, $r2y ] ); + push( @{$cosine_values->{$r2x}->{$r2y}->{$val}}, [ $r1x, $r1y ] ); + } + } + + return $cosine_values; +} + +sub calc_similarity { + my( $matrix, $vecindex ) = @_; + + my $values = {}; + foreach my $val ( sort { $a <=> $b } keys %$values ) { + my $pairlist = join( ', ', @{$values->{$val}} ); + say "$val: $pairlist"; + } +} \ No newline at end of file