initial commit of IDP scripts for stemma analysis
[scpubgit/stemmatology.git] / base / script / vectors.pl
CommitLineData
5412ad2e 1#!/usr/bin/env perl
2
3use strict;
4use warnings;
5use feature 'say';
b12ef011 6use lib '/home/tla/cpanmods/Text-SenseClusters-1.03/lib';
5412ad2e 7use Text::SenseClusters::Simat;
8use Text::Tradition;
9use Text::WagnerFischer qw/distance/;
10
11binmode STDOUT, ':utf8';
12binmode STDERR, ':utf8';
13
14# Get our arguments
15my( $traditionfile, $threshold ) = @ARGV;
db80d3ec 16$threshold = 0.99 unless $threshold;
5412ad2e 17
18# Load up a tradition
19my $t;
20my $m;
21
22$t = Text::Tradition->new(
23 file => $traditionfile,
24 input => 'Self' );
25say STDERR "Parsed tradition file";
26my $c = $t->collation;
27
28# Get the cosine similarity values
29my ( $matrix ) = make_matrix( $t );
30
31# For each relationship in the graph, see how it compares to other node pairs
db80d3ec 32# rated > $threshold
5412ad2e 33
34foreach my $pair ( $c->relationships ) {
35 my $rel = $c->get_relationship( $pair );
36 my( $rx, $ry ) = map { $c->reading( $_ ) } sort @$pair;
5412ad2e 37
38 say STDERR "Checking relationship $rx -- $ry, of type " . $rel->type;
39 my $matches = 0;
40 my @matched_rels;
41 foreach my $val ( sort { $a<=>$b } keys %{$matrix->{"$rx"}->{"$ry"}} ) {
42 $matches++;
43 foreach my $mpair ( @{$matrix->{"$rx"}->{"$ry"}->{"$val"}} ) {
44 my $mrel = $c->get_relationship( $mpair );
45 my $mreltype = $mrel ? $mrel->type : '(no relation)';
46 my( $mx, $my ) = map { $c->reading( $_ ) } sort @$mpair;
47 push( @matched_rels, sprintf( "%f: %s (%s) -- %s (%s), type %s",
48 $val, $mx, $mx->text, $my, $my->text, $mreltype ) );
49 }
50 }
51 say sprintf( "Matches for %s (%s) -- %s (%s), type %s",
52 $rx, $rx->text, $ry, $ry->text, $rel->type );
53 foreach ( @matched_rels ) {
54 say "\t$_";
55 }
56}
57
58
59sub make_matrix {
60 my @comm = $c->calculate_common_readings();
61 my %nextcomm;
62 my $ri = 0;
63 foreach my $cr ( sort { $a->rank <=> $b->rank } @comm ) {
64 until( $cr->rank == $ri ) {
65 $nextcomm{$ri++} = $cr->rank;
66 }
67 }
68 until ( $ri == $c->end->rank ) {
69 $nextcomm{$ri++} = $c->end->rank;
70 }
71
72 # Find all the relatable pairs
73 my $grid = {};
74 my %analyzed;
75 my $rct = 0;
76 foreach my $rx ( $c->readings ) {
5412ad2e 77 next if $rx->is_meta();
78 # Have to compare each reading with each other, so do this only once
79 $analyzed{"$rx"} = 1;
80 $rct++;
81 say STDERR "Looking at reading $rct ( $rx )";
82 foreach my $ry ( $c->readings ) {
5412ad2e 83 next if $ry->is_meta();
84 next if $analyzed{"$ry"};
85
86 # Get their textual and graph distances from each other.
87 my $vector = {
88 textdiff => distance( $rx->text, $ry->text ),
89 rankdiff => abs( $rx->rank - $ry->rank )
90 };
91 $grid->{"$rx"}->{"$ry"} = $vector;
92
93 # Do they share one or more witnesses?
94 my %rxwits;
95 my $sharewit = 0;
96 map { $rxwits{$_} = 1 } $rx->witnesses;
97 foreach my $rywit ( $ry->witnesses ) {
98 if( $rxwits{$rywit} ) {
99 $sharewit++;
100 $vector->{'reachable'} = 1;
101 last;
102 }
103 }
104 $vector->{'share_witness'} = $sharewit;
105 # Enough with the analysis if they do share witnesses.
106 next if $sharewit;
107
108 # Is one node reachable from the other, even if they don't share a
109 # witness?
110 $vector->{'reachable'} = $nextcomm{$rx->rank} == $nextcomm{$ry->rank} ? 0 : 1;
111 }
112 }
113
114 # Construct the vector list
115 # Dimensions are textdiff, rankdiff, reachable, share_witness
116
117 my( $i, $values ) = ( 0, 0 );
118 my $vecindex = {};
119 my @keys;
120 my @lines;
121 foreach my $rx ( keys %$grid ) {
122 foreach my $ry ( keys %{$grid->{$rx}} ) {
123 my $vec = $grid->{$rx}->{$ry};
124 next if $vec->{'reachable'}; # Skip all but colocations
125 $vecindex->{++$i} = [ $rx, $ry ];
126 # Note the number of keys from the first vector we look at.
127 unless( @keys ) {
128 @keys = sort keys( %$vec );
129 }
130 # Construct the matrix.
131 my @fields;
132 foreach my $j ( 0 .. $#keys ) {
133 my $v = $vec->{$keys[$j]};
134 if( $v ) {
135 push( @fields, $j+1, $v );
136 $values++;
137 }
138 }
139 push( @lines, join( " ", @fields ) );
140 }
141 }
142
143 my $matrix .= "$i " . scalar( @keys ) . " $values\n";
144 foreach( @lines ) {
145 $matrix .= "$_\n";
146 }
147
148 # Open a filehandle on the matrix string we get.
149 open( my $matrix_fh, '<', \$matrix ) or die "Could not open filehandle on string";
150
151 # Now send the remainder of the filehandle for calculation.
152 my $simmatrix = Text::SenseClusters::Simat::simat( $matrix_fh );
153
154 # Match the returned values with the actual node pair comparisons they refer to.
155 my $cosine_values = {};
156 foreach my $x ( keys %$simmatrix ) {
157 foreach my $y ( keys %{$simmatrix->{$x}} ) {
158 next unless $x < $y; # skip self-comparison & duplicates
159 my $val = $simmatrix->{$x}->{$y};
160 # If the value is below our threshold of interest, skip it.
161 next if $val < $threshold;
162 # Sort the lookup of a pair alphabetically by reading ID.
163 my ( $r1x, $r1y ) = sort @{$vecindex->{$x}};
164 my ( $r2x, $r2y ) = sort @{$vecindex->{$y}};
165 push( @{$cosine_values->{$r1x}->{$r1y}->{$val}}, [ $r2x, $r2y ] );
166 push( @{$cosine_values->{$r2x}->{$r2y}->{$val}}, [ $r1x, $r1y ] );
167 }
168 }
169
170 return $cosine_values;
171}