Commit | Line | Data |
5412ad2e |
1 | #!/usr/bin/env perl |
2 | |
3 | use strict; |
4 | use warnings; |
5 | use feature 'say'; |
b12ef011 |
6 | use lib '/home/tla/cpanmods/Text-SenseClusters-1.03/lib'; |
5412ad2e |
7 | use Text::SenseClusters::Simat; |
8 | use Text::Tradition; |
9 | use Text::WagnerFischer qw/distance/; |
10 | |
11 | binmode STDOUT, ':utf8'; |
12 | binmode STDERR, ':utf8'; |
13 | |
14 | # Get our arguments |
15 | my( $traditionfile, $threshold ) = @ARGV; |
db80d3ec |
16 | $threshold = 0.99 unless $threshold; |
5412ad2e |
17 | |
18 | # Load up a tradition |
19 | my $t; |
20 | my $m; |
21 | |
22 | $t = Text::Tradition->new( |
23 | file => $traditionfile, |
24 | input => 'Self' ); |
25 | say STDERR "Parsed tradition file"; |
26 | my $c = $t->collation; |
27 | |
28 | # Get the cosine similarity values |
29 | my ( $matrix ) = make_matrix( $t ); |
30 | |
31 | # For each relationship in the graph, see how it compares to other node pairs |
db80d3ec |
32 | # rated > $threshold |
5412ad2e |
33 | |
34 | foreach my $pair ( $c->relationships ) { |
35 | my $rel = $c->get_relationship( $pair ); |
36 | my( $rx, $ry ) = map { $c->reading( $_ ) } sort @$pair; |
5412ad2e |
37 | |
38 | say STDERR "Checking relationship $rx -- $ry, of type " . $rel->type; |
39 | my $matches = 0; |
40 | my @matched_rels; |
41 | foreach my $val ( sort { $a<=>$b } keys %{$matrix->{"$rx"}->{"$ry"}} ) { |
42 | $matches++; |
43 | foreach my $mpair ( @{$matrix->{"$rx"}->{"$ry"}->{"$val"}} ) { |
44 | my $mrel = $c->get_relationship( $mpair ); |
45 | my $mreltype = $mrel ? $mrel->type : '(no relation)'; |
46 | my( $mx, $my ) = map { $c->reading( $_ ) } sort @$mpair; |
47 | push( @matched_rels, sprintf( "%f: %s (%s) -- %s (%s), type %s", |
48 | $val, $mx, $mx->text, $my, $my->text, $mreltype ) ); |
49 | } |
50 | } |
51 | say sprintf( "Matches for %s (%s) -- %s (%s), type %s", |
52 | $rx, $rx->text, $ry, $ry->text, $rel->type ); |
53 | foreach ( @matched_rels ) { |
54 | say "\t$_"; |
55 | } |
56 | } |
57 | |
58 | |
59 | sub make_matrix { |
60 | my @comm = $c->calculate_common_readings(); |
61 | my %nextcomm; |
62 | my $ri = 0; |
63 | foreach my $cr ( sort { $a->rank <=> $b->rank } @comm ) { |
64 | until( $cr->rank == $ri ) { |
65 | $nextcomm{$ri++} = $cr->rank; |
66 | } |
67 | } |
68 | until ( $ri == $c->end->rank ) { |
69 | $nextcomm{$ri++} = $c->end->rank; |
70 | } |
71 | |
72 | # Find all the relatable pairs |
73 | my $grid = {}; |
74 | my %analyzed; |
75 | my $rct = 0; |
76 | foreach my $rx ( $c->readings ) { |
5412ad2e |
77 | next if $rx->is_meta(); |
78 | # Have to compare each reading with each other, so do this only once |
79 | $analyzed{"$rx"} = 1; |
80 | $rct++; |
81 | say STDERR "Looking at reading $rct ( $rx )"; |
82 | foreach my $ry ( $c->readings ) { |
5412ad2e |
83 | next if $ry->is_meta(); |
84 | next if $analyzed{"$ry"}; |
85 | |
86 | # Get their textual and graph distances from each other. |
87 | my $vector = { |
88 | textdiff => distance( $rx->text, $ry->text ), |
89 | rankdiff => abs( $rx->rank - $ry->rank ) |
90 | }; |
91 | $grid->{"$rx"}->{"$ry"} = $vector; |
92 | |
93 | # Do they share one or more witnesses? |
94 | my %rxwits; |
95 | my $sharewit = 0; |
96 | map { $rxwits{$_} = 1 } $rx->witnesses; |
97 | foreach my $rywit ( $ry->witnesses ) { |
98 | if( $rxwits{$rywit} ) { |
99 | $sharewit++; |
100 | $vector->{'reachable'} = 1; |
101 | last; |
102 | } |
103 | } |
104 | $vector->{'share_witness'} = $sharewit; |
105 | # Enough with the analysis if they do share witnesses. |
106 | next if $sharewit; |
107 | |
108 | # Is one node reachable from the other, even if they don't share a |
109 | # witness? |
110 | $vector->{'reachable'} = $nextcomm{$rx->rank} == $nextcomm{$ry->rank} ? 0 : 1; |
111 | } |
112 | } |
113 | |
114 | # Construct the vector list |
115 | # Dimensions are textdiff, rankdiff, reachable, share_witness |
116 | |
117 | my( $i, $values ) = ( 0, 0 ); |
118 | my $vecindex = {}; |
119 | my @keys; |
120 | my @lines; |
121 | foreach my $rx ( keys %$grid ) { |
122 | foreach my $ry ( keys %{$grid->{$rx}} ) { |
123 | my $vec = $grid->{$rx}->{$ry}; |
124 | next if $vec->{'reachable'}; # Skip all but colocations |
125 | $vecindex->{++$i} = [ $rx, $ry ]; |
126 | # Note the number of keys from the first vector we look at. |
127 | unless( @keys ) { |
128 | @keys = sort keys( %$vec ); |
129 | } |
130 | # Construct the matrix. |
131 | my @fields; |
132 | foreach my $j ( 0 .. $#keys ) { |
133 | my $v = $vec->{$keys[$j]}; |
134 | if( $v ) { |
135 | push( @fields, $j+1, $v ); |
136 | $values++; |
137 | } |
138 | } |
139 | push( @lines, join( " ", @fields ) ); |
140 | } |
141 | } |
142 | |
143 | my $matrix .= "$i " . scalar( @keys ) . " $values\n"; |
144 | foreach( @lines ) { |
145 | $matrix .= "$_\n"; |
146 | } |
147 | |
148 | # Open a filehandle on the matrix string we get. |
149 | open( my $matrix_fh, '<', \$matrix ) or die "Could not open filehandle on string"; |
150 | |
151 | # Now send the remainder of the filehandle for calculation. |
152 | my $simmatrix = Text::SenseClusters::Simat::simat( $matrix_fh ); |
153 | |
154 | # Match the returned values with the actual node pair comparisons they refer to. |
155 | my $cosine_values = {}; |
156 | foreach my $x ( keys %$simmatrix ) { |
157 | foreach my $y ( keys %{$simmatrix->{$x}} ) { |
158 | next unless $x < $y; # skip self-comparison & duplicates |
159 | my $val = $simmatrix->{$x}->{$y}; |
160 | # If the value is below our threshold of interest, skip it. |
161 | next if $val < $threshold; |
162 | # Sort the lookup of a pair alphabetically by reading ID. |
163 | my ( $r1x, $r1y ) = sort @{$vecindex->{$x}}; |
164 | my ( $r2x, $r2y ) = sort @{$vecindex->{$y}}; |
165 | push( @{$cosine_values->{$r1x}->{$r1y}->{$val}}, [ $r2x, $r2y ] ); |
166 | push( @{$cosine_values->{$r2x}->{$r2y}->{$val}}, [ $r1x, $r1y ] ); |
167 | } |
168 | } |
169 | |
170 | return $cosine_values; |
171 | } |