Commit | Line | Data |
e58153d6 |
1 | package Text::Tradition::Graph::Position; |
a25d4374 |
2 | |
3 | use strict; |
4 | use warnings; |
5 | |
6 | =head1 NAME |
7 | |
e58153d6 |
8 | Text::Tradition::Graph::Position |
a25d4374 |
9 | |
10 | =head1 SUMMARY |
11 | |
12 | An object to go with a text graph that keeps track of relative |
13 | positions of the nodes. |
14 | |
15 | =head1 METHODS |
16 | |
17 | =over 4 |
18 | |
19 | =item B<new> |
20 | |
21 | Takes two arguments: a list of names of common nodes in the graph, and |
22 | a list of witness paths. Calculates position identifiers for each |
23 | node based on this. |
24 | |
25 | =cut |
26 | |
27 | sub new { |
28 | my $proto = shift; |
29 | my( $common_nodes, $witness_paths ) = @_; |
30 | |
31 | my $self = {}; |
32 | |
33 | # We have to calculate the position identifiers for each word, |
34 | # keyed on the common nodes. This will be 'fun'. The end result |
35 | # is a hash per witness, whose key is the word node and whose |
36 | # value is its position in the text. Common nodes are always N,1 |
37 | # so have identical positions in each text. |
38 | |
39 | my $node_pos = {}; |
40 | foreach my $wit ( keys %$witness_paths ) { |
41 | # First we walk each path, making a matrix for each witness that |
42 | # corresponds to its eventual position identifier. Common nodes |
43 | # always start a new row, and are thus always in the first column. |
44 | |
45 | my $wit_matrix = []; |
46 | my $cn = 0; # We should hit the common nodes in order. |
47 | my $row = []; |
48 | foreach my $wn ( @{$witness_paths->{$wit}} ) { # $wn is a node name |
49 | if( $wn eq $common_nodes->[$cn] ) { |
50 | # Set up to look for the next common node, and |
51 | # start a new row of words. |
52 | $cn++; |
53 | push( @$wit_matrix, $row ) if scalar( @$row ); |
54 | $row = []; |
55 | } |
56 | push( @$row, $wn ); |
57 | } |
58 | push( @$wit_matrix, $row ); # Push the last row onto the matrix |
59 | |
60 | # Now we have a matrix per witness, so that each row in the |
61 | # matrix begins with a common node, and continues with all the |
62 | # variant words that appear in the witness. We turn this into |
63 | # real positions in row,cell format. But we need some |
64 | # trickery in order to make sure that each node gets assigned |
65 | # to only one position. |
66 | |
67 | foreach my $li ( 1..scalar(@$wit_matrix) ) { |
68 | foreach my $di ( 1..scalar(@{$wit_matrix->[$li-1]}) ) { |
69 | my $node = $wit_matrix->[$li-1]->[$di-1]; |
70 | my $position = "$li,$di"; |
71 | # If we have seen this node before, we need to compare |
72 | # its position with what went before. |
73 | unless( exists $node_pos->{ $node } && |
74 | _cmp_position( $position, $node_pos->{ $node }) < 1 ) { |
75 | # The new position ID replaces the old one. |
76 | $node_pos->{$node} = $position; |
77 | } # otherwise, the old position needs to stay. |
78 | } |
79 | } |
80 | } |
58a3c424 |
81 | |
a25d4374 |
82 | # Now we have a hash of node positions keyed on node. |
83 | $self->{'node_positions'} = $node_pos; |
58a3c424 |
84 | # We should also save our witness paths, as long as we have them. |
85 | # Right now each path is a list of nodes; we may want to make it |
86 | # a list of position refs. |
a25d4374 |
87 | $self->{'witness_paths'} = $witness_paths; |
88 | |
58a3c424 |
89 | # We are also going to want to keep track of whether a position has |
90 | # been explicitly emptied, for our lemmatization. |
91 | my $position_state = {}; |
92 | map { $position_state->{ $_ } = undef } values %$node_pos; |
93 | $self->{'position_state'} = $position_state; |
94 | |
95 | |
a25d4374 |
96 | bless( $self, $proto ); |
97 | return $self; |
98 | } |
99 | |
100 | sub node_position { |
101 | my( $self, $node ) = @_; |
102 | $node = _name( $node ); |
103 | |
104 | unless( exists( $self->{'node_positions'}->{ $node } ) ) { |
105 | warn "No node with name $node known to the graph"; |
106 | return; |
107 | } |
108 | |
109 | return $self->{'node_positions'}->{ $node }; |
110 | } |
111 | |
112 | sub nodes_at_position { |
113 | my( $self, $pos ) = @_; |
114 | |
115 | my $positions = $self->calc_positions(); |
116 | unless( exists $positions->{ $pos } ) { |
117 | warn "No position $pos in the graph"; |
118 | return; |
119 | } |
120 | return @{ $positions->{ $pos }}; |
121 | } |
122 | |
123 | sub colocated_nodes { |
124 | my( $self, $node ) = @_; |
125 | $node = _name( $node ); |
126 | my $pos = $self->node_position( $node ); |
127 | my @loc_nodes = $self->nodes_at_position( $pos ); |
128 | |
129 | my @cn = grep { $_ !~ /^$node$/ } @loc_nodes; |
130 | return @cn; |
131 | } |
132 | |
58a3c424 |
133 | # Returns an ordered list of positions in this graph |
a25d4374 |
134 | sub all { |
135 | my( $self ) = @_; |
136 | my $pos = $self->calc_positions; |
137 | return sort by_position keys( %$pos ); |
138 | } |
139 | |
58a3c424 |
140 | # Returns undef if no decision has been taken on this position, the |
141 | # node name if there is a lemma for it, and 0 if there is no lemma for |
142 | # it. |
143 | sub state { |
144 | my( $self, $pos ) = @_; |
145 | return $self->{'position_state'}->{ $pos }; |
146 | } |
147 | |
148 | sub set_state { |
149 | my( $self, $pos, $state ) = @_; |
150 | $self->{'position_state'}->{ $pos } = $state; |
151 | } |
152 | |
153 | sub init_lemmatizer { |
154 | my( $self, @nodes ) = @_; |
155 | foreach my $n ( @nodes ) { |
156 | $self->set_state( $self->node_position( $n ), $n ); |
157 | } |
158 | } |
159 | |
a25d4374 |
160 | sub witness_path { |
161 | my( $self, $wit ) = @_; |
162 | return @{$self->{'witness_paths'}->{ $wit }}; |
163 | } |
164 | |
165 | # At some point I may find myself using scalar references for the node |
166 | # positions, in order to keep them easily in sync. Just in case, I will |
167 | # calculate this every time I need it. |
168 | sub calc_positions { |
169 | my $self = shift; |
170 | return _invert_hash( $self->{'node_positions'} ) |
171 | } |
172 | |
173 | # Helper for dealing with node refs |
174 | sub _name { |
175 | my( $node ) = @_; |
176 | # We work with node names in this library |
177 | if( ref( $node ) && ref( $node ) eq 'Graph::Easy::Node' ) { |
178 | $node = $node->name(); |
179 | } |
180 | return $node; |
181 | } |
182 | |
183 | ### Comparison functions |
184 | |
185 | # Compares two nodes according to their positions in the witness |
186 | # index hash. |
187 | sub by_position { |
188 | my $self = shift; |
189 | return _cmp_position( $a, $b ); |
190 | } |
191 | |
192 | # Takes two position strings (X,Y) and sorts them. |
193 | sub _cmp_position { |
194 | my( $a, $b ) = @_; |
195 | my @pos_a = split(/,/, $a ); |
196 | my @pos_b = split(/,/, $b ); |
197 | |
198 | my $big_cmp = $pos_a[0] <=> $pos_b[0]; |
199 | return $big_cmp if $big_cmp; |
200 | # else |
201 | return $pos_a[1] <=> $pos_b[1]; |
202 | } |
203 | |
204 | # Useful helper. Will be especially useful if I find myself using |
205 | # scalar references for the positions after all - it can dereference |
206 | # them here. |
207 | sub _invert_hash { |
208 | my ( $hash, $plaintext_keys ) = @_; |
209 | my %new_hash; |
210 | foreach my $key ( keys %$hash ) { |
211 | my $val = $hash->{$key}; |
212 | my $valkey = $val; |
213 | if( $plaintext_keys |
214 | && ref( $val ) ) { |
215 | $valkey = $plaintext_keys->{ scalar( $val ) }; |
216 | warn( "No plaintext value given for $val" ) unless $valkey; |
217 | } |
218 | if( exists ( $new_hash{$valkey} ) ) { |
219 | push( @{$new_hash{$valkey}}, $key ); |
220 | } else { |
221 | $new_hash{$valkey} = [ $key ]; |
222 | } |
223 | } |
224 | return \%new_hash; |
225 | } |
226 | |
227 | 1; |