Commit | Line | Data |
e58153d6 |
1 | package Text::Tradition::Graph::Position; |
a25d4374 |
2 | |
3 | use strict; |
4 | use warnings; |
5 | |
6 | =head1 NAME |
7 | |
e58153d6 |
8 | Text::Tradition::Graph::Position |
a25d4374 |
9 | |
10 | =head1 SUMMARY |
11 | |
12 | An object to go with a text graph that keeps track of relative |
2ceca8c3 |
13 | positions of the nodes on that graph. This is useful for keeping |
14 | track of which readings are variants of each other, which is expensive |
15 | to calculate every time from the graph itself. |
a25d4374 |
16 | |
17 | =head1 METHODS |
18 | |
19 | =over 4 |
20 | |
21 | =item B<new> |
22 | |
23 | Takes two arguments: a list of names of common nodes in the graph, and |
24 | a list of witness paths. Calculates position identifiers for each |
25 | node based on this. |
26 | |
27 | =cut |
28 | |
2ceca8c3 |
29 | # TODO Why not just hand over the graph and calculate the common nodes |
30 | # and witness paths here? |
a25d4374 |
31 | sub new { |
32 | my $proto = shift; |
33 | my( $common_nodes, $witness_paths ) = @_; |
34 | |
35 | my $self = {}; |
36 | |
37 | # We have to calculate the position identifiers for each word, |
38 | # keyed on the common nodes. This will be 'fun'. The end result |
39 | # is a hash per witness, whose key is the word node and whose |
40 | # value is its position in the text. Common nodes are always N,1 |
41 | # so have identical positions in each text. |
42 | |
43 | my $node_pos = {}; |
44 | foreach my $wit ( keys %$witness_paths ) { |
45 | # First we walk each path, making a matrix for each witness that |
46 | # corresponds to its eventual position identifier. Common nodes |
47 | # always start a new row, and are thus always in the first column. |
48 | |
49 | my $wit_matrix = []; |
50 | my $cn = 0; # We should hit the common nodes in order. |
51 | my $row = []; |
52 | foreach my $wn ( @{$witness_paths->{$wit}} ) { # $wn is a node name |
53 | if( $wn eq $common_nodes->[$cn] ) { |
54 | # Set up to look for the next common node, and |
55 | # start a new row of words. |
56 | $cn++; |
57 | push( @$wit_matrix, $row ) if scalar( @$row ); |
58 | $row = []; |
59 | } |
60 | push( @$row, $wn ); |
61 | } |
62 | push( @$wit_matrix, $row ); # Push the last row onto the matrix |
63 | |
64 | # Now we have a matrix per witness, so that each row in the |
65 | # matrix begins with a common node, and continues with all the |
66 | # variant words that appear in the witness. We turn this into |
67 | # real positions in row,cell format. But we need some |
68 | # trickery in order to make sure that each node gets assigned |
69 | # to only one position. |
70 | |
71 | foreach my $li ( 1..scalar(@$wit_matrix) ) { |
72 | foreach my $di ( 1..scalar(@{$wit_matrix->[$li-1]}) ) { |
73 | my $node = $wit_matrix->[$li-1]->[$di-1]; |
74 | my $position = "$li,$di"; |
75 | # If we have seen this node before, we need to compare |
76 | # its position with what went before. |
77 | unless( exists $node_pos->{ $node } && |
78 | _cmp_position( $position, $node_pos->{ $node }) < 1 ) { |
79 | # The new position ID replaces the old one. |
80 | $node_pos->{$node} = $position; |
81 | } # otherwise, the old position needs to stay. |
82 | } |
83 | } |
84 | } |
58a3c424 |
85 | |
a25d4374 |
86 | # Now we have a hash of node positions keyed on node. |
87 | $self->{'node_positions'} = $node_pos; |
58a3c424 |
88 | # We should also save our witness paths, as long as we have them. |
89 | # Right now each path is a list of nodes; we may want to make it |
90 | # a list of position refs. |
a25d4374 |
91 | $self->{'witness_paths'} = $witness_paths; |
92 | |
58a3c424 |
93 | # We are also going to want to keep track of whether a position has |
94 | # been explicitly emptied, for our lemmatization. |
95 | my $position_state = {}; |
96 | map { $position_state->{ $_ } = undef } values %$node_pos; |
97 | $self->{'position_state'} = $position_state; |
98 | |
99 | |
a25d4374 |
100 | bless( $self, $proto ); |
101 | return $self; |
102 | } |
103 | |
2ceca8c3 |
104 | =item B<node_position> |
105 | |
106 | my $pos = $positions->node_position( $node ); |
107 | |
108 | Returns the position identifier for a given node in the graph. |
109 | |
110 | =cut |
111 | |
a25d4374 |
112 | sub node_position { |
113 | my( $self, $node ) = @_; |
114 | $node = _name( $node ); |
115 | |
116 | unless( exists( $self->{'node_positions'}->{ $node } ) ) { |
117 | warn "No node with name $node known to the graph"; |
118 | return; |
119 | } |
120 | |
121 | return $self->{'node_positions'}->{ $node }; |
122 | } |
123 | |
2ceca8c3 |
124 | =item B<nodes_at_position> |
125 | |
126 | my @nodes = $positions->nodes_at_position( $pos ); |
127 | |
128 | Returns the names of all the nodes in the graph at a given position. |
129 | |
130 | =cut |
131 | |
a25d4374 |
132 | sub nodes_at_position { |
133 | my( $self, $pos ) = @_; |
134 | |
2ceca8c3 |
135 | my $positions = $self->_calc_positions(); |
a25d4374 |
136 | unless( exists $positions->{ $pos } ) { |
137 | warn "No position $pos in the graph"; |
138 | return; |
139 | } |
140 | return @{ $positions->{ $pos }}; |
141 | } |
142 | |
2ceca8c3 |
143 | =item B<colocated_nodes> |
144 | |
145 | my @nodes = $positions->colocated_nodes( $node ); |
146 | |
147 | Returns the names of all the nodes in the graph at the same position |
148 | as the node given, apart from that node itself. |
149 | |
150 | =cut |
151 | |
a25d4374 |
152 | sub colocated_nodes { |
153 | my( $self, $node ) = @_; |
154 | $node = _name( $node ); |
155 | my $pos = $self->node_position( $node ); |
156 | my @loc_nodes = $self->nodes_at_position( $pos ); |
157 | |
158 | my @cn = grep { $_ !~ /^$node$/ } @loc_nodes; |
159 | return @cn; |
160 | } |
161 | |
2ceca8c3 |
162 | =item B<all> |
163 | |
164 | my @position_list = $positions->all() |
165 | |
166 | Returns an ordered list of positions in the graph. |
167 | |
168 | =cut |
169 | |
a25d4374 |
170 | sub all { |
171 | my( $self ) = @_; |
2ceca8c3 |
172 | my $pos = $self->_calc_positions; |
a25d4374 |
173 | return sort by_position keys( %$pos ); |
174 | } |
175 | |
2ceca8c3 |
176 | sub witness_path { |
177 | my( $self, $wit ) = @_; |
178 | return @{$self->{'witness_paths'}->{ $wit }}; |
58a3c424 |
179 | } |
180 | |
2ceca8c3 |
181 | =back |
182 | |
183 | =head2 Lemmatization functions |
184 | |
185 | For some traditions, each position will have at least one node that is |
186 | the 'lemma text', that is, the text that an editor has chosen to stand |
187 | as authoritative for the tradition. The following methods keep |
188 | track of what lemma, if any, should stand at each position. |
189 | |
190 | =over |
58a3c424 |
191 | |
2ceca8c3 |
192 | =item B<init_lemmatizer> |
193 | |
194 | $positions->init_lemmatizer( @nodelist ) |
195 | |
196 | Sets up the necessary logic for keeping track of lemmas. It should be |
197 | called once, with the initial list of lemmas. |
198 | |
199 | =cut |
200 | |
201 | # TODO We can initialize this without the argument, based on the passed |
202 | # list of common nodes. |
58a3c424 |
203 | sub init_lemmatizer { |
204 | my( $self, @nodes ) = @_; |
205 | foreach my $n ( @nodes ) { |
206 | $self->set_state( $self->node_position( $n ), $n ); |
207 | } |
208 | } |
209 | |
2ceca8c3 |
210 | =item B<state> |
a25d4374 |
211 | |
2ceca8c3 |
212 | my $answer = $positions->state( $position_id ) |
213 | |
214 | For the given position ID, returns the node (if any) that stands at |
215 | the lemma. If no node should stand as lemma at this position, returns |
216 | 0; if no decision has been made for this position, returns undef. |
217 | |
218 | =cut |
219 | |
220 | sub state { |
221 | my( $self, $pos ) = @_; |
222 | return $self->{'position_state'}->{ $pos }; |
a25d4374 |
223 | } |
224 | |
2ceca8c3 |
225 | =item B<set_state> |
226 | |
227 | $positions->set_state( $position_id, $state ) |
228 | |
229 | For the given position ID, sets the lemma (if any). State can be the |
230 | name of a node, 0 (for cases when no lemma should stand), or undef |
231 | (for cases when no decision has been made). |
232 | |
233 | =cut |
234 | |
235 | sub set_state { |
236 | my( $self, $pos, $state ) = @_; |
237 | $self->{'position_state'}->{ $pos } = $state; |
a25d4374 |
238 | } |
239 | |
2ceca8c3 |
240 | =back |
241 | |
242 | =head2 Comparison function |
243 | |
244 | =over |
245 | |
246 | =item B<by_position> |
247 | |
248 | my @nodelist = sort $positions->by_position @nodelist; |
249 | |
250 | For use in the 'sort' function. Returns a comparison value based on |
251 | the position of the given nodes. |
252 | |
253 | =cut |
a25d4374 |
254 | |
255 | # Compares two nodes according to their positions in the witness |
256 | # index hash. |
257 | sub by_position { |
258 | my $self = shift; |
259 | return _cmp_position( $a, $b ); |
260 | } |
261 | |
262 | # Takes two position strings (X,Y) and sorts them. |
263 | sub _cmp_position { |
264 | my( $a, $b ) = @_; |
265 | my @pos_a = split(/,/, $a ); |
266 | my @pos_b = split(/,/, $b ); |
267 | |
268 | my $big_cmp = $pos_a[0] <=> $pos_b[0]; |
269 | return $big_cmp if $big_cmp; |
270 | # else |
271 | return $pos_a[1] <=> $pos_b[1]; |
272 | } |
273 | |
2ceca8c3 |
274 | |
275 | #### HELPER FUNCTIONS #### |
276 | |
277 | # At some point I may find myself using scalar references for the node |
278 | # positions, in order to keep them easily in sync. Just in case, I will |
279 | # calculate this every time I need it. |
280 | sub _calc_positions { |
281 | my $self = shift; |
282 | return _invert_hash( $self->{'node_positions'} ) |
283 | } |
284 | |
285 | # Helper for dealing with node refs |
286 | sub _name { |
287 | my( $node ) = @_; |
288 | # We work with node names in this library |
289 | if( ref( $node ) && ref( $node ) eq 'Graph::Easy::Node' ) { |
290 | $node = $node->name(); |
291 | } |
292 | return $node; |
293 | } |
294 | |
a25d4374 |
295 | # Useful helper. Will be especially useful if I find myself using |
296 | # scalar references for the positions after all - it can dereference |
297 | # them here. |
298 | sub _invert_hash { |
299 | my ( $hash, $plaintext_keys ) = @_; |
300 | my %new_hash; |
301 | foreach my $key ( keys %$hash ) { |
302 | my $val = $hash->{$key}; |
303 | my $valkey = $val; |
304 | if( $plaintext_keys |
305 | && ref( $val ) ) { |
306 | $valkey = $plaintext_keys->{ scalar( $val ) }; |
307 | warn( "No plaintext value given for $val" ) unless $valkey; |
308 | } |
309 | if( exists ( $new_hash{$valkey} ) ) { |
310 | push( @{$new_hash{$valkey}}, $key ); |
311 | } else { |
312 | $new_hash{$valkey} = [ $key ]; |
313 | } |
314 | } |
315 | return \%new_hash; |
316 | } |
317 | |
2ceca8c3 |
318 | =back |
319 | |
320 | =head1 LICENSE |
321 | |
322 | This package is free software and is provided "as is" without express |
323 | or implied warranty. You can redistribute it and/or modify it under |
324 | the same terms as Perl itself. |
325 | |
326 | =head1 AUTHOR |
327 | |
328 | Tara L Andrews, aurum@cpan.org |
329 | |
330 | =cut |
331 | |
a25d4374 |
332 | 1; |