tests passing with new library, yay
[scpubgit/stemmatology.git] / lib / Text / Tradition / Graph / Position.pm
CommitLineData
e58153d6 1package Text::Tradition::Graph::Position;
a25d4374 2
3use strict;
4use warnings;
5
6=head1 NAME
7
e58153d6 8Text::Tradition::Graph::Position
a25d4374 9
10=head1 SUMMARY
11
12An object to go with a text graph that keeps track of relative
2ceca8c3 13positions of the nodes on that graph. This is useful for keeping
14track of which readings are variants of each other, which is expensive
15to calculate every time from the graph itself.
a25d4374 16
17=head1 METHODS
18
19=over 4
20
21=item B<new>
22
23Takes two arguments: a list of names of common nodes in the graph, and
24a list of witness paths. Calculates position identifiers for each
25node based on this.
26
27=cut
28
2ceca8c3 29# TODO Why not just hand over the graph and calculate the common nodes
30# and witness paths here?
a25d4374 31sub new {
32 my $proto = shift;
33 my( $common_nodes, $witness_paths ) = @_;
34
35 my $self = {};
36
37 # We have to calculate the position identifiers for each word,
38 # keyed on the common nodes. This will be 'fun'. The end result
39 # is a hash per witness, whose key is the word node and whose
40 # value is its position in the text. Common nodes are always N,1
41 # so have identical positions in each text.
42
43 my $node_pos = {};
44 foreach my $wit ( keys %$witness_paths ) {
45 # First we walk each path, making a matrix for each witness that
46 # corresponds to its eventual position identifier. Common nodes
47 # always start a new row, and are thus always in the first column.
48
49 my $wit_matrix = [];
50 my $cn = 0; # We should hit the common nodes in order.
51 my $row = [];
52 foreach my $wn ( @{$witness_paths->{$wit}} ) { # $wn is a node name
53 if( $wn eq $common_nodes->[$cn] ) {
54 # Set up to look for the next common node, and
55 # start a new row of words.
56 $cn++;
57 push( @$wit_matrix, $row ) if scalar( @$row );
58 $row = [];
59 }
60 push( @$row, $wn );
61 }
62 push( @$wit_matrix, $row ); # Push the last row onto the matrix
63
64 # Now we have a matrix per witness, so that each row in the
65 # matrix begins with a common node, and continues with all the
66 # variant words that appear in the witness. We turn this into
67 # real positions in row,cell format. But we need some
68 # trickery in order to make sure that each node gets assigned
69 # to only one position.
70
71 foreach my $li ( 1..scalar(@$wit_matrix) ) {
72 foreach my $di ( 1..scalar(@{$wit_matrix->[$li-1]}) ) {
73 my $node = $wit_matrix->[$li-1]->[$di-1];
74 my $position = "$li,$di";
75 # If we have seen this node before, we need to compare
76 # its position with what went before.
77 unless( exists $node_pos->{ $node } &&
78 _cmp_position( $position, $node_pos->{ $node }) < 1 ) {
79 # The new position ID replaces the old one.
80 $node_pos->{$node} = $position;
81 } # otherwise, the old position needs to stay.
82 }
83 }
84 }
58a3c424 85
a25d4374 86 # Now we have a hash of node positions keyed on node.
87 $self->{'node_positions'} = $node_pos;
58a3c424 88 # We should also save our witness paths, as long as we have them.
89 # Right now each path is a list of nodes; we may want to make it
90 # a list of position refs.
a25d4374 91 $self->{'witness_paths'} = $witness_paths;
92
58a3c424 93 # We are also going to want to keep track of whether a position has
94 # been explicitly emptied, for our lemmatization.
95 my $position_state = {};
96 map { $position_state->{ $_ } = undef } values %$node_pos;
97 $self->{'position_state'} = $position_state;
98
99
a25d4374 100 bless( $self, $proto );
101 return $self;
102}
103
2ceca8c3 104=item B<node_position>
105
106my $pos = $positions->node_position( $node );
107
108Returns the position identifier for a given node in the graph.
109
110=cut
111
a25d4374 112sub node_position {
113 my( $self, $node ) = @_;
114 $node = _name( $node );
115
116 unless( exists( $self->{'node_positions'}->{ $node } ) ) {
117 warn "No node with name $node known to the graph";
118 return;
119 }
120
121 return $self->{'node_positions'}->{ $node };
122}
123
2ceca8c3 124=item B<nodes_at_position>
125
126my @nodes = $positions->nodes_at_position( $pos );
127
128Returns the names of all the nodes in the graph at a given position.
129
130=cut
131
a25d4374 132sub nodes_at_position {
133 my( $self, $pos ) = @_;
134
2ceca8c3 135 my $positions = $self->_calc_positions();
a25d4374 136 unless( exists $positions->{ $pos } ) {
137 warn "No position $pos in the graph";
138 return;
139 }
140 return @{ $positions->{ $pos }};
141}
142
2ceca8c3 143=item B<colocated_nodes>
144
145my @nodes = $positions->colocated_nodes( $node );
146
147Returns the names of all the nodes in the graph at the same position
148as the node given, apart from that node itself.
149
150=cut
151
a25d4374 152sub colocated_nodes {
153 my( $self, $node ) = @_;
154 $node = _name( $node );
155 my $pos = $self->node_position( $node );
156 my @loc_nodes = $self->nodes_at_position( $pos );
157
158 my @cn = grep { $_ !~ /^$node$/ } @loc_nodes;
159 return @cn;
160}
161
2ceca8c3 162=item B<all>
163
164my @position_list = $positions->all()
165
166Returns an ordered list of positions in the graph.
167
168=cut
169
a25d4374 170sub all {
171 my( $self ) = @_;
2ceca8c3 172 my $pos = $self->_calc_positions;
a25d4374 173 return sort by_position keys( %$pos );
174}
175
2ceca8c3 176sub witness_path {
177 my( $self, $wit ) = @_;
178 return @{$self->{'witness_paths'}->{ $wit }};
58a3c424 179}
180
2ceca8c3 181=back
182
183=head2 Lemmatization functions
184
185For some traditions, each position will have at least one node that is
186the 'lemma text', that is, the text that an editor has chosen to stand
187as authoritative for the tradition. The following methods keep
188track of what lemma, if any, should stand at each position.
189
190=over
58a3c424 191
2ceca8c3 192=item B<init_lemmatizer>
193
194$positions->init_lemmatizer( @nodelist )
195
196Sets up the necessary logic for keeping track of lemmas. It should be
197called once, with the initial list of lemmas.
198
199=cut
200
201# TODO We can initialize this without the argument, based on the passed
202# list of common nodes.
58a3c424 203sub init_lemmatizer {
204 my( $self, @nodes ) = @_;
205 foreach my $n ( @nodes ) {
206 $self->set_state( $self->node_position( $n ), $n );
207 }
208}
209
2ceca8c3 210=item B<state>
a25d4374 211
2ceca8c3 212my $answer = $positions->state( $position_id )
213
214For the given position ID, returns the node (if any) that stands at
215the lemma. If no node should stand as lemma at this position, returns
2160; if no decision has been made for this position, returns undef.
217
218=cut
219
220sub state {
221 my( $self, $pos ) = @_;
222 return $self->{'position_state'}->{ $pos };
a25d4374 223}
224
2ceca8c3 225=item B<set_state>
226
227$positions->set_state( $position_id, $state )
228
229For the given position ID, sets the lemma (if any). State can be the
230name of a node, 0 (for cases when no lemma should stand), or undef
231(for cases when no decision has been made).
232
233=cut
234
235sub set_state {
236 my( $self, $pos, $state ) = @_;
237 $self->{'position_state'}->{ $pos } = $state;
a25d4374 238}
239
2ceca8c3 240=back
241
242=head2 Comparison function
243
244=over
245
246=item B<by_position>
247
248my @nodelist = sort $positions->by_position @nodelist;
249
250For use in the 'sort' function. Returns a comparison value based on
251the position of the given nodes.
252
253=cut
a25d4374 254
255# Compares two nodes according to their positions in the witness
256# index hash.
257sub by_position {
258 my $self = shift;
259 return _cmp_position( $a, $b );
260}
261
262# Takes two position strings (X,Y) and sorts them.
263sub _cmp_position {
264 my( $a, $b ) = @_;
265 my @pos_a = split(/,/, $a );
266 my @pos_b = split(/,/, $b );
267
268 my $big_cmp = $pos_a[0] <=> $pos_b[0];
269 return $big_cmp if $big_cmp;
270 # else
271 return $pos_a[1] <=> $pos_b[1];
272}
273
2ceca8c3 274
275#### HELPER FUNCTIONS ####
276
277# At some point I may find myself using scalar references for the node
278# positions, in order to keep them easily in sync. Just in case, I will
279# calculate this every time I need it.
280sub _calc_positions {
281 my $self = shift;
282 return _invert_hash( $self->{'node_positions'} )
283}
284
285# Helper for dealing with node refs
286sub _name {
287 my( $node ) = @_;
288 # We work with node names in this library
289 if( ref( $node ) && ref( $node ) eq 'Graph::Easy::Node' ) {
290 $node = $node->name();
291 }
292 return $node;
293}
294
a25d4374 295# Useful helper. Will be especially useful if I find myself using
296# scalar references for the positions after all - it can dereference
297# them here.
298sub _invert_hash {
299 my ( $hash, $plaintext_keys ) = @_;
300 my %new_hash;
301 foreach my $key ( keys %$hash ) {
302 my $val = $hash->{$key};
303 my $valkey = $val;
304 if( $plaintext_keys
305 && ref( $val ) ) {
306 $valkey = $plaintext_keys->{ scalar( $val ) };
307 warn( "No plaintext value given for $val" ) unless $valkey;
308 }
309 if( exists ( $new_hash{$valkey} ) ) {
310 push( @{$new_hash{$valkey}}, $key );
311 } else {
312 $new_hash{$valkey} = [ $key ];
313 }
314 }
315 return \%new_hash;
316}
317
2ceca8c3 318=back
319
320=head1 LICENSE
321
322This package is free software and is provided "as is" without express
323or implied warranty. You can redistribute it and/or modify it under
324the same terms as Perl itself.
325
326=head1 AUTHOR
327
328Tara L Andrews, aurum@cpan.org
329
330=cut
331
a25d4374 3321;