Commit | Line | Data |
dd3b58b0 |
1 | package Text::Tradition::Collation; |
d047cd52 |
2 | |
3 | use Graph::Easy; |
8e1394aa |
4 | use IPC::Run qw( run binary ); |
8e1394aa |
5 | use Text::Tradition::Collation::Reading; |
dd3b58b0 |
6 | use Moose; |
7 | |
8 | has 'graph' => ( |
d047cd52 |
9 | is => 'ro', |
10 | isa => 'Graph::Easy', |
11 | handles => { |
8e1394aa |
12 | add_reading => 'add_node', |
13 | del_reading => 'del_node', |
14 | add_path => 'add_edge', |
15 | del_path => 'del_edge', |
16 | reading => 'node', |
17 | path => 'edge', |
18 | readings => 'nodes', |
19 | paths => 'edges', |
d047cd52 |
20 | }, |
21 | default => sub { Graph::Easy->new( undirected => 0 ) }, |
22 | ); |
784877d9 |
23 | |
dd3b58b0 |
24 | |
dd3b58b0 |
25 | has 'tradition' => ( |
8e1394aa |
26 | is => 'rw', |
d047cd52 |
27 | isa => 'Text::Tradition', |
28 | ); |
dd3b58b0 |
29 | |
8e1394aa |
30 | has 'svg' => ( |
31 | is => 'ro', |
32 | isa => 'Str', |
33 | writer => '_save_svg', |
34 | predicate => 'has_svg', |
35 | ); |
36 | |
37 | has 'graphviz' => ( |
38 | is => 'ro', |
39 | isa => 'Str', |
40 | writer => '_save_graphviz', |
41 | predicate => 'has_graphviz', |
42 | ); |
43 | |
44 | has 'graphml' => ( |
45 | is => 'ro', |
46 | isa => 'XML::LibXML::Document', |
47 | writer => '_save_graphml', |
48 | predicate => 'has_graphml', |
49 | ); |
50 | |
4a8828f0 |
51 | has 'wit_list_separator' => ( |
52 | is => 'rw', |
53 | isa => 'Str', |
54 | default => ', ', |
55 | ); |
56 | |
dd3b58b0 |
57 | # The collation can be created two ways: |
58 | # 1. Collate a set of witnesses (with CollateX I guess) and process |
59 | # the results as in 2. |
60 | # 2. Read a pre-prepared collation in one of a variety of formats, |
61 | # and make the graph from that. |
62 | |
63 | # The graph itself will (for now) be immutable, and the positions |
64 | # within the graph will also be immutable. We need to calculate those |
65 | # positions upon graph construction. The equivalences between graph |
66 | # nodes will be mutable, entirely determined by the user (or possibly |
67 | # by some semantic pre-processing provided by the user.) So the |
68 | # constructor should just make an empty equivalences object. The |
69 | # constructor will also need to make the witness objects, if we didn't |
70 | # come through option 1. |
71 | |
d047cd52 |
72 | sub BUILD { |
73 | my( $self, $args ) = @_; |
8e1394aa |
74 | $self->graph->use_class('node', 'Text::Tradition::Collation::Reading'); |
d047cd52 |
75 | |
4a8828f0 |
76 | # Pass through any graph-specific options. |
77 | my $shape = exists( $args->{'shape'} ) ? $args->{'shape'} : 'ellipse'; |
78 | $self->graph->set_attribute( 'node', 'shape', $shape ); |
d047cd52 |
79 | } |
784877d9 |
80 | |
81 | # Wrappers around some methods |
82 | |
83 | sub merge_readings { |
84 | my $self = shift; |
85 | my $first_node = shift; |
86 | my $second_node = shift; |
87 | $first_node->merge_from( $second_node ); |
88 | unshift( @_, $first_node, $second_node ); |
89 | return $self->graph->merge_nodes( @_ ); |
90 | } |
91 | |
8e1394aa |
92 | =head2 Output method(s) |
93 | |
94 | =over |
95 | |
96 | =item B<as_svg> |
97 | |
98 | print $graph->as_svg( $recalculate ); |
99 | |
100 | Returns an SVG string that represents the graph. Uses GraphViz to do |
4a8828f0 |
101 | this, because Graph::Easy doesn\'t cope well with long graphs. Unless |
8e1394aa |
102 | $recalculate is passed (and is a true value), the method will return a |
103 | cached copy of the SVG after the first call to the method. |
104 | |
105 | =cut |
106 | |
107 | sub as_svg { |
108 | my( $self, $recalc ) = @_; |
109 | return $self->svg if $self->has_svg; |
110 | |
111 | $self->_save_graphviz( $self->graph->as_graphviz() ) |
112 | unless( $self->has_graphviz && !$recalc ); |
113 | |
114 | my @cmd = qw/dot -Tsvg/; |
115 | my( $svg, $err ); |
116 | my $in = $self->graphviz; |
117 | run( \@cmd, \$in, ">", binary(), \$svg ); |
118 | $self->{'svg'} = $svg; |
119 | return $svg; |
120 | } |
121 | |
122 | =item B<as_graphml> |
123 | |
124 | print $graph->as_graphml( $recalculate ) |
125 | |
126 | Returns a GraphML representation of the collation graph, with |
127 | transposition information and position information. Unless |
128 | $recalculate is passed (and is a true value), the method will return a |
129 | cached copy of the SVG after the first call to the method. |
130 | |
131 | =cut |
132 | |
133 | sub as_graphml { |
134 | my( $self, $recalc ) = @_; |
135 | return $self->graphml if $self->has_graphml; |
136 | |
137 | # Some namespaces |
138 | my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns'; |
139 | my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance'; |
140 | my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' . |
141 | 'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd'; |
142 | |
143 | # Create the document and root node |
144 | my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" ); |
145 | my $root = $graphml->createElementNS( $graphml_ns, 'graphml' ); |
146 | $graphml->setDocumentElement( $root ); |
147 | $root->setNamespace( $xsi_ns, 'xsi', 0 ); |
148 | $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema ); |
149 | |
150 | # Add the data keys for nodes |
151 | my @node_data = ( 'name', 'token', 'identical', 'position' ); |
152 | foreach my $ndi ( 0 .. $#node_data ) { |
153 | my $key = $root->addNewChild( $graphml_ns, 'key' ); |
154 | $key->setAttribute( 'attr.name', $node_data[$ndi] ); |
155 | $key->setAttribute( 'attr.type', 'string' ); |
156 | $key->setAttribute( 'for', 'node' ); |
157 | $key->setAttribute( 'id', 'd'.$ndi ); |
158 | } |
159 | |
160 | # Add the data keys for edges |
161 | my %wit_hash; |
162 | my $wit_ctr = 0; |
163 | foreach my $wit ( $self->getWitnessList ) { |
164 | my $wit_key = 'w' . $wit_ctr++; |
165 | $wit_hash{$wit} = $wit_key; |
166 | my $key = $root->addNewChild( $graphml_ns, 'key' ); |
167 | $key->setAttribute( 'attr.name', $wit ); |
168 | $key->setAttribute( 'attr.type', 'string' ); |
169 | $key->setAttribute( 'for', 'edge' ); |
170 | $key->setAttribute( 'id', $wit_key ); |
171 | } |
172 | |
173 | # Add the graph, its nodes, and its edges |
174 | my $graph = $root->addNewChild( $graphml_ns, 'graph' ); |
175 | $graph->setAttribute( 'edgedefault', 'directed' ); |
176 | $graph->setAttribute( 'id', 'g0' ); # TODO make this meaningful |
177 | $graph->setAttribute( 'parse.edgeids', 'canonical' ); |
178 | $graph->setAttribute( 'parse.edges', $self->edges() ); |
179 | $graph->setAttribute( 'parse.nodeids', 'canonical' ); |
180 | $graph->setAttribute( 'parse.nodes', $self->nodes() ); |
181 | $graph->setAttribute( 'parse.order', 'nodesfirst' ); |
182 | |
183 | my $node_ctr = 0; |
184 | my %node_hash; |
185 | foreach my $n ( $self->readings ) { |
186 | my %this_node_data = (); |
187 | foreach my $ndi ( 0 .. $#node_data ) { |
188 | my $value; |
189 | $this_node_data{'d'.$ndi} = $n->name if $node_data[$ndi] eq 'name'; |
190 | $this_node_data{'d'.$ndi} = $n->label |
191 | if $node_data[$ndi] eq 'token'; |
192 | $this_node_data{'d'.$ndi} = $n->primary->name if $n->has_primary; |
193 | $this_node_data{'d'.$ndi} = |
194 | $self->{'positions'}->node_position( $n ) |
195 | if $node_data[$ndi] eq 'position'; |
196 | } |
197 | my $node_el = $graph->addNewChild( $graphml_ns, 'node' ); |
198 | my $node_xmlid = 'n' . $node_ctr++; |
199 | $node_hash{ $n->name } = $node_xmlid; |
200 | $node_el->setAttribute( 'id', $node_xmlid ); |
201 | |
202 | foreach my $dk ( keys %this_node_data ) { |
203 | my $d_el = $node_el->addNewChild( $graphml_ns, 'data' ); |
204 | $d_el->setAttribute( 'key', $dk ); |
205 | $d_el->appendTextChild( $this_node_data{$dk} ); |
206 | } |
207 | } |
208 | |
209 | foreach my $e ( $self->edges() ) { |
210 | my( $name, $from, $to ) = ( $e->name, |
211 | $node_hash{ $e->from()->name() }, |
212 | $node_hash{ $e->to()->name() } ); |
213 | my $edge_el = $graph->addNewChild( $graphml_ns, 'edge' ); |
214 | $edge_el->setAttribute( 'source', $from ); |
215 | $edge_el->setAttribute( 'target', $to ); |
216 | $edge_el->setAttribute( 'id', $name ); |
217 | # TODO Got to add the witnesses |
218 | } |
219 | |
220 | # Return the thing |
221 | $self->_save_graphml( $graphml ); |
222 | return $graphml; |
223 | } |
224 | |
225 | =back |
226 | |
227 | =item B<start> |
228 | |
229 | my $beginning = $collation->start(); |
230 | |
231 | Returns the beginning of the collation, a meta-reading with label '#START#'. |
232 | |
233 | =cut |
234 | |
235 | sub start { |
4a8828f0 |
236 | # Return the beginning reading of the graph. |
8e1394aa |
237 | my $self = shift; |
238 | my( $new_start ) = @_; |
239 | if( $new_start ) { |
240 | $self->del_reading( '#START#' ); |
241 | $self->graph->rename_node( $new_start, '#START#' ); |
242 | } |
243 | return $self->reading('#START#'); |
244 | } |
245 | |
4a8828f0 |
246 | =item B<next_reading> |
8e1394aa |
247 | |
4a8828f0 |
248 | my $next_reading = $graph->next_reading( $reading, $witpath ); |
8e1394aa |
249 | |
4a8828f0 |
250 | Returns the reading that follows the given reading along the given witness |
8e1394aa |
251 | path. TODO These are badly named. |
252 | |
253 | =cut |
254 | |
4a8828f0 |
255 | sub next_reading { |
8e1394aa |
256 | # Return the successor via the corresponding edge. |
257 | my $self = shift; |
4a8828f0 |
258 | return $self->_find_linked_reading( 'next', @_ ); |
8e1394aa |
259 | } |
260 | |
4a8828f0 |
261 | =item B<prior_reading> |
8e1394aa |
262 | |
4a8828f0 |
263 | my $prior_reading = $graph->prior_reading( $reading, $witpath ); |
8e1394aa |
264 | |
4a8828f0 |
265 | Returns the reading that precedes the given reading along the given witness |
8e1394aa |
266 | path. TODO These are badly named. |
267 | |
268 | =cut |
269 | |
4a8828f0 |
270 | sub prior_reading { |
8e1394aa |
271 | # Return the predecessor via the corresponding edge. |
272 | my $self = shift; |
4a8828f0 |
273 | return $self->_find_linked_reading( 'prior', @_ ); |
8e1394aa |
274 | } |
275 | |
4a8828f0 |
276 | sub _find_linked_reading { |
8e1394aa |
277 | my( $self, $direction, $node, $edge ) = @_; |
278 | $edge = 'base text' unless $edge; |
279 | my @linked_edges = $direction eq 'next' |
280 | ? $node->outgoing() : $node->incoming(); |
281 | return undef unless scalar( @linked_edges ); |
282 | |
283 | # We have to find the linked edge that contains all of the |
284 | # witnesses supplied in $edge. |
4a8828f0 |
285 | my @edge_wits = $self->witnesses_of_label( $edge ); |
8e1394aa |
286 | foreach my $le ( @linked_edges ) { |
4a8828f0 |
287 | my @le_wits = $self->witnesses_of_label( $le->name ); |
8e1394aa |
288 | if( _is_within( \@edge_wits, \@le_wits ) ) { |
289 | # This is the right edge. |
290 | return $direction eq 'next' ? $le->to() : $le->from(); |
291 | } |
292 | } |
293 | warn "Could not find $direction node from " . $node->label |
294 | . " along edge $edge"; |
295 | return undef; |
296 | } |
297 | |
4a8828f0 |
298 | # Some set logic. |
299 | sub _is_within { |
300 | my( $set1, $set2 ) = @_; |
301 | my $ret = 1; |
302 | foreach my $el ( @$set1 ) { |
303 | $ret = 0 unless grep { /^\Q$el\E$/ } @$set2; |
304 | } |
305 | return $ret; |
306 | } |
307 | |
308 | # Walk the paths for each witness in the graph, and return the nodes |
309 | # that the graph has in common. |
310 | |
311 | sub walk_witness_paths { |
312 | my( $self, $end ) = @_; |
313 | # For each witness, walk the path through the graph. |
314 | # Then we need to find the common nodes. |
315 | # TODO This method is going to fall down if we have a very gappy |
316 | # text in the collation. |
317 | my $paths = {}; |
318 | my @common_nodes; |
319 | foreach my $wit ( @{$self->tradition->witnesses} ) { |
320 | my $curr_reading = $self->start; |
321 | my @wit_path = ( $curr_reading ); |
322 | my %seen_readings; |
323 | # TODO Detect loops at some point |
324 | while( $curr_reading->name ne $end->name ) { |
325 | if( $seen_readings{$curr_reading->name} ) { |
326 | warn "Detected loop walking path for witness " . $wit->sigil |
327 | . " at reading " . $curr_reading->name; |
328 | last; |
329 | } |
330 | my $next_reading = $self->next_reading( $curr_reading, |
331 | $wit->sigil ); |
332 | push( @wit_path, $next_reading ); |
333 | $seen_readings{$curr_reading->name} = 1; |
334 | $curr_reading = $next_reading; |
335 | } |
336 | $wit->path( \@wit_path ); |
337 | if( @common_nodes ) { |
338 | my @cn; |
339 | foreach my $n ( @wit_path ) { |
340 | push( @cn, $n ) if grep { $_ eq $n } @common_nodes; |
341 | } |
342 | @common_nodes = (); |
343 | push( @common_nodes, @cn ); |
344 | } else { |
345 | push( @common_nodes, @wit_path ); |
346 | } |
347 | } |
348 | |
349 | # Mark all the nodes as either common or not. |
350 | foreach my $cn ( @common_nodes ) { |
351 | print STDERR "Setting " . $cn->name . " as common node\n"; |
352 | $cn->make_common; |
353 | } |
354 | foreach my $n ( $self->readings() ) { |
355 | $n->make_variant unless $n->is_common; |
356 | } |
357 | } |
358 | |
359 | sub common_readings { |
360 | my $self = shift; |
361 | my @common = grep { $_->is_common } $self->readings(); |
362 | return @common; |
363 | } |
364 | |
365 | # Calculate the relative positions of nodes in the graph, if they |
366 | # were not given to us. |
367 | sub calculate_positions { |
368 | my $self = shift; |
369 | |
370 | # We have to calculate the position identifiers for each word, |
371 | # keyed on the common nodes. This will be 'fun'. The end result |
372 | # is a hash per witness, whose key is the word node and whose |
373 | # value is its position in the text. Common nodes are always N,1 |
374 | # so have identical positions in each text. |
375 | my @common = $self->common_readings(); |
376 | |
377 | my $node_pos = {}; |
378 | foreach my $wit ( @{$self->tradition->witnesses} ) { |
379 | # First we walk each path, making a matrix for each witness that |
380 | # corresponds to its eventual position identifier. Common nodes |
381 | # always start a new row, and are thus always in the first column. |
382 | |
383 | my $wit_matrix = []; |
384 | my $cn = 0; # We should hit the common readings in order. |
385 | my $row = []; |
386 | foreach my $wn ( @{$wit->path} ) { |
387 | if( $wn eq $common[$cn] ) { |
388 | # Set up to look for the next common node, and |
389 | # start a new row of words. |
390 | $cn++; |
391 | push( @$wit_matrix, $row ) if scalar( @$row ); |
392 | $row = []; |
393 | } |
394 | push( @$row, $wn ); |
395 | } |
396 | push( @$wit_matrix, $row ); # Push the last row onto the matrix |
397 | |
398 | # Now we have a matrix per witness, so that each row in the |
399 | # matrix begins with a common node, and continues with all the |
400 | # variant words that appear in the witness. We turn this into |
401 | # real positions in row,cell format. But we need some |
402 | # trickery in order to make sure that each node gets assigned |
403 | # to only one position. |
404 | |
405 | foreach my $li ( 1..scalar(@$wit_matrix) ) { |
406 | foreach my $di ( 1..scalar(@{$wit_matrix->[$li-1]}) ) { |
407 | my $reading = $wit_matrix->[$li-1]->[$di-1]; |
408 | my $position = "$li,$di"; |
409 | # If we have seen this node before, we need to compare |
410 | # its position with what went before. |
411 | unless( $reading->has_position && |
412 | _cmp_position( $position, $reading->position ) < 1 ) { |
413 | # The new position ID replaces the old one. |
414 | $reading->position( $position ); |
415 | } # otherwise, the old position needs to stay. |
416 | } |
417 | } |
418 | } |
419 | } |
420 | |
421 | sub _cmp_position { |
422 | my( $a, $b ) = @_; |
423 | my @pos_a = split(/,/, $a ); |
424 | my @pos_b = split(/,/, $b ); |
8e1394aa |
425 | |
4a8828f0 |
426 | my $big_cmp = $pos_a[0] <=> $pos_b[0]; |
427 | return $big_cmp if $big_cmp; |
428 | # else |
429 | return $pos_a[1] <=> $pos_b[1]; |
8e1394aa |
430 | } |
4a8828f0 |
431 | |
432 | # Return the string that joins together a list of witnesses for |
433 | # display on a single path. |
434 | sub path_label { |
435 | my $self = shift; |
436 | return join( $self->wit_list_separator, @_ ); |
437 | } |
438 | |
439 | sub witnesses_of_label { |
440 | my $self = shift; |
441 | my $regex = $self->wit_list_separator; |
442 | return split( /^\Q$regex\E$/, @_ ); |
443 | } |
8e1394aa |
444 | |
dd3b58b0 |
445 | no Moose; |
446 | __PACKAGE__->meta->make_immutable; |