1 package Text::Tradition::Analysis;
6 use Encode qw/ encode_utf8 /;
8 use JSON qw/ encode_json decode_json /;
11 use Text::Tradition::Stemma;
13 use vars qw/ @EXPORT_OK /;
14 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
18 Text::Tradition::Analysis - functions for stemma analysis of a tradition
23 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
24 my $t = Text::Tradition->new(
25 'name' => 'this is a text',
27 'file' => '/path/to/tei_parallel_seg_file.xml' );
28 $t->add_stemma( 'dotfile' => $stemmafile );
30 my $variant_data = run_analysis( $tradition );
31 # Recalculate rank $n treating all orthographic variants as equivalent
32 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
36 Text::Tradition is a library for representation and analysis of collated
37 texts, particularly medieval ones. The Collation is the central feature of
38 a Tradition, where the text, its sequence of readings, and its relationships
39 between readings are actually kept.
43 =head2 run_analysis( $tradition, %opts )
45 Runs the analysis described in analyze_variant_location on every location in the
46 collation of the given tradition, with the given options. These include:
50 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
51 is 0 (i.e. the first).
53 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
55 =item * merge_types - Specify a list of relationship types, where related readings
56 should be treated as identical for the purposes of analysis.
63 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
65 my $datafile = 't/data/florilegium_tei_ps.xml';
66 my $tradition = Text::Tradition->new( 'input' => 'TEI',
68 'file' => $datafile );
69 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
70 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
72 my %expected_genealogical = (
103 my $data = run_analysis( $tradition );
104 foreach my $row ( @{$data->{'variants'}} ) {
105 is( $row->{'genealogical'}, $expected_genealogical{$row->{'id'}},
106 "Got correct genealogical flag for row " . $row->{'id'} );
108 is( $data->{'conflict_count'}, 16, "Got right conflict count" );
109 is( $data->{'variant_count'}, 28, "Got right total variant number" );
116 my( $tradition, %opts ) = @_;
117 my $c = $tradition->collation;
119 my $stemma_id = $opts{'stemma_id'} || 0;
120 my @ranks = @{$opts{'ranks'}} if ref( $opts{'ranks'} ) eq 'ARRAY';
121 my @collapse = @{$opts{'merge_types'}} if ref( $opts{'merge_types'} ) eq 'ARRAY';
124 my $stemma = $tradition->stemma( $stemma_id );
125 # Figure out which witnesses we are working with
126 my @lacunose = $stemma->hypotheticals;
127 my @tradition_wits = map { $_->sigil } $tradition->witnesses;
128 map { push( @tradition_wits, $_->sigil."_ac" ) if $_->is_layered }
129 $tradition->witnesses;
130 push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
132 # Find and mark 'common' ranks for exclusion, unless they were
133 # explicitly specified.
136 foreach my $rdg ( $tradition->collation->common_readings ) {
137 $common_rank{$rdg->rank} = 1;
139 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
142 # Group the variants to send to the solver
144 foreach my $rank ( @ranks ) {
145 push( @groups, group_variants( $tradition, $rank, \@lacunose, \@collapse ) );
149 my $answer = solve_variants( $stemma->editable( ' ' ), @groups );
152 # Do further analysis on the answer
153 foreach my $idx ( 0 .. $#ranks ) {
154 my $location = $answer->{'variants'}->[$idx];
155 # Add the rank back in
156 $location->{'id'} = $ranks[$idx];
157 # Run the extra analysis we need.
158 analyze_location( $tradition, $stemma->graph, $location );
164 =head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
166 Groups the variants at the given $rank of the collation, treating any
167 relationships in @merge_relationship_types as equivalent. $lacunose should
168 be a reference to an array, to which the sigla of lacunose witnesses at this
169 rank will be appended.
171 Returns two ordered lists $readings, $groups, where $readings->[$n] is attested
172 by the witnesses listed in $groups->[$n].
176 # Return group_readings, groups, lacunose
178 my( $tradition, $rank, $lacunose, $collapse ) = @_;
179 my $c = $tradition->collation;
180 my $aclabel = $c->ac_label;
181 # Get the alignment table readings
182 my %readings_at_rank;
184 foreach my $tablewit ( @{$tradition->collation->alignment_table->{'alignment'}} ) {
185 my $rdg = $tablewit->{'tokens'}->[$rank-1];
186 my $wit = $tablewit->{'witness'};
187 $wit =~ s/^(.*)\Q$aclabel\E$/${1}_ac/;
188 if( $rdg && $rdg->{'t'}->is_lacuna ) {
189 _add_to_witlist( $wit, $lacunose, '_ac' );
191 $readings_at_rank{$rdg->{'t'}->text} = $rdg->{'t'};
193 _add_to_witlist( $wit, \@gap_wits, '_ac' );
197 # Group the readings, collapsing groups by relationship if needed
198 my %grouped_readings;
199 foreach my $rdg ( sort { $b->witnesses <=> $a->witnesses } values %readings_at_rank ) {
200 # Skip readings that have been collapsed into others.
201 next if exists $grouped_readings{$rdg->id} && !$grouped_readings{$rdg->id};
202 my @wits = $rdg->witnesses;
203 map { s/\Q$aclabel\E$/_ac/ } @wits;
205 my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
206 foreach my $other ( $rdg->related_readings( $filter ) ) {
207 my @otherwits = $other->witnesses;
208 map { s/\Q$aclabel\E$/_ac/ } @otherwits;
209 push( @wits, @otherwits );
210 $grouped_readings{$other->id} = 0;
213 $grouped_readings{$rdg->id} = \@wits;
215 $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
216 # Get rid of our collapsed readings
217 map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
218 keys %grouped_readings
221 return \%grouped_readings;
224 =head2 solve_variants( $graph, @groups )
226 Sends the set of groups to the external graph solver service and returns
227 a cleaned-up answer, adding the rank IDs back where they belong.
229 The JSON has the form
230 { "graph": [ stemmagraph DOT string without newlines ],
231 "groupings": [ array of arrays of groups, one per rank ] }
233 The answer has the form
234 { "variants" => [ array of variant location structures ],
235 "variant_count" => total,
236 "conflict_count" => number of conflicts detected,
237 "genealogical_count" => number of solutions found }
242 my( $graph, @groups ) = @_;
244 # Make the json with stemma + groups
245 my $jsonstruct = { 'graph' => $graph, 'groupings' => [] };
246 foreach my $ghash ( @groups ) {
248 foreach my $k ( sort keys %$ghash ) {
249 push( @grouping, $ghash->{$k} );
251 push( @{$jsonstruct->{'groupings'}}, \@grouping );
253 my $json = encode_json( $jsonstruct );
255 # Send it off and get the result
256 my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
257 my $ua = LWP::UserAgent->new();
258 my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
259 'Content' => $json );
262 if( $resp->is_success ) {
263 $answer = decode_json( $resp->content );
265 # Fall back to the old method.
266 warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
267 . "; falling back to perl method";
268 $answer = perl_solver( $graph, @groups );
271 # Fold the result back into what we know about the groups.
273 my $genealogical = 0;
274 foreach my $idx ( 0 .. $#groups ) {
275 my( $calc_groups, $result ) = @{$answer->[$idx]};
276 $genealogical++ if $result;
277 my $input_group = $groups[$idx];
278 foreach my $k ( sort keys %$input_group ) {
279 my $cg = shift @$calc_groups;
280 $input_group->{$k} = $cg;
283 'genealogical' => $result,
286 foreach my $k ( keys %$input_group ) {
287 push( @{$vstruct->{'readings'}},
288 { 'readingid' => $k, 'group' => $input_group->{$k}} );
290 push( @$variants, $vstruct );
293 return { 'variants' => $variants,
294 'variant_count' => scalar @$variants,
295 'genealogical_count' => $genealogical };
298 =head2 analyze_location ( $tradition, $graph, $location_hash )
300 Given the tradition, its stemma graph, and the solution from the graph solver,
301 work out the rest of the information we want. For each reading we need missing,
302 conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown. Alters the location_hash in place.
306 sub analyze_location {
307 my ( $tradition, $graph, $variant_row ) = @_;
309 # Make a hash of all known node memberships, and make the subgraphs.
311 my $reading_roots = {};
313 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
314 my $rid = $rdghash->{'readingid'};
315 map { $contig->{$_} = $rid } @{$rdghash->{'group'}};
318 my $part = $graph->copy;
320 map { $these_vertices{$_} = 1 } @{$rdghash->{'group'}};
321 $part->delete_vertices( grep { !$these_vertices{$_} } $part->vertices );
322 $subgraph->{$rid} = $part;
323 # Get the reading roots.
324 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
327 # Now that we have all the node group memberships, calculate followed/
328 # non-followed/unknown values for each reading. Also figure out the
329 # reading's evident parent(s).
330 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
331 # Group string key - TODO do we need this?
332 my $gst = wit_stringify( $rdghash->{'group'} );
333 my $rid = $rdghash->{'readingid'};
335 my $part = $subgraph->{$rid};
337 # Start figuring things out.
338 my @roots = $part->predecessorless_vertices;
339 $rdghash->{'independent_occurrence'} = scalar @roots;
340 $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
341 # Find the parent readings, if any, of this reading.
344 foreach my $wit ( @roots ) {
345 # Look in the main stemma to find this witness's extant or known-reading
346 # immediate ancestor(s), and look up the reading that each ancestor olds.
347 my @check = $graph->predecessors( $wit );
350 foreach my $wparent( @check ) {
351 my $preading = $contig->{$wparent};
353 $rdgparents{$preading} = 1;
355 push( @next, $graph->predecessors( $wparent ) );
361 $rdghash->{'reading_parents'} = [ keys %rdgparents ];
363 # Find the number of times this reading was altered, and the number of
364 # times we're not sure.
365 my( %nofollow, %unknownfollow );
366 foreach my $wit ( $part->vertices ) {
367 foreach my $wchild ( $graph->successors( $wit ) ) {
368 next if $part->has_vertex( $wchild );
369 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
370 # It definitely changed here.
371 $nofollow{$wchild} = 1;
372 } elsif( !($contig->{$wchild}) ) {
373 # The child is a hypothetical node not definitely in
374 # any group. Answer is unknown.
375 $unknownfollow{$wchild} = 1;
376 } # else it's a non-root node in a known group, and therefore
377 # is presumed to have its reading from its group, not this link.
380 $rdghash->{'not_followed'} = keys %nofollow;
381 $rdghash->{'follow_unknown'} = keys %unknownfollow;
383 # Now say whether this reading represents a conflict.
384 unless( $variant_row->{'genealogical'} ) {
385 $rdghash->{'conflict'} = @roots != 1;
391 =head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
393 ** NOTE ** This method should hopefully not be called - it is not guaranteed
394 to be correct. Serves as a backup for the real solver.
396 Runs an analysis of the given tradition, at the location given in $rank,
397 against the graph of the stemma specified in $stemma_id. The argument
398 @merge_relationship_types is an optional list of relationship types for
399 which readings so related should be treated as equivalent.
401 Returns a nested array data structure as follows:
403 [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
405 where the group list is the array of arrays passed in for each element of @groups,
406 possibly with the addition of hypothetical readings.
412 my( $graph, @groups ) = @_;
414 warn "Not implemented yet";
424 # my $variant_row = { 'id' => $rank, 'readings' => [] };
425 # # Mark each ms as in its own group, first.
426 # foreach my $g ( @$groups ) {
427 # my $gst = wit_stringify( $g );
428 # map { $contig->{$_} = $gst } @$g;
430 # # Now for each unmarked node in the graph, initialize an array
431 # # for possible group memberships. We will use this later to
432 # # resolve potential conflicts.
433 # map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
434 # foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
435 # my $gst = wit_stringify( $g ); # This is the group name
436 # # Copy the graph, and delete all non-members from the new graph.
437 # my $part = $graph->copy;
439 # $part->delete_vertices(
440 # grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
442 # # Now look to see if our group is connected.
443 # if( $undirected ) { # For use with distance trees etc.
444 # # Find all vertices reachable from the first (arbitrary) group
445 # # member. If we are genealogical this should include them all.
446 # my $reachable = {};
447 # map { $reachable->{$_} = 1 } $part->all_reachable( $g->[0] );
448 # # TODO This is a terrible way to do distance trees, since all
449 # # non-leaf nodes are included in every graph part now. We may
450 # # have to go back to SPDP.
453 # # We have to take directionality into account.
454 # # How many root nodes do we have?
455 # my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
456 # $part->predecessorless_vertices;
457 # # Assuming that @$g > 1, find the first root node that has at
458 # # least one successor belonging to our group. If this reading
459 # # is genealogical, there should be only one, but we will check
460 # # that implicitly later.
461 # foreach my $root ( @roots ) {
462 # # Prune the tree to get rid of extraneous hypotheticals.
463 # $root = _prune_subtree( $part, $root, $contig );
465 # # Save this root for our group.
466 # push( @group_roots, $root );
467 # # Get all the successor nodes of our root.
470 # # Dispense with the trivial case of one reading.
472 # @group_roots = ( $wit );
473 # foreach my $v ( $part->vertices ) {
474 # $part->delete_vertex( $v ) unless $v eq $wit;
479 # map { $reading_roots{$_} = 1 } @group_roots;
480 # if( @group_roots > 1 ) {
481 # $conflict->{$group_readings->{$gst}} = 1;
482 # $is_conflicted = 1;
484 # # Paint the 'hypotheticals' with our group.
485 # foreach my $wit ( $part->vertices ) {
486 # if( ref( $contig->{$wit} ) ) {
487 # push( @{$contig->{$wit}}, $gst );
488 # } elsif( $contig->{$wit} ne $gst ) {
489 # warn "How did we get here?";
494 # # Start to write the reading, and save the group subgraph.
495 # my $reading = { 'readingid' => $group_readings->{$gst},
496 # 'missing' => wit_stringify( \@lacunose ),
497 # 'group' => $gst }; # This will change if we find no conflict
498 # # Save the relevant subgraph.
499 # $subgraph->{$gst} = $part;
500 # push( @{$variant_row->{'readings'}}, $reading );
503 # # For each of our hypothetical readings, flatten its 'contig' array if
504 # # the array contains zero or one group. If we have any unflattened arrays,
505 # # we may need to run the resolution process. If the reading is already known
506 # # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
509 # foreach my $wit ( keys %$contig ) {
510 # next unless ref( $contig->{$wit} );
511 # if( @{$contig->{$wit}} > 1 ) {
512 # if( $is_conflicted ) {
513 # $contig->{$wit} = ''; # We aren't going to decide.
515 # push( @resolve, $wit );
518 # my $gst = pop @{$contig->{$wit}};
519 # $contig->{$wit} = $gst || '';
524 # my $still_contig = {};
525 # foreach my $h ( @resolve ) {
526 # # For each of the hypothetical readings with more than one possibility,
527 # # try deleting it from each of its member subgraphs in turn, and see
528 # # if that breaks the contiguous grouping.
529 # # TODO This can still break in a corner case where group A can use
530 # # either vertex 1 or 2, and group B can use either vertex 2 or 1.
531 # # Revisit this if necessary; it could get brute-force nasty.
532 # foreach my $gst ( @{$contig->{$h}} ) {
533 # my $gpart = $subgraph->{$gst}->copy();
534 # # If we have come this far, there is only one root and everything
535 # # is reachable from it.
536 # my( $root ) = $gpart->predecessorless_vertices;
537 # my $reachable = {};
538 # map { $reachable->{$_} = 1 } $gpart->vertices;
540 # # Try deleting the hypothetical node.
541 # $gpart->delete_vertex( $h );
542 # if( $h eq $root ) {
543 # # See if we still have a single root.
544 # my @roots = $gpart->predecessorless_vertices;
545 # warn "This shouldn't have happened" unless @roots;
547 # # $h is needed by this group.
548 # if( exists( $still_contig->{$h} ) ) {
550 # $conflict->{$group_readings->{$gst}} = 1;
551 # $still_contig->{$h} = '';
553 # $still_contig->{$h} = $gst;
557 # # $h is somewhere in the middle. See if everything
558 # # else can still be reached from the root.
559 # my %still_reachable = ( $root => 1 );
560 # map { $still_reachable{$_} = 1 }
561 # $gpart->all_successors( $root );
562 # foreach my $v ( keys %$reachable ) {
564 # if( !$still_reachable{$v}
565 # && ( $contig->{$v} eq $gst
566 # || ( exists $still_contig->{$v}
567 # && $still_contig->{$v} eq $gst ) ) ) {
569 # if( exists $still_contig->{$h} ) {
571 # $conflict->{$group_readings->{$gst}} = 1;
572 # $still_contig->{$h} = '';
574 # $still_contig->{$h} = $gst;
577 # } # else we don't need $h in this group.
579 # } # endif $h eq $root
580 # } # end foreach $gst
583 # # Now we have some hypothetical vertices in $still_contig that are the
584 # # "real" group memberships. Replace these in $contig.
585 # foreach my $v ( keys %$contig ) {
586 # next unless ref $contig->{$v};
587 # $contig->{$v} = $still_contig->{$v};
589 # } # end if @resolve
592 # $variant_row->{'genealogical'} = !( keys %$conflict );
593 # return $variant_row;
597 my( $tree, $root, $contighash ) = @_;
598 # First, delete hypothetical leaves / orphans until there are none left.
599 my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
600 $tree->successorless_vertices;
601 while( @orphan_hypotheticals ) {
602 $tree->delete_vertices( @orphan_hypotheticals );
603 @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
604 $tree->successorless_vertices;
606 # Then delete a hypothetical root with only one successor, moving the
607 # root to the first child that has no other predecessors.
608 while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
609 my @nextroot = $tree->successors( $root );
610 $tree->delete_vertex( $root );
611 ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
613 # The tree has been modified in place, but we need to know the new root.
614 $root = undef unless $root && $tree->has_vertex( $root );
617 # Add the variant, subject to a.c. representation logic.
618 # This assumes that we will see the 'main' version before the a.c. version.
619 sub add_variant_wit {
620 my( $arr, $wit, $acstr ) = @_;
622 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
624 $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
626 push( @$arr, $wit ) unless $skip;
629 sub _useful_variant {
630 my( $group_readings, $graph, $acstr ) = @_;
632 # TODO Decide what to do with AC witnesses
634 # Sort by group size and return
636 my( @readings, @groups ); # The sorted groups for our answer.
637 foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
638 keys %$group_readings ) {
639 push( @readings, $rdg );
640 push( @groups, $group_readings->{$rdg} );
641 if( @{$group_readings->{$rdg}} > 1 ) {
644 my( $wit ) = @{$group_readings->{$rdg}};
645 $wit =~ s/^(.*)\Q$acstr\E$/$1/;
646 $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
649 if( $is_useful > 1 ) {
650 return( \@readings, \@groups );
656 =head2 wit_stringify( $groups )
658 Takes an array of witness groupings and produces a string like
659 ['A','B'] / ['C','D','E'] / ['F']
666 # If we were passed an array of witnesses instead of an array of
667 # groupings, then "group" the witnesses first.
668 unless( ref( $groups->[0] ) ) {
669 my $mkgrp = [ $groups ];
672 foreach my $g ( @$groups ) {
673 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
675 return join( ' / ', @gst );
678 # Helper function to ensure that X and X a.c. never appear in the same list.
679 sub _add_to_witlist {
680 my( $wit, $list, $acstr ) = @_;
683 map { $inlist{$_} = $idx++ } @$list;
684 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
686 unless( exists $inlist{$acwit} ) {
687 push( @$list, $acwit.$acstr );
690 if( exists( $inlist{$wit.$acstr} ) ) {
691 # Replace the a.c. version with the main witness
692 my $i = $inlist{$wit.$acstr};
695 push( @$list, $wit );
701 my( $lista, $listb ) = @_;
704 map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
705 map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
706 my @set = grep { $union{$_} == 1 } keys %union;
707 return map { $scalars{$_} } @set;
714 This package is free software and is provided "as is" without express
715 or implied warranty. You can redistribute it and/or modify it under
716 the same terms as Perl itself.
720 Tara L Andrews E<lt>aurum@cpan.orgE<gt>