1 package Text::Tradition::Analysis;
6 use Encode qw/ encode_utf8 /;
8 use JSON qw/ encode_json decode_json /;
11 use Text::Tradition::Stemma;
13 use vars qw/ @EXPORT_OK /;
14 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
18 Text::Tradition::Analysis - functions for stemma analysis of a tradition
23 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
24 my $t = Text::Tradition->new(
25 'name' => 'this is a text',
27 'file' => '/path/to/tei_parallel_seg_file.xml' );
28 $t->add_stemma( 'dotfile' => $stemmafile );
30 my $variant_data = run_analysis( $tradition );
31 # Recalculate rank $n treating all orthographic variants as equivalent
32 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
36 Text::Tradition is a library for representation and analysis of collated
37 texts, particularly medieval ones. The Collation is the central feature of
38 a Tradition, where the text, its sequence of readings, and its relationships
39 between readings are actually kept.
43 =head2 run_analysis( $tradition, %opts )
45 Runs the analysis described in analyze_variant_location on every location in the
46 collation of the given tradition, with the given options. These include:
50 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
51 is 0 (i.e. the first).
53 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
55 =item * merge_types - Specify a list of relationship types, where related readings
56 should be treated as identical for the purposes of analysis.
63 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
65 my $datafile = 't/data/florilegium_tei_ps.xml';
66 my $tradition = Text::Tradition->new( 'input' => 'TEI',
68 'file' => $datafile );
69 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
70 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
72 my %expected_genealogical = (
103 my $data = run_analysis( $tradition );
104 foreach my $row ( @{$data->{'variants'}} ) {
105 # Account for rows that used to be "not useful"
106 unless( exists $expected_genealogical{$row->{'id'}} ) {
107 $expected_genealogical{$row->{'id'}} = 1;
109 is( $row->{'genealogical'}, $expected_genealogical{$row->{'id'}},
110 "Got correct genealogical flag for row " . $row->{'id'} );
112 is( $data->{'conflict_count'}, 34, "Got right conflict count" );
113 is( $data->{'variant_count'}, 58, "Got right total variant number" );
120 my( $tradition, %opts ) = @_;
121 my $c = $tradition->collation;
123 my $stemma_id = $opts{'stemma_id'} || 0;
124 my @ranks = @{$opts{'ranks'}} if ref( $opts{'ranks'} ) eq 'ARRAY';
125 my @collapse = @{$opts{'merge_types'}} if ref( $opts{'merge_types'} ) eq 'ARRAY';
128 my $stemma = $tradition->stemma( $stemma_id );
129 # Figure out which witnesses we are working with
130 my @lacunose = $stemma->hypotheticals;
131 my @tradition_wits = map { $_->sigil } $tradition->witnesses;
132 map { push( @tradition_wits, $_->sigil."_ac" ) if $_->is_layered }
133 $tradition->witnesses;
134 push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
136 # Find and mark 'common' ranks for exclusion, unless they were
137 # explicitly specified.
140 foreach my $rdg ( $c->common_readings ) {
141 $common_rank{$rdg->rank} = 1;
143 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
146 # Group the variants to send to the solver
149 foreach my $rank ( @ranks ) {
150 my $missing = [ @lacunose ];
151 push( @groups, group_variants( $tradition, $rank, $missing, \@collapse ) );
152 $lacunae{$rank} = $missing;
156 my $answer = solve_variants( $stemma->editable( ' ' ), @groups );
158 # Do further analysis on the answer
159 my $conflict_count = 0;
160 foreach my $idx ( 0 .. $#ranks ) {
161 my $location = $answer->{'variants'}->[$idx];
162 # Add the rank back in
163 $location->{'id'} = $ranks[$idx];
164 # Add the lacunae back in
165 $location->{'missing'} = $lacunae{$ranks[$idx]};
166 # Run the extra analysis we need.
167 analyze_location( $tradition, $stemma->graph, $location );
168 # Add the reading text back in
169 foreach my $rdghash ( @{$location->{'readings'}} ) {
171 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
172 my $rdg = $c->reading( $rdghash->{'readingid'} );
173 $rdghash->{'text'} = $rdg ? $rdg->text : $rdghash->{'readingid'};
176 $answer->{'conflict_count'} = $conflict_count;
181 =head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
183 Groups the variants at the given $rank of the collation, treating any
184 relationships in @merge_relationship_types as equivalent. $lacunose should
185 be a reference to an array, to which the sigla of lacunose witnesses at this
186 rank will be appended.
188 Returns two ordered lists $readings, $groups, where $readings->[$n] is attested
189 by the witnesses listed in $groups->[$n].
193 # Return group_readings, groups, lacunose
195 my( $tradition, $rank, $lacunose, $collapse ) = @_;
196 my $c = $tradition->collation;
197 my $aclabel = $c->ac_label;
198 # Get the alignment table readings
199 my %readings_at_rank;
201 foreach my $tablewit ( @{$tradition->collation->alignment_table->{'alignment'}} ) {
202 my $rdg = $tablewit->{'tokens'}->[$rank-1];
203 my $wit = $tablewit->{'witness'};
204 $wit =~ s/^(.*)\Q$aclabel\E$/${1}_ac/;
205 if( $rdg && $rdg->{'t'}->is_lacuna ) {
206 _add_to_witlist( $wit, $lacunose, '_ac' );
208 $readings_at_rank{$rdg->{'t'}->text} = $rdg->{'t'};
210 _add_to_witlist( $wit, \@gap_wits, '_ac' );
214 # Group the readings, collapsing groups by relationship if needed
215 my %grouped_readings;
216 foreach my $rdg ( sort { $b->witnesses <=> $a->witnesses } values %readings_at_rank ) {
217 # Skip readings that have been collapsed into others.
218 next if exists $grouped_readings{$rdg->id} && !$grouped_readings{$rdg->id};
219 my @wits = $rdg->witnesses;
220 map { s/\Q$aclabel\E$/_ac/ } @wits;
222 my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
223 foreach my $other ( $rdg->related_readings( $filter ) ) {
224 my @otherwits = $other->witnesses;
225 map { s/\Q$aclabel\E$/_ac/ } @otherwits;
226 push( @wits, @otherwits );
227 $grouped_readings{$other->id} = 0;
230 $grouped_readings{$rdg->id} = \@wits;
232 $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits;
233 # Get rid of our collapsed readings
234 map { delete $grouped_readings{$_} unless $grouped_readings{$_} }
235 keys %grouped_readings
238 return \%grouped_readings;
241 =head2 solve_variants( $graph, @groups )
243 Sends the set of groups to the external graph solver service and returns
244 a cleaned-up answer, adding the rank IDs back where they belong.
246 The JSON has the form
247 { "graph": [ stemmagraph DOT string without newlines ],
248 "groupings": [ array of arrays of groups, one per rank ] }
250 The answer has the form
251 { "variants" => [ array of variant location structures ],
252 "variant_count" => total,
253 "conflict_count" => number of conflicts detected,
254 "genealogical_count" => number of solutions found }
259 my( $graph, @groups ) = @_;
261 # Make the json with stemma + groups
262 my $jsonstruct = { 'graph' => $graph, 'groupings' => [] };
263 foreach my $ghash ( @groups ) {
265 foreach my $k ( sort keys %$ghash ) {
266 push( @grouping, $ghash->{$k} );
268 push( @{$jsonstruct->{'groupings'}}, \@grouping );
270 my $json = encode_json( $jsonstruct );
272 # Send it off and get the result
273 my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
274 my $ua = LWP::UserAgent->new();
275 my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
276 'Content' => $json );
279 if( $resp->is_success ) {
280 $answer = decode_json( $resp->content );
282 # Fall back to the old method.
283 warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
284 . "; falling back to perl method";
285 $answer = perl_solver( $graph, @groups );
288 # Fold the result back into what we know about the groups.
290 my $genealogical = 0;
291 foreach my $idx ( 0 .. $#groups ) {
292 my( $calc_groups, $result ) = @{$answer->[$idx]};
293 $genealogical++ if $result;
294 my $input_group = $groups[$idx];
295 foreach my $k ( sort keys %$input_group ) {
296 my $cg = shift @$calc_groups;
297 $input_group->{$k} = $cg;
300 'genealogical' => $result,
303 foreach my $k ( keys %$input_group ) {
304 push( @{$vstruct->{'readings'}},
305 { 'readingid' => $k, 'group' => $input_group->{$k}} );
307 push( @$variants, $vstruct );
310 return { 'variants' => $variants,
311 'variant_count' => scalar @$variants,
312 'genealogical_count' => $genealogical };
315 =head2 analyze_location ( $tradition, $graph, $location_hash )
317 Given the tradition, its stemma graph, and the solution from the graph solver,
318 work out the rest of the information we want. For each reading we need missing,
319 conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown. Alters the location_hash in place.
323 sub analyze_location {
324 my ( $tradition, $graph, $variant_row ) = @_;
326 # Make a hash of all known node memberships, and make the subgraphs.
328 my $reading_roots = {};
330 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
331 my $rid = $rdghash->{'readingid'};
332 map { $contig->{$_} = $rid } @{$rdghash->{'group'}};
335 my $part = $graph->copy;
337 map { $these_vertices{$_} = 1 } @{$rdghash->{'group'}};
338 $part->delete_vertices( grep { !$these_vertices{$_} } $part->vertices );
339 $subgraph->{$rid} = $part;
340 # Get the reading roots.
341 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
344 # Now that we have all the node group memberships, calculate followed/
345 # non-followed/unknown values for each reading. Also figure out the
346 # reading's evident parent(s).
347 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
348 # Group string key - TODO do we need this?
349 my $gst = wit_stringify( $rdghash->{'group'} );
350 my $rid = $rdghash->{'readingid'};
352 my $part = $subgraph->{$rid};
354 # Start figuring things out.
355 my @roots = $part->predecessorless_vertices;
356 $rdghash->{'independent_occurrence'} = scalar @roots;
357 $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
358 # Find the parent readings, if any, of this reading.
360 foreach my $wit ( @roots ) {
361 # Look in the main stemma to find this witness's extant or known-reading
362 # immediate ancestor(s), and look up the reading that each ancestor olds.
363 my @check = $graph->predecessors( $wit );
366 foreach my $wparent( @check ) {
367 my $preading = $contig->{$wparent};
369 $rdgparents{$preading} = 1;
371 push( @next, $graph->predecessors( $wparent ) );
377 $rdghash->{'reading_parents'} = [ keys %rdgparents ];
379 # Find the number of times this reading was altered, and the number of
380 # times we're not sure.
381 my( %nofollow, %unknownfollow );
382 foreach my $wit ( $part->vertices ) {
383 foreach my $wchild ( $graph->successors( $wit ) ) {
384 next if $part->has_vertex( $wchild );
385 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
386 # It definitely changed here.
387 $nofollow{$wchild} = 1;
388 } elsif( !($contig->{$wchild}) ) {
389 # The child is a hypothetical node not definitely in
390 # any group. Answer is unknown.
391 $unknownfollow{$wchild} = 1;
392 } # else it's a non-root node in a known group, and therefore
393 # is presumed to have its reading from its group, not this link.
396 $rdghash->{'not_followed'} = keys %nofollow;
397 $rdghash->{'follow_unknown'} = keys %unknownfollow;
399 # Now say whether this reading represents a conflict.
400 unless( $variant_row->{'genealogical'} ) {
401 $rdghash->{'conflict'} = @roots != 1;
407 =head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
409 ** NOTE ** This method should hopefully not be called - it is not guaranteed
410 to be correct. Serves as a backup for the real solver.
412 Runs an analysis of the given tradition, at the location given in $rank,
413 against the graph of the stemma specified in $stemma_id. The argument
414 @merge_relationship_types is an optional list of relationship types for
415 which readings so related should be treated as equivalent.
417 Returns a nested array data structure as follows:
419 [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
421 where the group list is the array of arrays passed in for each element of @groups,
422 possibly with the addition of hypothetical readings.
428 my( $graph, @groups ) = @_;
430 warn "Not implemented yet";
440 # my $variant_row = { 'id' => $rank, 'readings' => [] };
441 # # Mark each ms as in its own group, first.
442 # foreach my $g ( @$groups ) {
443 # my $gst = wit_stringify( $g );
444 # map { $contig->{$_} = $gst } @$g;
446 # # Now for each unmarked node in the graph, initialize an array
447 # # for possible group memberships. We will use this later to
448 # # resolve potential conflicts.
449 # map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
450 # foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
451 # my $gst = wit_stringify( $g ); # This is the group name
452 # # Copy the graph, and delete all non-members from the new graph.
453 # my $part = $graph->copy;
455 # $part->delete_vertices(
456 # grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
458 # # Now look to see if our group is connected.
459 # if( $undirected ) { # For use with distance trees etc.
460 # # Find all vertices reachable from the first (arbitrary) group
461 # # member. If we are genealogical this should include them all.
462 # my $reachable = {};
463 # map { $reachable->{$_} = 1 } $part->all_reachable( $g->[0] );
464 # # TODO This is a terrible way to do distance trees, since all
465 # # non-leaf nodes are included in every graph part now. We may
466 # # have to go back to SPDP.
469 # # We have to take directionality into account.
470 # # How many root nodes do we have?
471 # my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
472 # $part->predecessorless_vertices;
473 # # Assuming that @$g > 1, find the first root node that has at
474 # # least one successor belonging to our group. If this reading
475 # # is genealogical, there should be only one, but we will check
476 # # that implicitly later.
477 # foreach my $root ( @roots ) {
478 # # Prune the tree to get rid of extraneous hypotheticals.
479 # $root = _prune_subtree( $part, $root, $contig );
481 # # Save this root for our group.
482 # push( @group_roots, $root );
483 # # Get all the successor nodes of our root.
486 # # Dispense with the trivial case of one reading.
488 # @group_roots = ( $wit );
489 # foreach my $v ( $part->vertices ) {
490 # $part->delete_vertex( $v ) unless $v eq $wit;
495 # map { $reading_roots{$_} = 1 } @group_roots;
496 # if( @group_roots > 1 ) {
497 # $conflict->{$group_readings->{$gst}} = 1;
498 # $is_conflicted = 1;
500 # # Paint the 'hypotheticals' with our group.
501 # foreach my $wit ( $part->vertices ) {
502 # if( ref( $contig->{$wit} ) ) {
503 # push( @{$contig->{$wit}}, $gst );
504 # } elsif( $contig->{$wit} ne $gst ) {
505 # warn "How did we get here?";
510 # # Start to write the reading, and save the group subgraph.
511 # my $reading = { 'readingid' => $group_readings->{$gst},
512 # 'missing' => wit_stringify( \@lacunose ),
513 # 'group' => $gst }; # This will change if we find no conflict
514 # # Save the relevant subgraph.
515 # $subgraph->{$gst} = $part;
516 # push( @{$variant_row->{'readings'}}, $reading );
519 # # For each of our hypothetical readings, flatten its 'contig' array if
520 # # the array contains zero or one group. If we have any unflattened arrays,
521 # # we may need to run the resolution process. If the reading is already known
522 # # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
525 # foreach my $wit ( keys %$contig ) {
526 # next unless ref( $contig->{$wit} );
527 # if( @{$contig->{$wit}} > 1 ) {
528 # if( $is_conflicted ) {
529 # $contig->{$wit} = ''; # We aren't going to decide.
531 # push( @resolve, $wit );
534 # my $gst = pop @{$contig->{$wit}};
535 # $contig->{$wit} = $gst || '';
540 # my $still_contig = {};
541 # foreach my $h ( @resolve ) {
542 # # For each of the hypothetical readings with more than one possibility,
543 # # try deleting it from each of its member subgraphs in turn, and see
544 # # if that breaks the contiguous grouping.
545 # # TODO This can still break in a corner case where group A can use
546 # # either vertex 1 or 2, and group B can use either vertex 2 or 1.
547 # # Revisit this if necessary; it could get brute-force nasty.
548 # foreach my $gst ( @{$contig->{$h}} ) {
549 # my $gpart = $subgraph->{$gst}->copy();
550 # # If we have come this far, there is only one root and everything
551 # # is reachable from it.
552 # my( $root ) = $gpart->predecessorless_vertices;
553 # my $reachable = {};
554 # map { $reachable->{$_} = 1 } $gpart->vertices;
556 # # Try deleting the hypothetical node.
557 # $gpart->delete_vertex( $h );
558 # if( $h eq $root ) {
559 # # See if we still have a single root.
560 # my @roots = $gpart->predecessorless_vertices;
561 # warn "This shouldn't have happened" unless @roots;
563 # # $h is needed by this group.
564 # if( exists( $still_contig->{$h} ) ) {
566 # $conflict->{$group_readings->{$gst}} = 1;
567 # $still_contig->{$h} = '';
569 # $still_contig->{$h} = $gst;
573 # # $h is somewhere in the middle. See if everything
574 # # else can still be reached from the root.
575 # my %still_reachable = ( $root => 1 );
576 # map { $still_reachable{$_} = 1 }
577 # $gpart->all_successors( $root );
578 # foreach my $v ( keys %$reachable ) {
580 # if( !$still_reachable{$v}
581 # && ( $contig->{$v} eq $gst
582 # || ( exists $still_contig->{$v}
583 # && $still_contig->{$v} eq $gst ) ) ) {
585 # if( exists $still_contig->{$h} ) {
587 # $conflict->{$group_readings->{$gst}} = 1;
588 # $still_contig->{$h} = '';
590 # $still_contig->{$h} = $gst;
593 # } # else we don't need $h in this group.
595 # } # endif $h eq $root
596 # } # end foreach $gst
599 # # Now we have some hypothetical vertices in $still_contig that are the
600 # # "real" group memberships. Replace these in $contig.
601 # foreach my $v ( keys %$contig ) {
602 # next unless ref $contig->{$v};
603 # $contig->{$v} = $still_contig->{$v};
605 # } # end if @resolve
608 # $variant_row->{'genealogical'} = !( keys %$conflict );
609 # return $variant_row;
613 my( $tree, $root, $contighash ) = @_;
614 # First, delete hypothetical leaves / orphans until there are none left.
615 my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
616 $tree->successorless_vertices;
617 while( @orphan_hypotheticals ) {
618 $tree->delete_vertices( @orphan_hypotheticals );
619 @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
620 $tree->successorless_vertices;
622 # Then delete a hypothetical root with only one successor, moving the
623 # root to the first child that has no other predecessors.
624 while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
625 my @nextroot = $tree->successors( $root );
626 $tree->delete_vertex( $root );
627 ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
629 # The tree has been modified in place, but we need to know the new root.
630 $root = undef unless $root && $tree->has_vertex( $root );
633 # Add the variant, subject to a.c. representation logic.
634 # This assumes that we will see the 'main' version before the a.c. version.
635 sub add_variant_wit {
636 my( $arr, $wit, $acstr ) = @_;
638 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
640 $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
642 push( @$arr, $wit ) unless $skip;
645 sub _useful_variant {
646 my( $group_readings, $graph, $acstr ) = @_;
648 # TODO Decide what to do with AC witnesses
650 # Sort by group size and return
652 my( @readings, @groups ); # The sorted groups for our answer.
653 foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
654 keys %$group_readings ) {
655 push( @readings, $rdg );
656 push( @groups, $group_readings->{$rdg} );
657 if( @{$group_readings->{$rdg}} > 1 ) {
660 my( $wit ) = @{$group_readings->{$rdg}};
661 $wit =~ s/^(.*)\Q$acstr\E$/$1/;
662 $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
665 if( $is_useful > 1 ) {
666 return( \@readings, \@groups );
672 =head2 wit_stringify( $groups )
674 Takes an array of witness groupings and produces a string like
675 ['A','B'] / ['C','D','E'] / ['F']
682 # If we were passed an array of witnesses instead of an array of
683 # groupings, then "group" the witnesses first.
684 unless( ref( $groups->[0] ) ) {
685 my $mkgrp = [ $groups ];
688 foreach my $g ( @$groups ) {
689 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
691 return join( ' / ', @gst );
694 # Helper function to ensure that X and X a.c. never appear in the same list.
695 sub _add_to_witlist {
696 my( $wit, $list, $acstr ) = @_;
699 map { $inlist{$_} = $idx++ } @$list;
700 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
702 unless( exists $inlist{$acwit} ) {
703 push( @$list, $acwit.$acstr );
706 if( exists( $inlist{$wit.$acstr} ) ) {
707 # Replace the a.c. version with the main witness
708 my $i = $inlist{$wit.$acstr};
711 push( @$list, $wit );
717 my( $lista, $listb ) = @_;
720 map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
721 map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
722 my @set = grep { $union{$_} == 1 } keys %union;
723 return map { $scalars{$_} } @set;
730 This package is free software and is provided "as is" without express
731 or implied warranty. You can redistribute it and/or modify it under
732 the same terms as Perl itself.
736 Tara L Andrews E<lt>aurum@cpan.orgE<gt>