1 package Text::Tradition::Analysis;
6 use Encode qw/ encode_utf8 /;
9 use JSON qw/ encode_json decode_json /;
12 use Text::Tradition::Stemma;
15 use vars qw/ @EXPORT_OK /;
16 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
20 Text::Tradition::Analysis - functions for stemma analysis of a tradition
25 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
26 my $t = Text::Tradition->new(
27 'name' => 'this is a text',
29 'file' => '/path/to/tei_parallel_seg_file.xml' );
30 $t->add_stemma( 'dotfile' => $stemmafile );
32 my $variant_data = run_analysis( $tradition );
33 # Recalculate rank $n treating all orthographic variants as equivalent
34 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
38 Text::Tradition is a library for representation and analysis of collated
39 texts, particularly medieval ones. The Collation is the central feature of
40 a Tradition, where the text, its sequence of readings, and its relationships
41 between readings are actually kept.
45 =head2 run_analysis( $tradition, %opts )
47 Runs the analysis described in analyze_variant_location on every location in the
48 collation of the given tradition, with the given options. These include:
52 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
53 is 0 (i.e. the first).
55 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
57 =item * merge_types - Specify a list of relationship types, where related readings
58 should be treated as identical for the purposes of analysis.
60 =item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
67 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
69 my $datafile = 't/data/florilegium_tei_ps.xml';
70 my $tradition = Text::Tradition->new( 'input' => 'TEI',
72 'file' => $datafile );
73 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
74 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
76 my %expected_genealogical = (
107 my $data = run_analysis( $tradition );
108 my $c = $tradition->collation;
109 foreach my $row ( @{$data->{'variants'}} ) {
110 # Account for rows that used to be "not useful"
111 unless( exists $expected_genealogical{$row->{'id'}} ) {
112 $expected_genealogical{$row->{'id'}} = 1;
114 my $gen_bool = $row->{'genealogical'} ? 1 : 0;
115 is( $gen_bool, $expected_genealogical{$row->{'id'}},
116 "Got correct genealogical flag for row " . $row->{'id'} );
117 # Check that we have the right row with the right groups
118 my $rank = $row->{'id'};
119 foreach my $rdghash ( @{$row->{'readings'}} ) {
120 # Skip 'readings' that aren't really
121 next unless $c->reading( $rdghash->{'readingid'} );
123 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
124 "Got correct reading rank" );
125 # Check the witnesses
126 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
127 my @sgrp = sort @{$rdghash->{'group'}};
128 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
131 is( $data->{'variant_count'}, 58, "Got right total variant number" );
132 # TODO Make something meaningful of conflict count, maybe test other bits
139 my( $tradition, %opts ) = @_;
140 my $c = $tradition->collation;
142 my $stemma_id = $opts{'stemma_id'} || 0;
143 my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
144 my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
147 my $stemma = $tradition->stemma( $stemma_id );
149 # Figure out which witnesses we are working with - that is, the ones that
150 # appear both in the stemma and in the tradition. All others are 'lacunose'
152 my @lacunose = $stemma->hypotheticals;
153 my @tradition_wits = map { $_->sigil } $tradition->witnesses;
154 push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
156 # Find and mark 'common' ranks for exclusion, unless they were
157 # explicitly specified.
160 foreach my $rdg ( $c->common_readings ) {
161 $common_rank{$rdg->rank} = 1;
163 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
166 # Group the variants to send to the solver
171 foreach my $rank ( @ranks ) {
172 my $missing = [ @lacunose ];
173 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, \@collapse );
174 # Filter out any empty rankgroups
175 # (e.g. from the later rank for a transposition)
176 next unless keys %$rankgroup;
177 if( $opts{'exclude_type1'} ) {
178 # Check to see whether this is a "useful" group.
179 my( $rdgs, $grps ) = _useful_variant( $rankgroup,
180 $stemma->graph, $c->ac_label );
183 push( @use_ranks, $rank );
184 push( @groups, $rankgroup );
185 $lacunae{$rank} = $missing;
188 my $answer = solve_variants( $stemma, @groups );
190 # Do further analysis on the answer
191 my $conflict_count = 0;
192 my $aclabel = $c->ac_label;
193 foreach my $idx ( 0 .. $#use_ranks ) {
194 my $location = $answer->{'variants'}->[$idx];
195 # Add the rank back in
196 my $rank = $use_ranks[$idx];
197 $location->{'id'} = $rank;
198 # Note what our lacunae are
200 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
201 $location->{'missing'} = [ keys %lmiss ];
203 # Run the extra analysis we need.
204 analyze_location( $tradition, $stemma, $location, \%lmiss );
207 # Do the final post-analysis tidying up of the data.
208 foreach my $rdghash ( @{$location->{'readings'}} ) {
210 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
211 # Add the reading text back in, setting display value as needed
212 my $rdg = $c->reading( $rdghash->{'readingid'} );
214 $rdghash->{'text'} = $rdg->text .
215 ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
216 $rdghash->{'is_ungrammatical'} = $rdg->grammar_invalid;
217 $rdghash->{'is_nonsense'} = $rdg->is_nonsense;
219 # Remove lacunose witnesses from this reading's list now that the
222 map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
223 $rdghash->{'group'} = \@realgroup;
224 # Note any layered witnesses that appear in this group
225 foreach( @realgroup ) {
226 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
227 push( @layerwits, $1 );
231 $location->{'layerwits'} = \@layerwits if @layerwits;
233 $answer->{'conflict_count'} = $conflict_count;
238 =head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
240 Groups the variants at the given $rank of the collation, treating any
241 relationships in @merge_relationship_types as equivalent. $lacunose should
242 be a reference to an array, to which the sigla of lacunose witnesses at this
243 rank will be appended; $transposed should be a reference to a hash, wherein
244 the identities of transposed readings and their relatives will be stored.
246 Returns a hash $group_readings where $rdg is attested by the witnesses listed
247 in $group_readings->{$rdg}.
251 # Return group_readings, groups, lacunose
253 my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
254 my $c = $tradition->collation;
255 my $aclabel = $c->ac_label;
256 my $table = $c->alignment_table;
257 # Get the alignment table readings
258 my %readings_at_rank;
259 my %is_lacunose; # lookup table for witnesses not in stemma
260 map { $is_lacunose{$_} = 1; $is_lacunose{$_.$aclabel} = 1 } @$lacunose;
263 my $has_transposition;
264 foreach my $tablewit ( @{$table->{'alignment'}} ) {
265 my $rdg = $tablewit->{'tokens'}->[$rank-1];
266 my $wit = $tablewit->{'witness'};
267 # Exclude the witness if it is "lacunose" which if we got here
268 # means "not in the stemma".
269 next if $is_lacunose{$wit};
270 # Note if the witness is actually in a lacuna
271 if( $rdg && $rdg->{'t'}->is_lacuna ) {
272 _add_to_witlist( $wit, $lacunose, $aclabel );
273 # Otherwise the witness either has a positive reading...
275 # If the reading has been counted elsewhere as a transposition, ignore it.
276 if( $transposed->{$rdg->{'t'}->id} ) {
277 # TODO Does this cope with three-way transpositions?
278 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
281 # Otherwise, record it...
282 $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
283 # ...and grab any transpositions, and their relations.
284 my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings();
285 foreach my $trdg ( @transp ) {
286 next if exists $readings_at_rank{$trdg->id};
287 $has_transposition = 1;
288 my @affected_wits = _table_witnesses(
289 $table, $trdg, \%is_lacunose, $aclabel );
290 next unless @affected_wits;
291 map { $moved_wits{$_} = 1 } @affected_wits;
292 $transposed->{$trdg->id} =
293 [ _table_witnesses( $table, $rdg->{'t'}, \%is_lacunose, $aclabel ) ];
294 $readings_at_rank{$trdg->id} = $trdg;
296 # ...or it is empty, ergo a gap.
298 _add_to_witlist( $wit, \@check_for_gaps, $aclabel );
302 map { _add_to_witlist( $_, \@gap_wits, $aclabel )
303 unless $moved_wits{$_} } @check_for_gaps;
304 # Group the readings, collapsing groups by relationship if needed
305 my $grouped_readings = {};
306 foreach my $rdg ( values %readings_at_rank ) {
307 # Skip readings that have been collapsed into others.
308 next if exists $grouped_readings->{$rdg->id}
309 && $grouped_readings->{$rdg->id} eq 'COLLAPSE';
310 # Get the witness list, including from readings collapsed into this one.
311 my @wits = _table_witnesses( $table, $rdg, \%is_lacunose, $aclabel );
312 if( $collapse && @$collapse ) {
313 my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
314 foreach my $other ( $rdg->related_readings( $filter ) ) {
315 my @otherwits = _table_witnesses(
316 $table, $other, \%is_lacunose, $aclabel );
317 push( @wits, @otherwits );
318 $grouped_readings->{$other->id} = 'COLLAPSE';
321 $grouped_readings->{$rdg->id} = \@wits;
323 $grouped_readings->{'(omitted)'} = \@gap_wits if @gap_wits;
324 # Get rid of our collapsed readings
325 map { delete $grouped_readings->{$_} if $grouped_readings->{$_} eq 'COLLAPSE' }
326 keys %$grouped_readings
329 # If something was transposed, check the groups for doubled-up readings
330 if( $has_transposition ) {
331 # print STDERR "Group for rank $rank:\n";
332 # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" }
333 # keys %$grouped_readings;
334 _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings );
338 return $grouped_readings;
341 # Helper function to query the alignment table for all witnesses (a.c. included)
342 # that have a given reading at its rank.
343 sub _table_witnesses {
344 my( $table, $trdg, $lacunose, $aclabel ) = @_;
345 my $tableidx = $trdg->rank - 1;
347 foreach my $row ( @{$table->{'alignment'}} ) {
348 my $wit = $row->{'witness'};
349 next if $lacunose->{$wit};
350 my $rdg = $row->{'tokens'}->[$tableidx];
351 next unless exists $rdg->{'t'} && defined $rdg->{'t'};
352 _add_to_witlist( $wit, \@has_reading, $aclabel )
353 if $rdg->{'t'}->id eq $trdg->id;
358 # Helper function to ensure that X and X a.c. never appear in the same list.
359 sub _add_to_witlist {
360 my( $wit, $list, $acstr ) = @_;
363 map { $inlist{$_} = $idx++ } @$list;
364 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
366 unless( exists $inlist{$acwit} ) {
367 push( @$list, $acwit.$acstr );
370 if( exists( $inlist{$wit.$acstr} ) ) {
371 # Replace the a.c. version with the main witness
372 my $i = $inlist{$wit.$acstr};
375 push( @$list, $wit );
380 sub _check_transposed_consistency {
381 my( $c, $rank, $transposed, $groupings ) = @_;
384 # Note which readings are actually at this rank, and which witnesses
385 # belong to which reading.
386 foreach my $rdg ( keys %$groupings ) {
387 my $rdgobj = $c->reading( $rdg );
388 # Count '(omitted)' as a reading at this rank
389 $thisrank{$rdg} = 1 if !$rdgobj || $rdgobj->rank == $rank;
390 map { push( @{$seen_wits{$_}}, $rdg ) } @{$groupings->{$rdg}};
392 # Our work is done if we have no witness belonging to more than one
394 my @doubled = grep { scalar @{$seen_wits{$_}} > 1 } keys %seen_wits;
395 return unless @doubled;
396 # If we have a symmetric related transposition, drop the non-rank readings.
397 if( @doubled == scalar keys %seen_wits ) {
398 foreach my $rdg ( keys %$groupings ) {
399 if( !$thisrank{$rdg} ) {
400 my $groupstr = wit_stringify( $groupings->{$rdg} );
401 my ( $matched ) = grep { $groupstr eq wit_stringify( $groupings->{$_} ) }
403 delete $groupings->{$rdg};
404 # If we found a group match, assume there is a symmetry happening.
405 # TODO think more about this
406 # print STDERR "*** Deleting symmetric reading $rdg\n";
408 delete $transposed->{$rdg};
409 warn "Found problem in evident symmetry with reading $rdg";
413 # Otherwise 'unhook' the transposed reading(s) that have duplicates.
415 foreach my $dup ( @doubled ) {
416 foreach my $rdg ( @{$seen_wits{$dup}} ) {
417 next if $thisrank{$rdg};
418 next unless exists $groupings->{$rdg};
419 # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
420 delete $groupings->{$rdg};
421 delete $transposed->{$rdg};
424 # and put any now-orphaned readings into an 'omitted' reading.
425 foreach my $wit ( keys %seen_wits ) {
426 unless( grep { exists $groupings->{$_} } @{$seen_wits{$wit}} ) {
427 $groupings->{'(omitted)'} = [] unless exists $groupings->{'(omitted)'};
428 _add_to_witlist( $wit, $groupings->{'(omitted)'}, $c->ac_label );
434 =head2 solve_variants( $graph, @groups )
436 Sends the set of groups to the external graph solver service and returns
437 a cleaned-up answer, adding the rank IDs back where they belong.
439 The JSON has the form
440 { "graph": [ stemmagraph DOT string without newlines ],
441 "groupings": [ array of arrays of groups, one per rank ] }
443 The answer has the form
444 { "variants" => [ array of variant location structures ],
445 "variant_count" => total,
446 "conflict_count" => number of conflicts detected,
447 "genealogical_count" => number of solutions found }
452 my( $stemma, @groups ) = @_;
453 my $aclabel = $stemma->collation->ac_label;
455 # Filter the groups down to distinct groups, and work out what graph
456 # should be used in the calculation of each group. We want to send each
457 # distinct problem to the solver only once.
458 # We need a whole bunch of lookup tables for this.
459 my $index_groupkeys = {}; # Save the order of readings
460 my $group_indices = {}; # Save the indices that have a given grouping
461 my $graph_problems = {}; # Save the groupings for the given graph
463 foreach my $idx ( 0..$#groups ) {
464 my $ghash = $groups[$idx];
466 # Sort the groupings from big to little, and scan for a.c. witnesses
467 # that would need an extended graph.
468 my @acwits; # note which AC witnesses crop up at this rank
469 my @idxkeys = sort { scalar @{$ghash->{$b}} <=> scalar @{$ghash->{$a}} }
471 foreach my $rdg ( @idxkeys ) {
472 my @sg = sort @{$ghash->{$rdg}};
473 push( @acwits, grep { $_ =~ /\Q$aclabel\E$/ } @sg );
474 push( @grouping, \@sg );
476 # Save the reading order
477 $index_groupkeys->{$idx} = \@idxkeys;
479 # Now associate the distinct group with this index
480 my $gstr = wit_stringify( \@grouping );
481 push( @{$group_indices->{$gstr}}, $idx );
483 # Finally, add the group to the list to be calculated for this graph.
484 map { s/\Q$aclabel\E$// } @acwits;
487 $graph = $stemma->extend_graph( \@acwits );
489 die "Unable to extend graph with @acwits";
491 unless( exists $graph_problems->{"$graph"} ) {
492 $graph_problems->{"$graph"} = { 'object' => $graph, 'groups' => [] };
494 push( @{$graph_problems->{"$graph"}->{'groups'}}, \@grouping );
497 ## For each distinct graph, send its groups to the solver.
498 my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
499 my $ua = LWP::UserAgent->new();
500 ## Witness map is a HACK to get around limitations in node names from IDP
501 my $witness_map = {};
502 ## Variables to store answers as they come back
503 my $variants = [ ( undef ) x ( scalar keys %$index_groupkeys ) ];
504 my $genealogical = 0;
505 foreach my $graphkey ( keys %$graph_problems ) {
506 my $graph = $graph_problems->{$graphkey}->{'object'};
507 my $groupings = $graph_problems->{$graphkey}->{'groups'};
508 my $json = encode_json( _safe_wit_strings( $graph, $stemma->collation,
509 $groupings, $witness_map ) );
510 # Send it off and get the result
511 #print STDERR "Sending request: $json\n";
512 my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
513 'Content' => $json );
516 if( $resp->is_success ) {
517 $answer = _desanitize_names( decode_json( $resp->content ), $witness_map );
520 # Fall back to the old method.
521 warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
522 . "; falling back to perl method";
523 $answer = perl_solver( $graph, @$groupings );
525 ## The answer is the evaluated groupings, plus a boolean for whether
526 ## they were genealogical. Reconstruct our original groups.
527 foreach my $gidx ( 0 .. $#{$groupings} ) {
528 my( $calc_groups, $result ) = @{$answer->[$gidx]};
531 # Prune the calculated groups, in case the IDP solver failed to.
534 foreach my $cg ( @$calc_groups ) {
535 # This is a little wasteful but the path of least
536 # resistance. Send both the stemma, which knows what
537 # its hypotheticals are, and the actual graph used.
538 my @pg = _prune_group( $cg, $stemma, $graph );
539 push( @pruned_groups, \@pg );
541 $calc_groups = \@pruned_groups;
544 # Retrieve the key for the original group that went to the solver
545 my $input_group = wit_stringify( $groupings->[$gidx] );
546 foreach my $oidx ( @{$group_indices->{$input_group}} ) {
547 my @readings = @{$index_groupkeys->{$oidx}};
549 'genealogical' => $result,
552 foreach my $ridx ( 0 .. $#readings ) {
553 push( @{$vstruct->{'readings'}},
554 { 'readingid' => $readings[$ridx],
555 'group' => $calc_groups->[$ridx] } );
557 $variants->[$oidx] = $vstruct;
562 return { 'variants' => $variants,
563 'variant_count' => scalar @$variants,
564 'genealogical_count' => $genealogical };
567 #### HACKERY to cope with IDP's limited idea of what a node name looks like ###
569 sub _safe_wit_strings {
570 my( $graph, $c, $groupings, $witness_map ) = @_;
571 # Parse the graph we were given into a stemma.
572 my $safegraph = Graph->new();
573 # Convert the graph to a safe representation and store the conversion.
574 foreach my $n ( $graph->vertices ) {
575 my $sn = _safe_witstr( $n );
576 if( exists $witness_map->{$sn} ) {
577 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
578 if $witness_map->{$sn} ne $n;
580 $witness_map->{$sn} = $n;
582 $safegraph->add_vertex( $sn );
583 $safegraph->set_vertex_attributes( $sn,
584 $graph->get_vertex_attributes( $n ) );
586 foreach my $e ( $graph->edges ) {
587 my @safe_e = ( _safe_witstr( $e->[0] ), _safe_witstr( $e->[1] ) );
588 $safegraph->add_edge( @safe_e );
590 my $safe_stemma = Text::Tradition::Stemma->new(
591 'collation' => $c, 'graph' => $safegraph );
593 # Now convert the witness groupings to a safe representation.
594 my $safe_groupings = [];
595 foreach my $grouping ( @$groupings ) {
596 my $safe_grouping = [];
597 foreach my $group ( @$grouping ) {
599 foreach my $n ( @$group ) {
600 my $sn = _safe_witstr( $n );
601 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
602 if exists $witness_map->{$sn} && $witness_map->{$sn} ne $n;
603 $witness_map->{$sn} = $n;
604 push( @$safe_group, $sn );
606 push( @$safe_grouping, $safe_group );
608 push( @$safe_groupings, $safe_grouping );
611 # Return it all in the struct we expect. We have stored the reductions
612 # in the $witness_map that we were passed.
613 return { 'graph' => $safe_stemma->editable( { 'linesep' => ' ' } ),
614 'groupings' => $safe_groupings };
619 $witstr =~ s/\s+/_/g;
620 $witstr =~ s/[^\w\d-]//g;
624 sub _desanitize_names {
625 my( $jsonstruct, $witness_map ) = @_;
627 foreach my $grouping ( @$jsonstruct ) {
628 my $real_grouping = [];
629 foreach my $element ( @$grouping ) {
630 if( ref( $element ) eq 'ARRAY' ) {
632 my $real_groupset = [];
633 foreach my $group ( @$element ) {
635 foreach my $n ( @$group ) {
636 my $rn = $witness_map->{$n};
637 push( @$real_group, $rn );
639 push( @$real_groupset, $real_group );
641 push( @$real_grouping, $real_groupset );
643 # It is the boolean, not actually a group.
644 push( @$real_grouping, $element );
647 push( @$result, $real_grouping );
654 =head2 analyze_location ( $tradition, $graph, $location_hash )
656 Given the tradition, its stemma graph, and the solution from the graph solver,
657 work out the rest of the information we want. For each reading we need missing,
658 conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown. Alters the location_hash in place.
662 sub analyze_location {
663 my ( $tradition, $stemma, $variant_row, $lacunose ) = @_;
664 my $c = $tradition->collation;
666 # Make a hash of all known node memberships, and make the subgraphs.
668 my $reading_roots = {};
670 my $acstr = $c->ac_label;
672 $DB::single = 1 if $variant_row->{id} == 87;
673 # Note which witnesses positively belong to which group
674 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
675 my $rid = $rdghash->{'readingid'};
676 foreach my $wit ( @{$rdghash->{'group'}} ) {
677 $contig->{$wit} = $rid;
678 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
684 # Get the actual graph we should work with
687 $graph = @acwits ? $stemma->extend_graph( \@acwits ) : $stemma->graph;
689 die "Could not extend graph with a.c. witnesses @acwits";
692 # Now, armed with that knowledge, make a subgraph for each reading
693 # and note the root(s) of each subgraph.
694 foreach my $rdghash( @{$variant_row->{'readings'}} ) {
695 my $rid = $rdghash->{'readingid'};
698 my $part = $graph->copy;
699 my @todelete = grep { exists $contig->{$_} && $contig->{$_} ne $rid }
701 $part->delete_vertices( @todelete );
702 _prune_subtree( $part, $lacunose );
703 $subgraph->{$rid} = $part;
704 # Record the remaining lacunose nodes as part of this group, if
705 # we are dealing with a non-genealogical reading.
706 unless( $variant_row->{'genealogical'} ) {
707 map { $contig->{$_} = $rid } $part->vertices;
709 # Get the reading roots.
710 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
713 # Now that we have all the node group memberships, calculate followed/
714 # non-followed/unknown values for each reading. Also figure out the
715 # reading's evident parent(s).
716 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
717 my $rid = $rdghash->{'readingid'};
719 my $part = $subgraph->{$rid};
721 # Start figuring things out.
722 my @roots = grep { $reading_roots->{$_} eq $rid } keys %$reading_roots;
723 $rdghash->{'independent_occurrence'} = \@roots;
724 $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
725 # Find the parent readings, if any, of this reading.
727 foreach my $wit ( @roots ) {
728 # Look in the main stemma to find this witness's extant or known-reading
729 # immediate ancestor(s), and look up the reading that each ancestor olds.
730 my @check = $graph->predecessors( $wit );
733 foreach my $wparent( @check ) {
734 my $preading = $contig->{$wparent};
736 $rdgparents->{$preading} = 1;
738 push( @next, $graph->predecessors( $wparent ) );
744 foreach my $p ( keys %$rdgparents ) {
745 # Resolve the relationship of the parent to the reading, and
746 # save it in our hash.
747 my $pobj = $c->reading( $p );
749 my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
751 my $rel = $c->get_relationship( $p, $rdghash->{readingid} );
753 $relation = { type => $rel->type };
754 if( $rel->has_annotation ) {
755 $relation->{'annotation'} = $rel->annotation;
759 my $phash = { 'label' => $prep, 'relation' => $relation };
760 $phash->{'text'} = $pobj->text if $pobj;
761 $rdgparents->{$p} = $phash;
764 $rdghash->{'reading_parents'} = $rdgparents;
766 # Find the number of times this reading was altered, and the number of
767 # times we're not sure.
768 my( %nofollow, %unknownfollow );
769 foreach my $wit ( $part->vertices ) {
770 foreach my $wchild ( $graph->successors( $wit ) ) {
771 next if $part->has_vertex( $wchild );
772 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
773 # It definitely changed here.
774 $nofollow{$wchild} = 1;
775 } elsif( !($contig->{$wchild}) ) {
776 # The child is a hypothetical node not definitely in
777 # any group. Answer is unknown.
778 $unknownfollow{$wchild} = 1;
779 } # else it's a non-root node in a known group, and therefore
780 # is presumed to have its reading from its group, not this link.
783 $rdghash->{'not_followed'} = keys %nofollow;
784 $rdghash->{'follow_unknown'} = keys %unknownfollow;
786 # Now say whether this reading represents a conflict.
787 unless( $variant_row->{'genealogical'} ) {
788 $rdghash->{'conflict'} = @roots != 1;
794 =head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
796 ** NOTE ** This method should hopefully not be called - it is not guaranteed
797 to be correct. Serves as a backup for the real solver.
799 Runs an analysis of the given tradition, at the location given in $rank,
800 against the graph of the stemma specified in $stemma_id. The argument
801 @merge_relationship_types is an optional list of relationship types for
802 which readings so related should be treated as equivalent.
804 Returns a nested array data structure as follows:
806 [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
808 where the group list is the array of arrays passed in for each element of @groups,
809 possibly with the addition of hypothetical readings.
815 my( $graph, @groups ) = @_;
817 foreach my $g ( @groups ) {
818 push( @answer, _solve_variant_location( $graph, $g ) );
823 sub _solve_variant_location {
824 my( $graph, $groups ) = @_;
831 # Mark each ms as in its own group, first.
832 foreach my $g ( @$groups ) {
833 my $gst = wit_stringify( $g );
834 map { $contig->{$_} = $gst } @$g;
837 # Now for each unmarked node in the graph, initialize an array
838 # for possible group memberships. We will use this later to
839 # resolve potential conflicts.
840 map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
841 foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
842 my $gst = wit_stringify( $g ); # This is the group name
843 # Copy the graph, and delete all non-members from the new graph.
844 my $part = $graph->copy;
846 $part->delete_vertices(
847 grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
849 # Now look to see if our group is connected.
851 # We have to take directionality into account.
852 # How many root nodes do we have?
853 my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
854 $part->predecessorless_vertices;
855 # Assuming that @$g > 1, find the first root node that has at
856 # least one successor belonging to our group. If this reading
857 # is genealogical, there should be only one, but we will check
858 # that implicitly later.
859 foreach my $root ( @roots ) {
860 # Prune the tree to get rid of extraneous hypotheticals.
861 $root = _prune_subtree_old( $part, $root, $contig );
863 # Save this root for our group.
864 push( @group_roots, $root );
865 # Get all the successor nodes of our root.
868 # Dispense with the trivial case of one reading.
870 @group_roots = ( $wit );
871 foreach my $v ( $part->vertices ) {
872 $part->delete_vertex( $v ) unless $v eq $wit;
876 if( @group_roots > 1 ) {
877 $conflict->{$gst} = 1;
880 # Paint the 'hypotheticals' with our group.
881 foreach my $wit ( $part->vertices ) {
882 if( ref( $contig->{$wit} ) ) {
883 push( @{$contig->{$wit}}, $gst );
884 } elsif( $contig->{$wit} ne $gst ) {
885 warn "How did we get here?";
890 # Save the relevant subgraph.
891 $subgraph->{$gst} = $part;
894 # For each of our hypothetical readings, flatten its 'contig' array if
895 # the array contains zero or one group. If we have any unflattened arrays,
896 # we may need to run the resolution process. If the reading is already known
897 # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
900 foreach my $wit ( keys %$contig ) {
901 next unless ref( $contig->{$wit} );
902 if( @{$contig->{$wit}} > 1 ) {
903 if( $is_conflicted ) {
904 $contig->{$wit} = ''; # We aren't going to decide.
906 push( @resolve, $wit );
909 my $gst = pop @{$contig->{$wit}};
910 $contig->{$wit} = $gst || '';
915 my $still_contig = {};
916 foreach my $h ( @resolve ) {
917 # For each of the hypothetical readings with more than one possibility,
918 # try deleting it from each of its member subgraphs in turn, and see
919 # if that breaks the contiguous grouping.
920 # TODO This can still break in a corner case where group A can use
921 # either vertex 1 or 2, and group B can use either vertex 2 or 1.
922 # Revisit this if necessary; it could get brute-force nasty.
923 foreach my $gst ( @{$contig->{$h}} ) {
924 my $gpart = $subgraph->{$gst}->copy();
925 # If we have come this far, there is only one root and everything
926 # is reachable from it.
927 my( $root ) = $gpart->predecessorless_vertices;
929 map { $reachable->{$_} = 1 } $gpart->vertices;
931 # Try deleting the hypothetical node.
932 $gpart->delete_vertex( $h );
934 # See if we still have a single root.
935 my @roots = $gpart->predecessorless_vertices;
936 warn "This shouldn't have happened" unless @roots;
938 # $h is needed by this group.
939 if( exists( $still_contig->{$h} ) ) {
941 $conflict->{$gst} = 1;
942 $still_contig->{$h} = '';
944 $still_contig->{$h} = $gst;
948 # $h is somewhere in the middle. See if everything
949 # else can still be reached from the root.
950 my %still_reachable = ( $root => 1 );
951 map { $still_reachable{$_} = 1 }
952 $gpart->all_successors( $root );
953 foreach my $v ( keys %$reachable ) {
955 if( !$still_reachable{$v}
956 && ( $contig->{$v} eq $gst
957 || ( exists $still_contig->{$v}
958 && $still_contig->{$v} eq $gst ) ) ) {
960 if( exists $still_contig->{$h} ) {
962 $conflict->{$gst} = 1;
963 $still_contig->{$h} = '';
965 $still_contig->{$h} = $gst;
968 } # else we don't need $h in this group.
970 } # endif $h eq $root
974 # Now we have some hypothetical vertices in $still_contig that are the
975 # "real" group memberships. Replace these in $contig.
976 foreach my $v ( keys %$contig ) {
977 next unless ref $contig->{$v};
978 $contig->{$v} = $still_contig->{$v};
982 my $is_genealogical = keys %$conflict ? JSON::false : JSON::true;
983 my $variant_row = [ [], $is_genealogical ];
984 # Fill in the groupings from $contig.
985 foreach my $g ( @$groups ) {
986 my $gst = wit_stringify( $g );
987 my @realgroup = grep { $contig->{$_} eq $gst } keys %$contig;
988 push( @{$variant_row->[0]}, \@realgroup );
994 my( $group, $stemma, $graph ) = @_;
996 map { $lacunose->{$_} = 1 } $stemma->hypotheticals;
997 map { $lacunose->{$_} = 0 } @$group;
999 my $subgraph = $graph->copy;
1000 map { $subgraph->delete_vertex( $_ ) unless exists $lacunose->{$_} }
1001 $subgraph->vertices;
1002 # ...and find the root.
1003 # Now prune and return the remaining vertices.
1004 _prune_subtree( $subgraph, $lacunose );
1005 return $subgraph->vertices;
1008 sub _prune_subtree {
1009 my( $tree, $lacunose ) = @_;
1011 # Delete lacunose witnesses that have no successors
1012 my @orphan_hypotheticals;
1015 die "Infinite loop on leaves" if $ctr > 100;
1016 @orphan_hypotheticals = grep { $lacunose->{$_} }
1017 $tree->successorless_vertices;
1018 $tree->delete_vertices( @orphan_hypotheticals );
1020 } while( @orphan_hypotheticals );
1022 # Delete lacunose roots that have a single successor
1026 die "Infinite loop on roots" if $ctr > 100;
1027 @redundant_root = grep { $lacunose->{$_} && $tree->successors( $_ ) == 1 }
1028 $tree->predecessorless_vertices;
1029 $tree->delete_vertices( @redundant_root );
1031 } while( @redundant_root );
1034 sub _prune_subtree_old {
1035 my( $tree, $root, $contighash ) = @_;
1036 # First, delete hypothetical leaves / orphans until there are none left.
1037 my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
1038 $tree->successorless_vertices;
1039 while( @orphan_hypotheticals ) {
1040 $tree->delete_vertices( @orphan_hypotheticals );
1041 @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
1042 $tree->successorless_vertices;
1044 # Then delete a hypothetical root with only one successor, moving the
1045 # root to the first child that has no other predecessors.
1046 while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
1047 my @nextroot = $tree->successors( $root );
1048 $tree->delete_vertex( $root );
1049 ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
1051 # The tree has been modified in place, but we need to know the new root.
1052 $root = undef unless $root && $tree->has_vertex( $root );
1055 # Add the variant, subject to a.c. representation logic.
1056 # This assumes that we will see the 'main' version before the a.c. version.
1057 sub add_variant_wit {
1058 my( $arr, $wit, $acstr ) = @_;
1060 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
1062 $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
1064 push( @$arr, $wit ) unless $skip;
1067 sub _useful_variant {
1068 my( $group_readings, $graph, $acstr ) = @_;
1070 # TODO Decide what to do with AC witnesses
1072 # Sort by group size and return
1074 my( @readings, @groups ); # The sorted groups for our answer.
1075 foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
1076 keys %$group_readings ) {
1077 push( @readings, $rdg );
1078 push( @groups, $group_readings->{$rdg} );
1079 if( @{$group_readings->{$rdg}} > 1 ) {
1082 my( $wit ) = @{$group_readings->{$rdg}};
1083 $wit =~ s/^(.*)\Q$acstr\E$/$1/;
1084 $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
1087 if( $is_useful > 1 ) {
1088 return( \@readings, \@groups );
1094 =head2 wit_stringify( $groups )
1096 Takes an array of witness groupings and produces a string like
1097 ['A','B'] / ['C','D','E'] / ['F']
1104 # If we were passed an array of witnesses instead of an array of
1105 # groupings, then "group" the witnesses first.
1106 unless( ref( $groups->[0] ) ) {
1107 my $mkgrp = [ $groups ];
1110 foreach my $g ( @$groups ) {
1111 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
1113 return join( ' / ', @gst );
1117 my( $lista, $listb ) = @_;
1120 map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
1121 map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
1122 my @set = grep { $union{$_} == 1 } keys %union;
1123 return map { $scalars{$_} } @set;
1130 This package is free software and is provided "as is" without express
1131 or implied warranty. You can redistribute it and/or modify it under
1132 the same terms as Perl itself.
1136 Tara L Andrews E<lt>aurum@cpan.orgE<gt>