1 package Text::Tradition::Analysis;
6 use Encode qw/ encode_utf8 /;
9 use JSON qw/ encode_json decode_json /;
12 use Text::Tradition::Stemma;
15 use vars qw/ @EXPORT_OK /;
16 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
20 Text::Tradition::Analysis - functions for stemma analysis of a tradition
25 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
26 my $t = Text::Tradition->new(
27 'name' => 'this is a text',
29 'file' => '/path/to/tei_parallel_seg_file.xml' );
30 $t->add_stemma( 'dotfile' => $stemmafile );
32 my $variant_data = run_analysis( $tradition );
33 # Recalculate rank $n treating all orthographic variants as equivalent
34 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
38 Text::Tradition is a library for representation and analysis of collated
39 texts, particularly medieval ones. The Collation is the central feature of
40 a Tradition, where the text, its sequence of readings, and its relationships
41 between readings are actually kept.
45 =head2 run_analysis( $tradition, %opts )
47 Runs the analysis described in analyze_variant_location on every location in the
48 collation of the given tradition, with the given options. These include:
52 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
53 is 0 (i.e. the first).
55 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
57 =item * merge_types - Specify a list of relationship types, where related readings
58 should be treated as identical for the purposes of analysis.
60 =item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
67 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
69 my $datafile = 't/data/florilegium_tei_ps.xml';
70 my $tradition = Text::Tradition->new( 'input' => 'TEI',
72 'file' => $datafile );
73 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
74 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
76 my %expected_genealogical = (
107 my $data = run_analysis( $tradition );
108 my $c = $tradition->collation;
109 foreach my $row ( @{$data->{'variants'}} ) {
110 # Account for rows that used to be "not useful"
111 unless( exists $expected_genealogical{$row->{'id'}} ) {
112 $expected_genealogical{$row->{'id'}} = 1;
114 my $gen_bool = $row->{'genealogical'} ? 1 : 0;
115 is( $gen_bool, $expected_genealogical{$row->{'id'}},
116 "Got correct genealogical flag for row " . $row->{'id'} );
117 # Check that we have the right row with the right groups
118 my $rank = $row->{'id'};
119 foreach my $rdghash ( @{$row->{'readings'}} ) {
120 # Skip 'readings' that aren't really
121 next unless $c->reading( $rdghash->{'readingid'} );
123 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
124 "Got correct reading rank" );
125 # Check the witnesses
126 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
127 my @sgrp = sort @{$rdghash->{'group'}};
128 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
131 is( $data->{'variant_count'}, 58, "Got right total variant number" );
132 # TODO Make something meaningful of conflict count, maybe test other bits
139 my( $tradition, %opts ) = @_;
140 my $c = $tradition->collation;
142 my $stemma_id = $opts{'stemma_id'} || 0;
143 my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
144 my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
147 my $stemma = $tradition->stemma( $stemma_id );
149 # Figure out which witnesses we are working with - that is, the ones that
150 # appear both in the stemma and in the tradition. All others are 'lacunose'
152 my @lacunose = $stemma->hypotheticals;
153 my @tradition_wits = map { $_->sigil } $tradition->witnesses;
154 push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
156 # Find and mark 'common' ranks for exclusion, unless they were
157 # explicitly specified.
160 foreach my $rdg ( $c->common_readings ) {
161 $common_rank{$rdg->rank} = 1;
163 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
166 # Group the variants to send to the solver
171 foreach my $rank ( @ranks ) {
172 my $missing = [ @lacunose ];
173 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, \@collapse );
174 # Filter out any empty rankgroups
175 # (e.g. from the later rank for a transposition)
176 next unless keys %$rankgroup;
177 if( $opts{'exclude_type1'} ) {
178 # Check to see whether this is a "useful" group.
179 my( $rdgs, $grps ) = _useful_variant( $rankgroup,
180 $stemma->graph, $c->ac_label );
183 push( @use_ranks, $rank );
184 push( @groups, $rankgroup );
185 $lacunae{$rank} = $missing;
188 my $answer = solve_variants( $stemma, @groups );
190 # Do further analysis on the answer
191 my $conflict_count = 0;
192 my $aclabel = $c->ac_label;
193 foreach my $idx ( 0 .. $#use_ranks ) {
194 my $location = $answer->{'variants'}->[$idx];
195 # Add the rank back in
196 my $rank = $use_ranks[$idx];
197 $location->{'id'} = $rank;
198 # Note what our lacunae are
200 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
201 $location->{'missing'} = [ keys %lmiss ];
203 # Run the extra analysis we need.
204 analyze_location( $tradition, $stemma, $location, \%lmiss );
207 # Do the final post-analysis tidying up of the data.
208 foreach my $rdghash ( @{$location->{'readings'}} ) {
210 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
211 # Add the reading text back in, setting display value as needed
212 my $rdg = $c->reading( $rdghash->{'readingid'} );
214 $rdghash->{'text'} = $rdg->text .
215 ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
217 # Remove lacunose witnesses from this reading's list now that the
220 map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
221 $rdghash->{'group'} = \@realgroup;
222 # Note any layered witnesses that appear in this group
223 foreach( @realgroup ) {
224 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
225 push( @layerwits, $1 );
229 $location->{'layerwits'} = \@layerwits if @layerwits;
231 $answer->{'conflict_count'} = $conflict_count;
236 =head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
238 Groups the variants at the given $rank of the collation, treating any
239 relationships in @merge_relationship_types as equivalent. $lacunose should
240 be a reference to an array, to which the sigla of lacunose witnesses at this
241 rank will be appended; $transposed should be a reference to a hash, wherein
242 the identities of transposed readings and their relatives will be stored.
244 Returns a hash $group_readings where $rdg is attested by the witnesses listed
245 in $group_readings->{$rdg}.
249 # Return group_readings, groups, lacunose
251 my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
252 my $c = $tradition->collation;
253 my $aclabel = $c->ac_label;
254 my $table = $c->alignment_table;
255 # Get the alignment table readings
256 my %readings_at_rank;
257 my %is_lacunose; # lookup table for witnesses not in stemma
258 map { $is_lacunose{$_} = 1; $is_lacunose{$_.$aclabel} = 1 } @$lacunose;
261 my $has_transposition;
262 foreach my $tablewit ( @{$table->{'alignment'}} ) {
263 my $rdg = $tablewit->{'tokens'}->[$rank-1];
264 my $wit = $tablewit->{'witness'};
265 # Exclude the witness if it is "lacunose" which if we got here
266 # means "not in the stemma".
267 next if $is_lacunose{$wit};
268 # Note if the witness is actually in a lacuna
269 if( $rdg && $rdg->{'t'}->is_lacuna ) {
270 _add_to_witlist( $wit, $lacunose, $aclabel );
271 # Otherwise the witness either has a positive reading...
273 # If the reading has been counted elsewhere as a transposition, ignore it.
274 if( $transposed->{$rdg->{'t'}->id} ) {
275 # TODO Does this cope with three-way transpositions?
276 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
279 # Otherwise, record it...
280 $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
281 # ...and grab any transpositions, and their relations.
282 my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings();
283 foreach my $trdg ( @transp ) {
284 next if exists $readings_at_rank{$trdg->id};
285 $has_transposition = 1;
286 my @affected_wits = _table_witnesses(
287 $table, $trdg, \%is_lacunose, $aclabel );
288 next unless @affected_wits;
289 map { $moved_wits{$_} = 1 } @affected_wits;
290 $transposed->{$trdg->id} =
291 [ _table_witnesses( $table, $rdg->{'t'}, \%is_lacunose, $aclabel ) ];
292 $readings_at_rank{$trdg->id} = $trdg;
294 # ...or it is empty, ergo a gap.
296 _add_to_witlist( $wit, \@check_for_gaps, $aclabel );
300 map { _add_to_witlist( $_, \@gap_wits, $aclabel )
301 unless $moved_wits{$_} } @check_for_gaps;
302 # Group the readings, collapsing groups by relationship if needed
303 my $grouped_readings = {};
304 foreach my $rdg ( values %readings_at_rank ) {
305 # Skip readings that have been collapsed into others.
306 next if exists $grouped_readings->{$rdg->id}
307 && $grouped_readings->{$rdg->id} eq 'COLLAPSE';
308 # Get the witness list, including from readings collapsed into this one.
309 my @wits = _table_witnesses( $table, $rdg, \%is_lacunose, $aclabel );
310 if( $collapse && @$collapse ) {
311 my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
312 foreach my $other ( $rdg->related_readings( $filter ) ) {
313 my @otherwits = _table_witnesses(
314 $table, $other, \%is_lacunose, $aclabel );
315 push( @wits, @otherwits );
316 $grouped_readings->{$other->id} = 'COLLAPSE';
319 $grouped_readings->{$rdg->id} = \@wits;
321 $grouped_readings->{'(omitted)'} = \@gap_wits if @gap_wits;
322 # Get rid of our collapsed readings
323 map { delete $grouped_readings->{$_} if $grouped_readings->{$_} eq 'COLLAPSE' }
324 keys %$grouped_readings
327 # If something was transposed, check the groups for doubled-up readings
328 if( $has_transposition ) {
329 # print STDERR "Group for rank $rank:\n";
330 # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" }
331 # keys %$grouped_readings;
332 _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings );
336 return $grouped_readings;
339 # Helper function to query the alignment table for all witnesses (a.c. included)
340 # that have a given reading at its rank.
341 sub _table_witnesses {
342 my( $table, $trdg, $lacunose, $aclabel ) = @_;
343 my $tableidx = $trdg->rank - 1;
345 foreach my $row ( @{$table->{'alignment'}} ) {
346 my $wit = $row->{'witness'};
347 next if $lacunose->{$wit};
348 my $rdg = $row->{'tokens'}->[$tableidx];
349 next unless exists $rdg->{'t'} && defined $rdg->{'t'};
350 _add_to_witlist( $wit, \@has_reading, $aclabel )
351 if $rdg->{'t'}->id eq $trdg->id;
356 # Helper function to ensure that X and X a.c. never appear in the same list.
357 sub _add_to_witlist {
358 my( $wit, $list, $acstr ) = @_;
361 map { $inlist{$_} = $idx++ } @$list;
362 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
364 unless( exists $inlist{$acwit} ) {
365 push( @$list, $acwit.$acstr );
368 if( exists( $inlist{$wit.$acstr} ) ) {
369 # Replace the a.c. version with the main witness
370 my $i = $inlist{$wit.$acstr};
373 push( @$list, $wit );
378 sub _check_transposed_consistency {
379 my( $c, $rank, $transposed, $groupings ) = @_;
382 # Note which readings are actually at this rank, and which witnesses
383 # belong to which reading.
384 foreach my $rdg ( keys %$groupings ) {
385 my $rdgobj = $c->reading( $rdg );
386 # Count '(omitted)' as a reading at this rank
387 $thisrank{$rdg} = 1 if !$rdgobj || $rdgobj->rank == $rank;
388 map { push( @{$seen_wits{$_}}, $rdg ) } @{$groupings->{$rdg}};
390 # Our work is done if we have no witness belonging to more than one
392 my @doubled = grep { scalar @{$seen_wits{$_}} > 1 } keys %seen_wits;
393 return unless @doubled;
394 # If we have a symmetric related transposition, drop the non-rank readings.
395 if( @doubled == scalar keys %seen_wits ) {
396 foreach my $rdg ( keys %$groupings ) {
397 if( !$thisrank{$rdg} ) {
398 my $groupstr = wit_stringify( $groupings->{$rdg} );
399 my ( $matched ) = grep { $groupstr eq wit_stringify( $groupings->{$_} ) }
401 delete $groupings->{$rdg};
402 # If we found a group match, assume there is a symmetry happening.
403 # TODO think more about this
404 # print STDERR "*** Deleting symmetric reading $rdg\n";
406 delete $transposed->{$rdg};
407 warn "Found problem in evident symmetry with reading $rdg";
411 # Otherwise 'unhook' the transposed reading(s) that have duplicates.
413 foreach my $dup ( @doubled ) {
414 foreach my $rdg ( @{$seen_wits{$dup}} ) {
415 next if $thisrank{$rdg};
416 next unless exists $groupings->{$rdg};
417 # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
418 delete $groupings->{$rdg};
419 delete $transposed->{$rdg};
422 # and put any now-orphaned readings into an 'omitted' reading.
423 foreach my $wit ( keys %seen_wits ) {
424 unless( grep { exists $groupings->{$_} } @{$seen_wits{$wit}} ) {
425 $groupings->{'(omitted)'} = [] unless exists $groupings->{'(omitted)'};
426 _add_to_witlist( $wit, $groupings->{'(omitted)'}, $c->ac_label );
432 =head2 solve_variants( $graph, @groups )
434 Sends the set of groups to the external graph solver service and returns
435 a cleaned-up answer, adding the rank IDs back where they belong.
437 The JSON has the form
438 { "graph": [ stemmagraph DOT string without newlines ],
439 "groupings": [ array of arrays of groups, one per rank ] }
441 The answer has the form
442 { "variants" => [ array of variant location structures ],
443 "variant_count" => total,
444 "conflict_count" => number of conflicts detected,
445 "genealogical_count" => number of solutions found }
450 my( $stemma, @groups ) = @_;
451 my $aclabel = $stemma->collation->ac_label;
453 # Filter the groups down to distinct groups, and work out what graph
454 # should be used in the calculation of each group. We want to send each
455 # distinct problem to the solver only once.
456 # We need a whole bunch of lookup tables for this.
457 my $index_groupkeys = {}; # Save the order of readings
458 my $group_indices = {}; # Save the indices that have a given grouping
459 my $graph_problems = {}; # Save the groupings for the given graph
461 foreach my $idx ( 0..$#groups ) {
462 my $ghash = $groups[$idx];
464 # Sort the groupings from big to little, and scan for a.c. witnesses
465 # that would need an extended graph.
466 my @acwits; # note which AC witnesses crop up at this rank
467 my @idxkeys = sort { scalar @{$ghash->{$b}} <=> scalar @{$ghash->{$a}} }
469 foreach my $rdg ( @idxkeys ) {
470 my @sg = sort @{$ghash->{$rdg}};
471 push( @acwits, grep { $_ =~ /\Q$aclabel\E$/ } @sg );
472 push( @grouping, \@sg );
474 # Save the reading order
475 $index_groupkeys->{$idx} = \@idxkeys;
477 # Now associate the distinct group with this index
478 my $gstr = wit_stringify( \@grouping );
479 push( @{$group_indices->{$gstr}}, $idx );
481 # Finally, add the group to the list to be calculated for this graph.
482 map { s/\Q$aclabel\E$// } @acwits;
485 $graph = $stemma->extend_graph( \@acwits );
487 die "Unable to extend graph with @acwits";
489 unless( exists $graph_problems->{"$graph"} ) {
490 $graph_problems->{"$graph"} = { 'object' => $graph, 'groups' => [] };
492 push( @{$graph_problems->{"$graph"}->{'groups'}}, \@grouping );
495 ## For each distinct graph, send its groups to the solver.
496 my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
497 my $ua = LWP::UserAgent->new();
498 ## Witness map is a HACK to get around limitations in node names from IDP
499 my $witness_map = {};
500 ## Variables to store answers as they come back
501 my $variants = [ ( undef ) x ( scalar keys %$index_groupkeys ) ];
502 my $genealogical = 0;
503 foreach my $graphkey ( keys %$graph_problems ) {
504 my $graph = $graph_problems->{$graphkey}->{'object'};
505 my $groupings = $graph_problems->{$graphkey}->{'groups'};
506 my $json = encode_json( _safe_wit_strings( $graph, $stemma->collation,
507 $groupings, $witness_map ) );
508 # Send it off and get the result
509 #print STDERR "Sending request: $json\n";
510 my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
511 'Content' => $json );
514 if( $resp->is_success ) {
515 $answer = _desanitize_names( decode_json( $resp->content ), $witness_map );
518 # Fall back to the old method.
519 warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
520 . "; falling back to perl method";
521 $answer = perl_solver( $graph, @$groupings );
523 ## The answer is the evaluated groupings, plus a boolean for whether
524 ## they were genealogical. Reconstruct our original groups.
525 foreach my $gidx ( 0 .. $#{$groupings} ) {
526 my( $calc_groups, $result ) = @{$answer->[$gidx]};
529 # Prune the calculated groups, in case the IDP solver failed to.
532 foreach my $cg ( @$calc_groups ) {
533 # This is a little wasteful but the path of least
534 # resistance. Send both the stemma, which knows what
535 # its hypotheticals are, and the actual graph used.
536 my @pg = _prune_group( $cg, $stemma, $graph );
537 push( @pruned_groups, \@pg );
539 $calc_groups = \@pruned_groups;
542 # Retrieve the key for the original group that went to the solver
543 my $input_group = wit_stringify( $groupings->[$gidx] );
544 foreach my $oidx ( @{$group_indices->{$input_group}} ) {
545 my @readings = @{$index_groupkeys->{$oidx}};
547 'genealogical' => $result,
550 foreach my $ridx ( 0 .. $#readings ) {
551 push( @{$vstruct->{'readings'}},
552 { 'readingid' => $readings[$ridx],
553 'group' => $calc_groups->[$ridx] } );
555 $variants->[$oidx] = $vstruct;
560 return { 'variants' => $variants,
561 'variant_count' => scalar @$variants,
562 'genealogical_count' => $genealogical };
565 #### HACKERY to cope with IDP's limited idea of what a node name looks like ###
567 sub _safe_wit_strings {
568 my( $graph, $c, $groupings, $witness_map ) = @_;
569 # Parse the graph we were given into a stemma.
570 my $safegraph = Graph->new();
571 # Convert the graph to a safe representation and store the conversion.
572 foreach my $n ( $graph->vertices ) {
573 my $sn = _safe_witstr( $n );
574 if( exists $witness_map->{$sn} ) {
575 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
576 if $witness_map->{$sn} ne $n;
578 $witness_map->{$sn} = $n;
580 $safegraph->add_vertex( $sn );
581 $safegraph->set_vertex_attributes( $sn,
582 $graph->get_vertex_attributes( $n ) );
584 foreach my $e ( $graph->edges ) {
585 my @safe_e = ( _safe_witstr( $e->[0] ), _safe_witstr( $e->[1] ) );
586 $safegraph->add_edge( @safe_e );
588 my $safe_stemma = Text::Tradition::Stemma->new(
589 'collation' => $c, 'graph' => $safegraph );
591 # Now convert the witness groupings to a safe representation.
592 my $safe_groupings = [];
593 foreach my $grouping ( @$groupings ) {
594 my $safe_grouping = [];
595 foreach my $group ( @$grouping ) {
597 foreach my $n ( @$group ) {
598 my $sn = _safe_witstr( $n );
599 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
600 if exists $witness_map->{$sn} && $witness_map->{$sn} ne $n;
601 $witness_map->{$sn} = $n;
602 push( @$safe_group, $sn );
604 push( @$safe_grouping, $safe_group );
606 push( @$safe_groupings, $safe_grouping );
609 # Return it all in the struct we expect. We have stored the reductions
610 # in the $witness_map that we were passed.
611 return { 'graph' => $safe_stemma->editable( { 'linesep' => ' ' } ),
612 'groupings' => $safe_groupings };
617 $witstr =~ s/\s+/_/g;
618 $witstr =~ s/[^\w\d-]//g;
622 sub _desanitize_names {
623 my( $jsonstruct, $witness_map ) = @_;
625 foreach my $grouping ( @$jsonstruct ) {
626 my $real_grouping = [];
627 foreach my $element ( @$grouping ) {
628 if( ref( $element ) eq 'ARRAY' ) {
630 my $real_groupset = [];
631 foreach my $group ( @$element ) {
633 foreach my $n ( @$group ) {
634 my $rn = $witness_map->{$n};
635 push( @$real_group, $rn );
637 push( @$real_groupset, $real_group );
639 push( @$real_grouping, $real_groupset );
641 # It is the boolean, not actually a group.
642 push( @$real_grouping, $element );
645 push( @$result, $real_grouping );
652 =head2 analyze_location ( $tradition, $graph, $location_hash )
654 Given the tradition, its stemma graph, and the solution from the graph solver,
655 work out the rest of the information we want. For each reading we need missing,
656 conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown. Alters the location_hash in place.
660 sub analyze_location {
661 my ( $tradition, $stemma, $variant_row, $lacunose ) = @_;
662 my $c = $tradition->collation;
664 # Make a hash of all known node memberships, and make the subgraphs.
666 my $reading_roots = {};
668 my $acstr = $c->ac_label;
670 $DB::single = 1 if $variant_row->{id} == 87;
671 # Note which witnesses positively belong to which group
672 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
673 my $rid = $rdghash->{'readingid'};
674 foreach my $wit ( @{$rdghash->{'group'}} ) {
675 $contig->{$wit} = $rid;
676 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
682 # Get the actual graph we should work with
685 $graph = @acwits ? $stemma->extend_graph( \@acwits ) : $stemma->graph;
687 die "Could not extend graph with a.c. witnesses @acwits";
690 # Now, armed with that knowledge, make a subgraph for each reading
691 # and note the root(s) of each subgraph.
692 foreach my $rdghash( @{$variant_row->{'readings'}} ) {
693 my $rid = $rdghash->{'readingid'};
696 my $part = $graph->copy;
697 my @todelete = grep { exists $contig->{$_} && $contig->{$_} ne $rid }
699 $part->delete_vertices( @todelete );
700 _prune_subtree( $part, $lacunose );
701 $subgraph->{$rid} = $part;
702 # Record the remaining lacunose nodes as part of this group, if
703 # we are dealing with a non-genealogical reading.
704 unless( $variant_row->{'genealogical'} ) {
705 map { $contig->{$_} = $rid } $part->vertices;
707 # Get the reading roots.
708 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
711 # Now that we have all the node group memberships, calculate followed/
712 # non-followed/unknown values for each reading. Also figure out the
713 # reading's evident parent(s).
714 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
715 my $rid = $rdghash->{'readingid'};
717 my $part = $subgraph->{$rid};
719 # Start figuring things out.
720 my @roots = grep { $reading_roots->{$_} eq $rid } keys %$reading_roots;
721 $rdghash->{'independent_occurrence'} = \@roots;
722 $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
723 # Find the parent readings, if any, of this reading.
725 foreach my $wit ( @roots ) {
726 # Look in the main stemma to find this witness's extant or known-reading
727 # immediate ancestor(s), and look up the reading that each ancestor olds.
728 my @check = $graph->predecessors( $wit );
731 foreach my $wparent( @check ) {
732 my $preading = $contig->{$wparent};
734 $rdgparents->{$preading} = 1;
736 push( @next, $graph->predecessors( $wparent ) );
742 foreach my $p ( keys %$rdgparents ) {
743 # Resolve the relationship of the parent to the reading, and
744 # save it in our hash.
745 my $pobj = $c->reading( $p );
747 my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
749 my $rel = $c->get_relationship( $p, $rdghash->{readingid} );
751 $relation = { type => $rel->type };
752 if( $rel->has_annotation ) {
753 $relation->{'annotation'} = $rel->annotation;
757 $rdgparents->{$p} = { 'label' => $prep, 'relation' => $relation };
760 $rdghash->{'reading_parents'} = $rdgparents;
762 # Find the number of times this reading was altered, and the number of
763 # times we're not sure.
764 my( %nofollow, %unknownfollow );
765 foreach my $wit ( $part->vertices ) {
766 foreach my $wchild ( $graph->successors( $wit ) ) {
767 next if $part->has_vertex( $wchild );
768 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
769 # It definitely changed here.
770 $nofollow{$wchild} = 1;
771 } elsif( !($contig->{$wchild}) ) {
772 # The child is a hypothetical node not definitely in
773 # any group. Answer is unknown.
774 $unknownfollow{$wchild} = 1;
775 } # else it's a non-root node in a known group, and therefore
776 # is presumed to have its reading from its group, not this link.
779 $rdghash->{'not_followed'} = keys %nofollow;
780 $rdghash->{'follow_unknown'} = keys %unknownfollow;
782 # Now say whether this reading represents a conflict.
783 unless( $variant_row->{'genealogical'} ) {
784 $rdghash->{'conflict'} = @roots != 1;
790 =head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
792 ** NOTE ** This method should hopefully not be called - it is not guaranteed
793 to be correct. Serves as a backup for the real solver.
795 Runs an analysis of the given tradition, at the location given in $rank,
796 against the graph of the stemma specified in $stemma_id. The argument
797 @merge_relationship_types is an optional list of relationship types for
798 which readings so related should be treated as equivalent.
800 Returns a nested array data structure as follows:
802 [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
804 where the group list is the array of arrays passed in for each element of @groups,
805 possibly with the addition of hypothetical readings.
811 my( $graph, @groups ) = @_;
813 foreach my $g ( @groups ) {
814 push( @answer, _solve_variant_location( $graph, $g ) );
819 sub _solve_variant_location {
820 my( $graph, $groups ) = @_;
827 # Mark each ms as in its own group, first.
828 foreach my $g ( @$groups ) {
829 my $gst = wit_stringify( $g );
830 map { $contig->{$_} = $gst } @$g;
833 # Now for each unmarked node in the graph, initialize an array
834 # for possible group memberships. We will use this later to
835 # resolve potential conflicts.
836 map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
837 foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
838 my $gst = wit_stringify( $g ); # This is the group name
839 # Copy the graph, and delete all non-members from the new graph.
840 my $part = $graph->copy;
842 $part->delete_vertices(
843 grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
845 # Now look to see if our group is connected.
847 # We have to take directionality into account.
848 # How many root nodes do we have?
849 my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
850 $part->predecessorless_vertices;
851 # Assuming that @$g > 1, find the first root node that has at
852 # least one successor belonging to our group. If this reading
853 # is genealogical, there should be only one, but we will check
854 # that implicitly later.
855 foreach my $root ( @roots ) {
856 # Prune the tree to get rid of extraneous hypotheticals.
857 $root = _prune_subtree_old( $part, $root, $contig );
859 # Save this root for our group.
860 push( @group_roots, $root );
861 # Get all the successor nodes of our root.
864 # Dispense with the trivial case of one reading.
866 @group_roots = ( $wit );
867 foreach my $v ( $part->vertices ) {
868 $part->delete_vertex( $v ) unless $v eq $wit;
872 if( @group_roots > 1 ) {
873 $conflict->{$gst} = 1;
876 # Paint the 'hypotheticals' with our group.
877 foreach my $wit ( $part->vertices ) {
878 if( ref( $contig->{$wit} ) ) {
879 push( @{$contig->{$wit}}, $gst );
880 } elsif( $contig->{$wit} ne $gst ) {
881 warn "How did we get here?";
886 # Save the relevant subgraph.
887 $subgraph->{$gst} = $part;
890 # For each of our hypothetical readings, flatten its 'contig' array if
891 # the array contains zero or one group. If we have any unflattened arrays,
892 # we may need to run the resolution process. If the reading is already known
893 # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
896 foreach my $wit ( keys %$contig ) {
897 next unless ref( $contig->{$wit} );
898 if( @{$contig->{$wit}} > 1 ) {
899 if( $is_conflicted ) {
900 $contig->{$wit} = ''; # We aren't going to decide.
902 push( @resolve, $wit );
905 my $gst = pop @{$contig->{$wit}};
906 $contig->{$wit} = $gst || '';
911 my $still_contig = {};
912 foreach my $h ( @resolve ) {
913 # For each of the hypothetical readings with more than one possibility,
914 # try deleting it from each of its member subgraphs in turn, and see
915 # if that breaks the contiguous grouping.
916 # TODO This can still break in a corner case where group A can use
917 # either vertex 1 or 2, and group B can use either vertex 2 or 1.
918 # Revisit this if necessary; it could get brute-force nasty.
919 foreach my $gst ( @{$contig->{$h}} ) {
920 my $gpart = $subgraph->{$gst}->copy();
921 # If we have come this far, there is only one root and everything
922 # is reachable from it.
923 my( $root ) = $gpart->predecessorless_vertices;
925 map { $reachable->{$_} = 1 } $gpart->vertices;
927 # Try deleting the hypothetical node.
928 $gpart->delete_vertex( $h );
930 # See if we still have a single root.
931 my @roots = $gpart->predecessorless_vertices;
932 warn "This shouldn't have happened" unless @roots;
934 # $h is needed by this group.
935 if( exists( $still_contig->{$h} ) ) {
937 $conflict->{$gst} = 1;
938 $still_contig->{$h} = '';
940 $still_contig->{$h} = $gst;
944 # $h is somewhere in the middle. See if everything
945 # else can still be reached from the root.
946 my %still_reachable = ( $root => 1 );
947 map { $still_reachable{$_} = 1 }
948 $gpart->all_successors( $root );
949 foreach my $v ( keys %$reachable ) {
951 if( !$still_reachable{$v}
952 && ( $contig->{$v} eq $gst
953 || ( exists $still_contig->{$v}
954 && $still_contig->{$v} eq $gst ) ) ) {
956 if( exists $still_contig->{$h} ) {
958 $conflict->{$gst} = 1;
959 $still_contig->{$h} = '';
961 $still_contig->{$h} = $gst;
964 } # else we don't need $h in this group.
966 } # endif $h eq $root
970 # Now we have some hypothetical vertices in $still_contig that are the
971 # "real" group memberships. Replace these in $contig.
972 foreach my $v ( keys %$contig ) {
973 next unless ref $contig->{$v};
974 $contig->{$v} = $still_contig->{$v};
978 my $is_genealogical = keys %$conflict ? JSON::false : JSON::true;
979 my $variant_row = [ [], $is_genealogical ];
980 # Fill in the groupings from $contig.
981 foreach my $g ( @$groups ) {
982 my $gst = wit_stringify( $g );
983 my @realgroup = grep { $contig->{$_} eq $gst } keys %$contig;
984 push( @{$variant_row->[0]}, \@realgroup );
990 my( $group, $stemma, $graph ) = @_;
992 map { $lacunose->{$_} = 1 } $stemma->hypotheticals;
993 map { $lacunose->{$_} = 0 } @$group;
995 my $subgraph = $graph->copy;
996 map { $subgraph->delete_vertex( $_ ) unless exists $lacunose->{$_} }
998 # ...and find the root.
999 # Now prune and return the remaining vertices.
1000 _prune_subtree( $subgraph, $lacunose );
1001 return $subgraph->vertices;
1004 sub _prune_subtree {
1005 my( $tree, $lacunose ) = @_;
1007 # Delete lacunose witnesses that have no successors
1008 my @orphan_hypotheticals;
1011 die "Infinite loop on leaves" if $ctr > 100;
1012 @orphan_hypotheticals = grep { $lacunose->{$_} }
1013 $tree->successorless_vertices;
1014 $tree->delete_vertices( @orphan_hypotheticals );
1016 } while( @orphan_hypotheticals );
1018 # Delete lacunose roots that have a single successor
1022 die "Infinite loop on roots" if $ctr > 100;
1023 @redundant_root = grep { $lacunose->{$_} && $tree->successors( $_ ) == 1 }
1024 $tree->predecessorless_vertices;
1025 $tree->delete_vertices( @redundant_root );
1027 } while( @redundant_root );
1030 sub _prune_subtree_old {
1031 my( $tree, $root, $contighash ) = @_;
1032 # First, delete hypothetical leaves / orphans until there are none left.
1033 my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
1034 $tree->successorless_vertices;
1035 while( @orphan_hypotheticals ) {
1036 $tree->delete_vertices( @orphan_hypotheticals );
1037 @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
1038 $tree->successorless_vertices;
1040 # Then delete a hypothetical root with only one successor, moving the
1041 # root to the first child that has no other predecessors.
1042 while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
1043 my @nextroot = $tree->successors( $root );
1044 $tree->delete_vertex( $root );
1045 ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
1047 # The tree has been modified in place, but we need to know the new root.
1048 $root = undef unless $root && $tree->has_vertex( $root );
1051 # Add the variant, subject to a.c. representation logic.
1052 # This assumes that we will see the 'main' version before the a.c. version.
1053 sub add_variant_wit {
1054 my( $arr, $wit, $acstr ) = @_;
1056 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
1058 $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
1060 push( @$arr, $wit ) unless $skip;
1063 sub _useful_variant {
1064 my( $group_readings, $graph, $acstr ) = @_;
1066 # TODO Decide what to do with AC witnesses
1068 # Sort by group size and return
1070 my( @readings, @groups ); # The sorted groups for our answer.
1071 foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
1072 keys %$group_readings ) {
1073 push( @readings, $rdg );
1074 push( @groups, $group_readings->{$rdg} );
1075 if( @{$group_readings->{$rdg}} > 1 ) {
1078 my( $wit ) = @{$group_readings->{$rdg}};
1079 $wit =~ s/^(.*)\Q$acstr\E$/$1/;
1080 $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
1083 if( $is_useful > 1 ) {
1084 return( \@readings, \@groups );
1090 =head2 wit_stringify( $groups )
1092 Takes an array of witness groupings and produces a string like
1093 ['A','B'] / ['C','D','E'] / ['F']
1100 # If we were passed an array of witnesses instead of an array of
1101 # groupings, then "group" the witnesses first.
1102 unless( ref( $groups->[0] ) ) {
1103 my $mkgrp = [ $groups ];
1106 foreach my $g ( @$groups ) {
1107 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
1109 return join( ' / ', @gst );
1113 my( $lista, $listb ) = @_;
1116 map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
1117 map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
1118 my @set = grep { $union{$_} == 1 } keys %union;
1119 return map { $scalars{$_} } @set;
1126 This package is free software and is provided "as is" without express
1127 or implied warranty. You can redistribute it and/or modify it under
1128 the same terms as Perl itself.
1132 Tara L Andrews E<lt>aurum@cpan.orgE<gt>