1 package Text::Tradition::Analysis;
6 use Encode qw/ encode_utf8 /;
9 use JSON qw/ encode_json decode_json /;
11 use Text::LevenshteinXS qw/ distance /;
13 use Text::Tradition::Stemma;
16 use vars qw/ @EXPORT_OK /;
17 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
21 Text::Tradition::Analysis - functions for stemma analysis of a tradition
26 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
27 my $t = Text::Tradition->new(
28 'name' => 'this is a text',
30 'file' => '/path/to/tei_parallel_seg_file.xml' );
31 $t->add_stemma( 'dotfile' => $stemmafile );
33 my $variant_data = run_analysis( $tradition );
34 # Recalculate rank $n treating all orthographic variants as equivalent
35 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
39 Text::Tradition is a library for representation and analysis of collated
40 texts, particularly medieval ones. The Collation is the central feature of
41 a Tradition, where the text, its sequence of readings, and its relationships
42 between readings are actually kept.
46 =head2 run_analysis( $tradition, %opts )
48 Runs the analysis described in analyze_variant_location on every location in the
49 collation of the given tradition, with the given options. These include:
53 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
54 is 0 (i.e. the first).
56 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
58 =item * merge_types - Specify a list of relationship types, where related readings
59 should be treated as identical for the purposes of analysis.
61 =item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
68 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
70 my $datafile = 't/data/florilegium_tei_ps.xml';
71 my $tradition = Text::Tradition->new( 'input' => 'TEI',
73 'file' => $datafile );
74 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
75 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
77 my %expected_genealogical = (
108 my $data = run_analysis( $tradition );
109 my $c = $tradition->collation;
110 foreach my $row ( @{$data->{'variants'}} ) {
111 # Account for rows that used to be "not useful"
112 unless( exists $expected_genealogical{$row->{'id'}} ) {
113 $expected_genealogical{$row->{'id'}} = 1;
115 my $gen_bool = $row->{'genealogical'} ? 1 : 0;
116 is( $gen_bool, $expected_genealogical{$row->{'id'}},
117 "Got correct genealogical flag for row " . $row->{'id'} );
118 # Check that we have the right row with the right groups
119 my $rank = $row->{'id'};
120 foreach my $rdghash ( @{$row->{'readings'}} ) {
121 # Skip 'readings' that aren't really
122 next unless $c->reading( $rdghash->{'readingid'} );
124 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
125 "Got correct reading rank" );
126 # Check the witnesses
127 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
128 my @sgrp = sort @{$rdghash->{'group'}};
129 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
132 is( $data->{'variant_count'}, 58, "Got right total variant number" );
133 # TODO Make something meaningful of conflict count, maybe test other bits
140 my( $tradition, %opts ) = @_;
141 my $c = $tradition->collation;
143 my $stemma_id = $opts{'stemma_id'} || 0;
144 my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
145 my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
148 my $stemma = $tradition->stemma( $stemma_id );
150 # Figure out which witnesses we are working with - that is, the ones that
151 # appear both in the stemma and in the tradition. All others are 'lacunose'
153 my @lacunose = $stemma->hypotheticals;
154 my @tradition_wits = map { $_->sigil } $tradition->witnesses;
155 push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
157 # Find and mark 'common' ranks for exclusion, unless they were
158 # explicitly specified.
161 foreach my $rdg ( $c->common_readings ) {
162 $common_rank{$rdg->rank} = 1;
164 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
167 # Group the variants to send to the solver
172 foreach my $rank ( @ranks ) {
173 my $missing = [ @lacunose ];
174 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, \@collapse );
175 # Filter out any empty rankgroups
176 # (e.g. from the later rank for a transposition)
177 next unless keys %$rankgroup;
178 if( $opts{'exclude_type1'} ) {
179 # Check to see whether this is a "useful" group.
180 my( $rdgs, $grps ) = _useful_variant( $rankgroup,
181 $stemma->graph, $c->ac_label );
184 push( @use_ranks, $rank );
185 push( @groups, $rankgroup );
186 $lacunae{$rank} = $missing;
189 my $answer = solve_variants( $stemma, @groups );
191 # Do further analysis on the answer
192 my $conflict_count = 0;
193 my $aclabel = $c->ac_label;
194 foreach my $idx ( 0 .. $#use_ranks ) {
195 my $location = $answer->{'variants'}->[$idx];
196 # Add the rank back in
197 my $rank = $use_ranks[$idx];
198 $location->{'id'} = $rank;
199 # Note what our lacunae are
201 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
202 $location->{'missing'} = [ keys %lmiss ];
204 # Run the extra analysis we need.
205 analyze_location( $tradition, $stemma, $location, \%lmiss );
208 # Do the final post-analysis tidying up of the data.
209 foreach my $rdghash ( @{$location->{'readings'}} ) {
211 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
212 # Add the reading text back in, setting display value as needed
213 my $rdg = $c->reading( $rdghash->{'readingid'} );
215 $rdghash->{'text'} = $rdg->text .
216 ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
217 $rdghash->{'is_ungrammatical'} = $rdg->grammar_invalid;
218 $rdghash->{'is_nonsense'} = $rdg->is_nonsense;
220 # Remove lacunose witnesses from this reading's list now that the
223 map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
224 $rdghash->{'group'} = \@realgroup;
225 # Note any layered witnesses that appear in this group
226 foreach( @realgroup ) {
227 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
228 push( @layerwits, $1 );
232 $location->{'layerwits'} = \@layerwits if @layerwits;
234 $answer->{'conflict_count'} = $conflict_count;
239 =head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
241 Groups the variants at the given $rank of the collation, treating any
242 relationships in @merge_relationship_types as equivalent. $lacunose should
243 be a reference to an array, to which the sigla of lacunose witnesses at this
244 rank will be appended; $transposed should be a reference to a hash, wherein
245 the identities of transposed readings and their relatives will be stored.
247 Returns a hash $group_readings where $rdg is attested by the witnesses listed
248 in $group_readings->{$rdg}.
252 # Return group_readings, groups, lacunose
254 my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
255 my $c = $tradition->collation;
256 my $aclabel = $c->ac_label;
257 my $table = $c->alignment_table;
258 # Get the alignment table readings
259 my %readings_at_rank;
260 my %is_lacunose; # lookup table for witnesses not in stemma
261 map { $is_lacunose{$_} = 1; $is_lacunose{$_.$aclabel} = 1 } @$lacunose;
264 my $has_transposition;
265 foreach my $tablewit ( @{$table->{'alignment'}} ) {
266 my $rdg = $tablewit->{'tokens'}->[$rank-1];
267 my $wit = $tablewit->{'witness'};
268 # Exclude the witness if it is "lacunose" which if we got here
269 # means "not in the stemma".
270 next if $is_lacunose{$wit};
271 # Note if the witness is actually in a lacuna
272 if( $rdg && $rdg->{'t'}->is_lacuna ) {
273 _add_to_witlist( $wit, $lacunose, $aclabel );
274 # Otherwise the witness either has a positive reading...
276 # If the reading has been counted elsewhere as a transposition, ignore it.
277 if( $transposed->{$rdg->{'t'}->id} ) {
278 # TODO Does this cope with three-way transpositions?
279 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
282 # Otherwise, record it...
283 $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
284 # ...and grab any transpositions, and their relations.
285 my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings();
286 foreach my $trdg ( @transp ) {
287 next if exists $readings_at_rank{$trdg->id};
288 $has_transposition = 1;
289 my @affected_wits = _table_witnesses(
290 $table, $trdg, \%is_lacunose, $aclabel );
291 next unless @affected_wits;
292 map { $moved_wits{$_} = 1 } @affected_wits;
293 $transposed->{$trdg->id} =
294 [ _table_witnesses( $table, $rdg->{'t'}, \%is_lacunose, $aclabel ) ];
295 $readings_at_rank{$trdg->id} = $trdg;
297 # ...or it is empty, ergo a gap.
299 _add_to_witlist( $wit, \@check_for_gaps, $aclabel );
303 map { _add_to_witlist( $_, \@gap_wits, $aclabel )
304 unless $moved_wits{$_} } @check_for_gaps;
305 # Group the readings, collapsing groups by relationship if needed
306 my $grouped_readings = {};
307 foreach my $rdg ( values %readings_at_rank ) {
308 # Skip readings that have been collapsed into others.
309 next if exists $grouped_readings->{$rdg->id}
310 && $grouped_readings->{$rdg->id} eq 'COLLAPSE';
311 # Get the witness list, including from readings collapsed into this one.
312 my @wits = _table_witnesses( $table, $rdg, \%is_lacunose, $aclabel );
313 if( $collapse && @$collapse ) {
314 my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
315 foreach my $other ( $rdg->related_readings( $filter ) ) {
316 my @otherwits = _table_witnesses(
317 $table, $other, \%is_lacunose, $aclabel );
318 push( @wits, @otherwits );
319 $grouped_readings->{$other->id} = 'COLLAPSE';
322 $grouped_readings->{$rdg->id} = \@wits;
324 $grouped_readings->{'(omitted)'} = \@gap_wits if @gap_wits;
325 # Get rid of our collapsed readings
326 map { delete $grouped_readings->{$_} if $grouped_readings->{$_} eq 'COLLAPSE' }
327 keys %$grouped_readings
330 # If something was transposed, check the groups for doubled-up readings
331 if( $has_transposition ) {
332 # print STDERR "Group for rank $rank:\n";
333 # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" }
334 # keys %$grouped_readings;
335 _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings );
339 return $grouped_readings;
342 # Helper function to query the alignment table for all witnesses (a.c. included)
343 # that have a given reading at its rank.
344 sub _table_witnesses {
345 my( $table, $trdg, $lacunose, $aclabel ) = @_;
346 my $tableidx = $trdg->rank - 1;
348 foreach my $row ( @{$table->{'alignment'}} ) {
349 my $wit = $row->{'witness'};
350 next if $lacunose->{$wit};
351 my $rdg = $row->{'tokens'}->[$tableidx];
352 next unless exists $rdg->{'t'} && defined $rdg->{'t'};
353 _add_to_witlist( $wit, \@has_reading, $aclabel )
354 if $rdg->{'t'}->id eq $trdg->id;
359 # Helper function to ensure that X and X a.c. never appear in the same list.
360 sub _add_to_witlist {
361 my( $wit, $list, $acstr ) = @_;
364 map { $inlist{$_} = $idx++ } @$list;
365 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
367 unless( exists $inlist{$acwit} ) {
368 push( @$list, $acwit.$acstr );
371 if( exists( $inlist{$wit.$acstr} ) ) {
372 # Replace the a.c. version with the main witness
373 my $i = $inlist{$wit.$acstr};
376 push( @$list, $wit );
381 sub _check_transposed_consistency {
382 my( $c, $rank, $transposed, $groupings ) = @_;
385 # Note which readings are actually at this rank, and which witnesses
386 # belong to which reading.
387 foreach my $rdg ( keys %$groupings ) {
388 my $rdgobj = $c->reading( $rdg );
389 # Count '(omitted)' as a reading at this rank
390 $thisrank{$rdg} = 1 if !$rdgobj || $rdgobj->rank == $rank;
391 map { push( @{$seen_wits{$_}}, $rdg ) } @{$groupings->{$rdg}};
393 # Our work is done if we have no witness belonging to more than one
395 my @doubled = grep { scalar @{$seen_wits{$_}} > 1 } keys %seen_wits;
396 return unless @doubled;
397 # If we have a symmetric related transposition, drop the non-rank readings.
398 if( @doubled == scalar keys %seen_wits ) {
399 foreach my $rdg ( keys %$groupings ) {
400 if( !$thisrank{$rdg} ) {
401 my $groupstr = wit_stringify( $groupings->{$rdg} );
402 my ( $matched ) = grep { $groupstr eq wit_stringify( $groupings->{$_} ) }
404 delete $groupings->{$rdg};
405 # If we found a group match, assume there is a symmetry happening.
406 # TODO think more about this
407 # print STDERR "*** Deleting symmetric reading $rdg\n";
409 delete $transposed->{$rdg};
410 warn "Found problem in evident symmetry with reading $rdg";
414 # Otherwise 'unhook' the transposed reading(s) that have duplicates.
416 foreach my $dup ( @doubled ) {
417 foreach my $rdg ( @{$seen_wits{$dup}} ) {
418 next if $thisrank{$rdg};
419 next unless exists $groupings->{$rdg};
420 # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
421 delete $groupings->{$rdg};
422 delete $transposed->{$rdg};
425 # and put any now-orphaned readings into an 'omitted' reading.
426 foreach my $wit ( keys %seen_wits ) {
427 unless( grep { exists $groupings->{$_} } @{$seen_wits{$wit}} ) {
428 $groupings->{'(omitted)'} = [] unless exists $groupings->{'(omitted)'};
429 _add_to_witlist( $wit, $groupings->{'(omitted)'}, $c->ac_label );
435 =head2 solve_variants( $graph, @groups )
437 Sends the set of groups to the external graph solver service and returns
438 a cleaned-up answer, adding the rank IDs back where they belong.
440 The JSON has the form
441 { "graph": [ stemmagraph DOT string without newlines ],
442 "groupings": [ array of arrays of groups, one per rank ] }
444 The answer has the form
445 { "variants" => [ array of variant location structures ],
446 "variant_count" => total,
447 "conflict_count" => number of conflicts detected,
448 "genealogical_count" => number of solutions found }
453 my( $stemma, @groups ) = @_;
454 my $aclabel = $stemma->collation->ac_label;
456 # Filter the groups down to distinct groups, and work out what graph
457 # should be used in the calculation of each group. We want to send each
458 # distinct problem to the solver only once.
459 # We need a whole bunch of lookup tables for this.
460 my $index_groupkeys = {}; # Save the order of readings
461 my $group_indices = {}; # Save the indices that have a given grouping
462 my $graph_problems = {}; # Save the groupings for the given graph
464 foreach my $idx ( 0..$#groups ) {
465 my $ghash = $groups[$idx];
467 # Sort the groupings from big to little, and scan for a.c. witnesses
468 # that would need an extended graph.
469 my @acwits; # note which AC witnesses crop up at this rank
470 my @idxkeys = sort { scalar @{$ghash->{$b}} <=> scalar @{$ghash->{$a}} }
472 foreach my $rdg ( @idxkeys ) {
473 my @sg = sort @{$ghash->{$rdg}};
474 push( @acwits, grep { $_ =~ /\Q$aclabel\E$/ } @sg );
475 push( @grouping, \@sg );
477 # Save the reading order
478 $index_groupkeys->{$idx} = \@idxkeys;
480 # Now associate the distinct group with this index
481 my $gstr = wit_stringify( \@grouping );
482 push( @{$group_indices->{$gstr}}, $idx );
484 # Finally, add the group to the list to be calculated for this graph.
485 map { s/\Q$aclabel\E$// } @acwits;
488 $graph = $stemma->extend_graph( \@acwits );
490 die "Unable to extend graph with @acwits";
492 unless( exists $graph_problems->{"$graph"} ) {
493 $graph_problems->{"$graph"} = { 'object' => $graph, 'groups' => [] };
495 push( @{$graph_problems->{"$graph"}->{'groups'}}, \@grouping );
498 ## For each distinct graph, send its groups to the solver.
499 my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
500 my $ua = LWP::UserAgent->new();
501 ## Witness map is a HACK to get around limitations in node names from IDP
502 my $witness_map = {};
503 ## Variables to store answers as they come back
504 my $variants = [ ( undef ) x ( scalar keys %$index_groupkeys ) ];
505 my $genealogical = 0;
506 foreach my $graphkey ( keys %$graph_problems ) {
507 my $graph = $graph_problems->{$graphkey}->{'object'};
508 my $groupings = $graph_problems->{$graphkey}->{'groups'};
509 my $json = encode_json( _safe_wit_strings( $graph, $stemma->collation,
510 $groupings, $witness_map ) );
511 # Send it off and get the result
512 #print STDERR "Sending request: $json\n";
513 my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
514 'Content' => $json );
517 if( $resp->is_success ) {
518 $answer = _desanitize_names( decode_json( $resp->content ), $witness_map );
521 # Fall back to the old method.
522 warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
523 . "; falling back to perl method";
524 $answer = perl_solver( $graph, @$groupings );
526 ## The answer is the evaluated groupings, plus a boolean for whether
527 ## they were genealogical. Reconstruct our original groups.
528 foreach my $gidx ( 0 .. $#{$groupings} ) {
529 my( $calc_groups, $result ) = @{$answer->[$gidx]};
532 # Prune the calculated groups, in case the IDP solver failed to.
535 foreach my $cg ( @$calc_groups ) {
536 # This is a little wasteful but the path of least
537 # resistance. Send both the stemma, which knows what
538 # its hypotheticals are, and the actual graph used.
539 my @pg = _prune_group( $cg, $stemma, $graph );
540 push( @pruned_groups, \@pg );
542 $calc_groups = \@pruned_groups;
545 # Retrieve the key for the original group that went to the solver
546 my $input_group = wit_stringify( $groupings->[$gidx] );
547 foreach my $oidx ( @{$group_indices->{$input_group}} ) {
548 my @readings = @{$index_groupkeys->{$oidx}};
550 'genealogical' => $result,
553 foreach my $ridx ( 0 .. $#readings ) {
554 push( @{$vstruct->{'readings'}},
555 { 'readingid' => $readings[$ridx],
556 'group' => $calc_groups->[$ridx] } );
558 $variants->[$oidx] = $vstruct;
563 return { 'variants' => $variants,
564 'variant_count' => scalar @$variants,
565 'genealogical_count' => $genealogical };
568 #### HACKERY to cope with IDP's limited idea of what a node name looks like ###
570 sub _safe_wit_strings {
571 my( $graph, $c, $groupings, $witness_map ) = @_;
572 # Parse the graph we were given into a stemma.
573 my $safegraph = Graph->new();
574 # Convert the graph to a safe representation and store the conversion.
575 foreach my $n ( $graph->vertices ) {
576 my $sn = _safe_witstr( $n );
577 if( exists $witness_map->{$sn} ) {
578 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
579 if $witness_map->{$sn} ne $n;
581 $witness_map->{$sn} = $n;
583 $safegraph->add_vertex( $sn );
584 $safegraph->set_vertex_attributes( $sn,
585 $graph->get_vertex_attributes( $n ) );
587 foreach my $e ( $graph->edges ) {
588 my @safe_e = ( _safe_witstr( $e->[0] ), _safe_witstr( $e->[1] ) );
589 $safegraph->add_edge( @safe_e );
591 my $safe_stemma = Text::Tradition::Stemma->new(
592 'collation' => $c, 'graph' => $safegraph );
594 # Now convert the witness groupings to a safe representation.
595 my $safe_groupings = [];
596 foreach my $grouping ( @$groupings ) {
597 my $safe_grouping = [];
598 foreach my $group ( @$grouping ) {
600 foreach my $n ( @$group ) {
601 my $sn = _safe_witstr( $n );
602 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
603 if exists $witness_map->{$sn} && $witness_map->{$sn} ne $n;
604 $witness_map->{$sn} = $n;
605 push( @$safe_group, $sn );
607 push( @$safe_grouping, $safe_group );
609 push( @$safe_groupings, $safe_grouping );
612 # Return it all in the struct we expect. We have stored the reductions
613 # in the $witness_map that we were passed.
614 return { 'graph' => $safe_stemma->editable( { 'linesep' => ' ' } ),
615 'groupings' => $safe_groupings };
620 $witstr =~ s/\s+/_/g;
621 $witstr =~ s/[^\w\d-]//g;
625 sub _desanitize_names {
626 my( $jsonstruct, $witness_map ) = @_;
628 foreach my $grouping ( @$jsonstruct ) {
629 my $real_grouping = [];
630 foreach my $element ( @$grouping ) {
631 if( ref( $element ) eq 'ARRAY' ) {
633 my $real_groupset = [];
634 foreach my $group ( @$element ) {
636 foreach my $n ( @$group ) {
637 my $rn = $witness_map->{$n};
638 push( @$real_group, $rn );
640 push( @$real_groupset, $real_group );
642 push( @$real_grouping, $real_groupset );
644 # It is the boolean, not actually a group.
645 push( @$real_grouping, $element );
648 push( @$result, $real_grouping );
655 =head2 analyze_location ( $tradition, $graph, $location_hash )
657 Given the tradition, its stemma graph, and the solution from the graph solver,
658 work out the rest of the information we want. For each reading we need missing,
659 conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown. Alters the location_hash in place.
663 sub analyze_location {
664 my ( $tradition, $stemma, $variant_row, $lacunose ) = @_;
665 my $c = $tradition->collation;
667 # Make a hash of all known node memberships, and make the subgraphs.
669 my $reading_roots = {};
671 my $acstr = $c->ac_label;
673 # Note which witnesses positively belong to which group
674 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
675 my $rid = $rdghash->{'readingid'};
676 foreach my $wit ( @{$rdghash->{'group'}} ) {
677 $contig->{$wit} = $rid;
678 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
684 # Get the actual graph we should work with
687 $graph = @acwits ? $stemma->extend_graph( \@acwits ) : $stemma->graph;
689 die "Could not extend graph with a.c. witnesses @acwits";
692 # Now, armed with that knowledge, make a subgraph for each reading
693 # and note the root(s) of each subgraph.
694 foreach my $rdghash( @{$variant_row->{'readings'}} ) {
695 my $rid = $rdghash->{'readingid'};
698 my $part = $graph->copy;
699 my @todelete = grep { exists $contig->{$_} && $contig->{$_} ne $rid }
701 $part->delete_vertices( @todelete );
702 _prune_subtree( $part, $lacunose );
703 $subgraph->{$rid} = $part;
704 # Record the remaining lacunose nodes as part of this group, if
705 # we are dealing with a non-genealogical reading.
706 unless( $variant_row->{'genealogical'} ) {
707 map { $contig->{$_} = $rid } $part->vertices;
709 # Get the reading roots.
710 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
713 # Now that we have all the node group memberships, calculate followed/
714 # non-followed/unknown values for each reading. Also figure out the
715 # reading's evident parent(s).
716 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
717 my $rid = $rdghash->{'readingid'};
719 my $part = $subgraph->{$rid};
721 # Start figuring things out.
722 my @roots = grep { $reading_roots->{$_} eq $rid } keys %$reading_roots;
723 $rdghash->{'independent_occurrence'} = \@roots;
724 $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
725 # Find the parent readings, if any, of this reading.
727 foreach my $wit ( @roots ) {
728 # Look in the main stemma to find this witness's extant or known-reading
729 # immediate ancestor(s), and look up the reading that each ancestor olds.
730 my @check = $graph->predecessors( $wit );
733 foreach my $wparent( @check ) {
734 my $preading = $contig->{$wparent};
736 $rdgparents->{$preading} = 1;
738 push( @next, $graph->predecessors( $wparent ) );
744 foreach my $p ( keys %$rdgparents ) {
745 # Resolve the relationship of the parent to the reading, and
746 # save it in our hash.
747 my $pobj = $c->reading( $p );
748 my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
749 my $phash = { 'label' => $prep };
751 my $rel = $c->get_relationship( $p, $rdghash->{readingid} );
753 $phash->{relation} = { type => $rel->type };
754 if( $rel->has_annotation ) {
755 $phash->{relation}->{'annotation'} = $rel->annotation;
757 } elsif( $rdghash->{readingid} eq '(omitted)' ) {
758 $phash->{relation} = { type => 'deletion' };
759 } elsif( $rdghash->{text} ) {
760 # Check for sheer word similarity.
761 my $rtext = $rdghash->{text};
762 my $ptext = $pobj->text;
763 my $min = length( $rtext ) > length( $ptext )
764 ? length( $ptext ) : length( $rtext );
765 my $distance = distance( $rtext, $ptext );
766 if( $distance < $min ) {
767 $phash->{relation} = { type => 'wordsimilar' };
770 # Get the attributes of the parent object while we are here
771 $phash->{'text'} = $pobj->text if $pobj;
772 $phash->{'is_nonsense'} = $pobj->is_nonsense;
773 $phash->{'is_ungrammatical'} = $pobj->grammar_invalid;
774 } elsif( $p eq '(omitted)' ) {
775 $phash->{relation} = { type => 'addition' };
778 $rdgparents->{$p} = $phash;
781 $rdghash->{'reading_parents'} = $rdgparents;
783 # Find the number of times this reading was altered, and the number of
784 # times we're not sure.
785 my( %nofollow, %unknownfollow );
786 foreach my $wit ( $part->vertices ) {
787 foreach my $wchild ( $graph->successors( $wit ) ) {
788 next if $part->has_vertex( $wchild );
789 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
790 # It definitely changed here.
791 $nofollow{$wchild} = 1;
792 } elsif( !($contig->{$wchild}) ) {
793 # The child is a hypothetical node not definitely in
794 # any group. Answer is unknown.
795 $unknownfollow{$wchild} = 1;
796 } # else it's a non-root node in a known group, and therefore
797 # is presumed to have its reading from its group, not this link.
800 $rdghash->{'not_followed'} = keys %nofollow;
801 $rdghash->{'follow_unknown'} = keys %unknownfollow;
803 # Now say whether this reading represents a conflict.
804 unless( $variant_row->{'genealogical'} ) {
805 $rdghash->{'conflict'} = @roots != 1;
811 =head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
813 ** NOTE ** This method should hopefully not be called - it is not guaranteed
814 to be correct. Serves as a backup for the real solver.
816 Runs an analysis of the given tradition, at the location given in $rank,
817 against the graph of the stemma specified in $stemma_id. The argument
818 @merge_relationship_types is an optional list of relationship types for
819 which readings so related should be treated as equivalent.
821 Returns a nested array data structure as follows:
823 [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
825 where the group list is the array of arrays passed in for each element of @groups,
826 possibly with the addition of hypothetical readings.
832 my( $graph, @groups ) = @_;
834 foreach my $g ( @groups ) {
835 push( @answer, _solve_variant_location( $graph, $g ) );
840 sub _solve_variant_location {
841 my( $graph, $groups ) = @_;
848 # Mark each ms as in its own group, first.
849 foreach my $g ( @$groups ) {
850 my $gst = wit_stringify( $g );
851 map { $contig->{$_} = $gst } @$g;
854 # Now for each unmarked node in the graph, initialize an array
855 # for possible group memberships. We will use this later to
856 # resolve potential conflicts.
857 map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
858 foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
859 my $gst = wit_stringify( $g ); # This is the group name
860 # Copy the graph, and delete all non-members from the new graph.
861 my $part = $graph->copy;
863 $part->delete_vertices(
864 grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
866 # Now look to see if our group is connected.
868 # We have to take directionality into account.
869 # How many root nodes do we have?
870 my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
871 $part->predecessorless_vertices;
872 # Assuming that @$g > 1, find the first root node that has at
873 # least one successor belonging to our group. If this reading
874 # is genealogical, there should be only one, but we will check
875 # that implicitly later.
876 foreach my $root ( @roots ) {
877 # Prune the tree to get rid of extraneous hypotheticals.
878 $root = _prune_subtree_old( $part, $root, $contig );
880 # Save this root for our group.
881 push( @group_roots, $root );
882 # Get all the successor nodes of our root.
885 # Dispense with the trivial case of one reading.
887 @group_roots = ( $wit );
888 foreach my $v ( $part->vertices ) {
889 $part->delete_vertex( $v ) unless $v eq $wit;
893 if( @group_roots > 1 ) {
894 $conflict->{$gst} = 1;
897 # Paint the 'hypotheticals' with our group.
898 foreach my $wit ( $part->vertices ) {
899 if( ref( $contig->{$wit} ) ) {
900 push( @{$contig->{$wit}}, $gst );
901 } elsif( $contig->{$wit} ne $gst ) {
902 warn "How did we get here?";
907 # Save the relevant subgraph.
908 $subgraph->{$gst} = $part;
911 # For each of our hypothetical readings, flatten its 'contig' array if
912 # the array contains zero or one group. If we have any unflattened arrays,
913 # we may need to run the resolution process. If the reading is already known
914 # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
917 foreach my $wit ( keys %$contig ) {
918 next unless ref( $contig->{$wit} );
919 if( @{$contig->{$wit}} > 1 ) {
920 if( $is_conflicted ) {
921 $contig->{$wit} = ''; # We aren't going to decide.
923 push( @resolve, $wit );
926 my $gst = pop @{$contig->{$wit}};
927 $contig->{$wit} = $gst || '';
932 my $still_contig = {};
933 foreach my $h ( @resolve ) {
934 # For each of the hypothetical readings with more than one possibility,
935 # try deleting it from each of its member subgraphs in turn, and see
936 # if that breaks the contiguous grouping.
937 # TODO This can still break in a corner case where group A can use
938 # either vertex 1 or 2, and group B can use either vertex 2 or 1.
939 # Revisit this if necessary; it could get brute-force nasty.
940 foreach my $gst ( @{$contig->{$h}} ) {
941 my $gpart = $subgraph->{$gst}->copy();
942 # If we have come this far, there is only one root and everything
943 # is reachable from it.
944 my( $root ) = $gpart->predecessorless_vertices;
946 map { $reachable->{$_} = 1 } $gpart->vertices;
948 # Try deleting the hypothetical node.
949 $gpart->delete_vertex( $h );
951 # See if we still have a single root.
952 my @roots = $gpart->predecessorless_vertices;
953 warn "This shouldn't have happened" unless @roots;
955 # $h is needed by this group.
956 if( exists( $still_contig->{$h} ) ) {
958 $conflict->{$gst} = 1;
959 $still_contig->{$h} = '';
961 $still_contig->{$h} = $gst;
965 # $h is somewhere in the middle. See if everything
966 # else can still be reached from the root.
967 my %still_reachable = ( $root => 1 );
968 map { $still_reachable{$_} = 1 }
969 $gpart->all_successors( $root );
970 foreach my $v ( keys %$reachable ) {
972 if( !$still_reachable{$v}
973 && ( $contig->{$v} eq $gst
974 || ( exists $still_contig->{$v}
975 && $still_contig->{$v} eq $gst ) ) ) {
977 if( exists $still_contig->{$h} ) {
979 $conflict->{$gst} = 1;
980 $still_contig->{$h} = '';
982 $still_contig->{$h} = $gst;
985 } # else we don't need $h in this group.
987 } # endif $h eq $root
991 # Now we have some hypothetical vertices in $still_contig that are the
992 # "real" group memberships. Replace these in $contig.
993 foreach my $v ( keys %$contig ) {
994 next unless ref $contig->{$v};
995 $contig->{$v} = $still_contig->{$v};
999 my $is_genealogical = keys %$conflict ? JSON::false : JSON::true;
1000 my $variant_row = [ [], $is_genealogical ];
1001 # Fill in the groupings from $contig.
1002 foreach my $g ( @$groups ) {
1003 my $gst = wit_stringify( $g );
1004 my @realgroup = grep { $contig->{$_} eq $gst } keys %$contig;
1005 push( @{$variant_row->[0]}, \@realgroup );
1007 return $variant_row;
1011 my( $group, $stemma, $graph ) = @_;
1013 map { $lacunose->{$_} = 1 } $stemma->hypotheticals;
1014 map { $lacunose->{$_} = 0 } @$group;
1016 my $subgraph = $graph->copy;
1017 map { $subgraph->delete_vertex( $_ ) unless exists $lacunose->{$_} }
1018 $subgraph->vertices;
1019 # ...and find the root.
1020 # Now prune and return the remaining vertices.
1021 _prune_subtree( $subgraph, $lacunose );
1022 return $subgraph->vertices;
1025 sub _prune_subtree {
1026 my( $tree, $lacunose ) = @_;
1028 # Delete lacunose witnesses that have no successors
1029 my @orphan_hypotheticals;
1032 die "Infinite loop on leaves" if $ctr > 100;
1033 @orphan_hypotheticals = grep { $lacunose->{$_} }
1034 $tree->successorless_vertices;
1035 $tree->delete_vertices( @orphan_hypotheticals );
1037 } while( @orphan_hypotheticals );
1039 # Delete lacunose roots that have a single successor
1043 die "Infinite loop on roots" if $ctr > 100;
1044 @redundant_root = grep { $lacunose->{$_} && $tree->successors( $_ ) == 1 }
1045 $tree->predecessorless_vertices;
1046 $tree->delete_vertices( @redundant_root );
1048 } while( @redundant_root );
1051 sub _prune_subtree_old {
1052 my( $tree, $root, $contighash ) = @_;
1053 # First, delete hypothetical leaves / orphans until there are none left.
1054 my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
1055 $tree->successorless_vertices;
1056 while( @orphan_hypotheticals ) {
1057 $tree->delete_vertices( @orphan_hypotheticals );
1058 @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
1059 $tree->successorless_vertices;
1061 # Then delete a hypothetical root with only one successor, moving the
1062 # root to the first child that has no other predecessors.
1063 while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
1064 my @nextroot = $tree->successors( $root );
1065 $tree->delete_vertex( $root );
1066 ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
1068 # The tree has been modified in place, but we need to know the new root.
1069 $root = undef unless $root && $tree->has_vertex( $root );
1072 # Add the variant, subject to a.c. representation logic.
1073 # This assumes that we will see the 'main' version before the a.c. version.
1074 sub add_variant_wit {
1075 my( $arr, $wit, $acstr ) = @_;
1077 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
1079 $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
1081 push( @$arr, $wit ) unless $skip;
1084 sub _useful_variant {
1085 my( $group_readings, $graph, $acstr ) = @_;
1087 # TODO Decide what to do with AC witnesses
1089 # Sort by group size and return
1091 my( @readings, @groups ); # The sorted groups for our answer.
1092 foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
1093 keys %$group_readings ) {
1094 push( @readings, $rdg );
1095 push( @groups, $group_readings->{$rdg} );
1096 if( @{$group_readings->{$rdg}} > 1 ) {
1099 my( $wit ) = @{$group_readings->{$rdg}};
1100 $wit =~ s/^(.*)\Q$acstr\E$/$1/;
1101 $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
1104 if( $is_useful > 1 ) {
1105 return( \@readings, \@groups );
1111 =head2 wit_stringify( $groups )
1113 Takes an array of witness groupings and produces a string like
1114 ['A','B'] / ['C','D','E'] / ['F']
1121 # If we were passed an array of witnesses instead of an array of
1122 # groupings, then "group" the witnesses first.
1123 unless( ref( $groups->[0] ) ) {
1124 my $mkgrp = [ $groups ];
1127 foreach my $g ( @$groups ) {
1128 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
1130 return join( ' / ', @gst );
1134 my( $lista, $listb ) = @_;
1137 map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
1138 map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
1139 my @set = grep { $union{$_} == 1 } keys %union;
1140 return map { $scalars{$_} } @set;
1147 This package is free software and is provided "as is" without express
1148 or implied warranty. You can redistribute it and/or modify it under
1149 the same terms as Perl itself.
1153 Tara L Andrews E<lt>aurum@cpan.orgE<gt>