1 package Text::Tradition::Analysis;
5 use Algorithm::Diff; # for word similarity measure
7 use Digest::MD5 qw/ md5_hex /;
8 use Encode qw/ encode_utf8 /;
11 use JSON qw/ to_json /;
14 use Text::Tradition::Analysis::Result;
15 use Text::Tradition::Directory;
16 use Text::Tradition::Stemma;
19 use vars qw/ @EXPORT_OK /;
20 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
22 my $unsolved_problems = {};
26 Text::Tradition::Analysis - functions for stemma analysis of a tradition
31 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
32 my $t = Text::Tradition->new(
33 'name' => 'this is a text',
35 'file' => '/path/to/tei_parallel_seg_file.xml' );
36 $t->add_stemma( 'dotfile' => $stemmafile );
38 my $variant_data = run_analysis( $tradition );
39 # Recalculate rank $n treating all orthographic variants as equivalent
40 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
44 Text::Tradition is a library for representation and analysis of collated
45 texts, particularly medieval ones. The Collation is the central feature of
46 a Tradition, where the text, its sequence of readings, and its relationships
47 between readings are actually kept.
51 =head2 run_analysis( $tradition, %opts )
53 Runs the analysis described in analyze_variant_location on every location in the
54 collation of the given tradition, with the given options. These include:
58 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
59 is 0 (i.e. the first).
61 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
63 =item * merge_types - Specify a list of relationship types, where related readings
64 should be treated as identical for the purposes of analysis.
66 =item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
73 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
75 my $datafile = 't/data/florilegium_tei_ps.xml';
76 my $tradition = Text::Tradition->new( 'input' => 'TEI',
78 'file' => $datafile );
79 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
80 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
82 my %expected_genealogical = (
113 my $data = run_analysis( $tradition, calcdsn => 'dbi:SQLite:dbname=t/data/analysis.db' );
114 my $c = $tradition->collation;
115 foreach my $row ( @{$data->{'variants'}} ) {
116 # Account for rows that used to be "not useful"
117 unless( exists $expected_genealogical{$row->{'id'}} ) {
118 $expected_genealogical{$row->{'id'}} = 1;
120 my $gen_bool = $row->{'genealogical'} ? 1 : 0;
121 is( $gen_bool, $expected_genealogical{$row->{'id'}},
122 "Got correct genealogical flag for row " . $row->{'id'} );
123 # Check that we have the right row with the right groups
124 my $rank = $row->{'id'};
125 foreach my $rdghash ( @{$row->{'readings'}} ) {
126 # Skip 'readings' that aren't really
127 next unless $c->reading( $rdghash->{'readingid'} );
129 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
130 "Got correct reading rank" );
131 # Check the witnesses
132 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
133 my @sgrp = sort @{$rdghash->{'group'}};
134 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
137 is( $data->{'variant_count'}, 58, "Got right total variant number" );
138 # TODO Make something meaningful of conflict count, maybe test other bits
145 my( $tradition, %opts ) = @_;
146 my $c = $tradition->collation;
147 my $aclabel = $c->ac_label;
149 my $stemma_id = $opts{'stemma_id'} || 0;
150 my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
151 my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
153 # Make sure we have a lookup DB for graph calculation results; this will die
154 # if calcdir or calcdsn isn't passed.
155 my $dir = $opts{'calcdir'} ? delete $opts{'calcdir'}
156 : Text::Tradition::Directory->new( dsn => $opts{'calcdsn'} );
159 my $stemma = $tradition->stemma( $stemma_id );
161 # Figure out which witnesses we are working with - that is, the ones that
162 # appear both in the stemma and in the tradition. All others are 'lacunose'
164 my $lacunose = Set::Scalar->new( $stemma->hypotheticals );
165 my $stemma_wits = Set::Scalar->new( $stemma->witnesses );
166 my $tradition_wits = Set::Scalar->new( map { $_->sigil } $tradition->witnesses );
167 $lacunose->insert( $stemma_wits->symmetric_difference( $tradition_wits )->members );
169 # Find and mark 'common' ranks for exclusion, unless they were
170 # explicitly specified.
173 foreach my $rdg ( $c->common_readings ) {
174 $common_rank{$rdg->rank} = 1;
176 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
179 # Group the variants to send to the solver
184 foreach my $rank ( @ranks ) {
185 my $missing = $lacunose->clone();
186 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, \@collapse );
187 # Filter out any empty rankgroups
188 # (e.g. from the later rank for a transposition)
189 next unless keys %$rankgroup;
190 # Get the graph for this rankgroup
191 my $rankgraph = _graph_for_grouping( $stemma, $rankgroup, $missing, $aclabel );
192 if( $opts{'exclude_type1'} ) {
193 # Check to see whether this is a "useful" group.
194 next unless _useful_variant( $rankgroup, $rankgraph, $aclabel );
196 push( @use_ranks, $rank );
197 push( @groups, { grouping => $rankgroup, graph => $rankgraph } );
198 $lacunae{$rank} = $missing;
201 my $answer = solve_variants( $dir, @groups );
203 # Do further analysis on the answer
204 my $conflict_count = 0;
205 my $reversion_count = 0;
206 foreach my $idx ( 0 .. $#use_ranks ) {
207 my $location = $answer->{'variants'}->[$idx];
208 # Add the rank back in
209 my $rank = $use_ranks[$idx];
210 $location->{'id'} = $rank;
211 # Note what our lacunae are
213 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
214 $location->{'missing'} = [ keys %lmiss ];
216 # Run the extra analysis we need.
217 ## TODO We run through all the variants in this call, so
218 ## why not add the reading data there instead of here below?
219 my $graph = $groups[$idx]->{graph};
220 analyze_location( $tradition, $graph, $location, \%lmiss );
223 # Do the final post-analysis tidying up of the data.
224 foreach my $rdghash ( @{$location->{'readings'}} ) {
226 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
227 $reversion_count++ if $rdghash->{'reverted'};
228 # Add the reading text back in, setting display value as needed
229 my $rdg = $c->reading( $rdghash->{'readingid'} );
231 $rdghash->{'text'} = $rdg->text .
232 ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
233 $rdghash->{'is_ungrammatical'} = $rdg->grammar_invalid;
234 $rdghash->{'is_nonsense'} = $rdg->is_nonsense;
236 # Remove lacunose witnesses from this reading's list now that the
239 map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
240 $rdghash->{'group'} = \@realgroup;
241 # Note any layered witnesses that appear in this group
242 foreach( @realgroup ) {
243 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
244 push( @layerwits, $1 );
248 $location->{'layerwits'} = \@layerwits if @layerwits;
250 $answer->{'conflict_count'} = $conflict_count;
251 $answer->{'reversion_count'} = $reversion_count;
256 =head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
258 Groups the variants at the given $rank of the collation, treating any
259 relationships in @merge_relationship_types as equivalent. $lacunose should
260 be a reference to an array, to which the sigla of lacunose witnesses at this
261 rank will be appended; $transposed should be a reference to a hash, wherein
262 the identities of transposed readings and their relatives will be stored.
264 Returns a hash $group_readings where $rdg is attested by the witnesses listed
265 in $group_readings->{$rdg}.
269 # Return group_readings, groups, lacunose
271 my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
272 my $c = $tradition->collation;
273 my $aclabel = $c->ac_label;
274 my $table = $c->alignment_table;
275 # Get the alignment table readings
276 my %readings_at_rank;
277 my $check_for_gaps = Set::Scalar->new();
279 my $has_transposition;
280 foreach my $tablewit ( @{$table->{'alignment'}} ) {
281 my $rdg = $tablewit->{'tokens'}->[$rank-1];
282 my $wit = $tablewit->{'witness'};
283 # Exclude the witness if it is "lacunose" which if we got here
284 # means "not in the stemma".
285 next if _is_lacunose( $wit, $lacunose, $aclabel );
286 # Note if the witness is actually in a lacuna
287 if( $rdg && $rdg->{'t'}->is_lacuna ) {
288 _add_to_witlist( $wit, $lacunose, $aclabel );
289 # Otherwise the witness either has a positive reading...
291 # If the reading has been counted elsewhere as a transposition, ignore it.
292 if( $transposed->{$rdg->{'t'}->id} ) {
293 # TODO Does this cope with three-way transpositions?
294 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
297 # Otherwise, record it...
298 $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
299 # ...and grab any transpositions, and their relations.
300 my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings();
301 foreach my $trdg ( @transp ) {
302 next if exists $readings_at_rank{$trdg->id};
303 $has_transposition = 1;
304 my @affected_wits = _table_witnesses(
305 $table, $trdg, $lacunose, $aclabel );
306 next unless @affected_wits;
307 map { $moved_wits{$_} = 1 } @affected_wits;
308 $transposed->{$trdg->id} =
309 [ _table_witnesses( $table, $rdg->{'t'}, $lacunose, $aclabel ) ];
310 $readings_at_rank{$trdg->id} = $trdg;
312 # ...or it is empty, ergo a gap.
314 _add_to_witlist( $wit, $check_for_gaps, $aclabel );
317 my $gap_wits = Set::Scalar->new();
318 map { _add_to_witlist( $_, $gap_wits, $aclabel )
319 unless $moved_wits{$_} } $check_for_gaps->members;
321 # Group the readings, collapsing groups by relationship if needed.
322 my $grouped_readings = {};
323 foreach my $rdg ( values %readings_at_rank ) {
324 # Skip readings that have been collapsed into others.
325 next if exists $grouped_readings->{$rdg->id}
326 && $grouped_readings->{$rdg->id} eq 'COLLAPSE';
327 # Get the witness list, including from readings collapsed into this one.
328 my @wits = _table_witnesses( $table, $rdg, $lacunose, $aclabel );
329 if( $collapse && @$collapse ) {
330 my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
331 foreach my $other ( $rdg->related_readings( $filter ) ) {
332 my @otherwits = _table_witnesses( $table, $other, $lacunose, $aclabel );
333 push( @wits, @otherwits );
334 $grouped_readings->{$other->id} = 'COLLAPSE';
337 $grouped_readings->{$rdg->id} = Set::Scalar->new( @wits );
339 if( $gap_wits->members ) {
340 $grouped_readings->{'(omitted)'} = $gap_wits;
343 # Get rid of our collapsed readings
344 map { delete $grouped_readings->{$_} if(
345 $grouped_readings->{$_} eq 'COLLAPSE'
346 || $grouped_readings->{$_}->is_empty ) }
347 keys %$grouped_readings;
349 # If something was transposed, check the groups for doubled-up readings
350 if( $has_transposition ) {
351 # print STDERR "Group for rank $rank:\n";
352 # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" }
353 # keys %$grouped_readings;
354 _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings );
358 return $grouped_readings;
361 # Helper function to query the alignment table for all witnesses (a.c. included)
362 # that have a given reading at its rank.
363 sub _table_witnesses {
364 my( $table, $trdg, $lacunose, $aclabel ) = @_;
365 my $tableidx = $trdg->rank - 1;
366 my $has_reading = Set::Scalar->new();
367 foreach my $row ( @{$table->{'alignment'}} ) {
368 my $wit = $row->{'witness'};
369 next if _is_lacunose( $wit, $lacunose, $aclabel );
370 my $rdg = $row->{'tokens'}->[$tableidx];
371 next unless exists $rdg->{'t'} && defined $rdg->{'t'};
372 _add_to_witlist( $wit, $has_reading, $aclabel )
373 if $rdg->{'t'}->id eq $trdg->id;
375 return $has_reading->members;
378 # Helper function to see if a witness is lacunose even if we are asking about
381 my ( $wit, $lac, $acstr ) = @_;
382 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
385 return $lac->has( $wit );
388 # Helper function to ensure that X and X a.c. never appear in the same list.
389 sub _add_to_witlist {
390 my( $wit, $list, $acstr ) = @_;
391 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
392 # Don't add X a.c. if we already have X
393 return if $list->has( $1 );
395 # Delete X a.c. if we are about to add X
396 $list->delete( $wit.$acstr );
398 $list->insert( $wit );
401 sub _check_transposed_consistency {
402 my( $c, $rank, $transposed, $groupings ) = @_;
405 # Note which readings are actually at this rank, and which witnesses
406 # belong to which reading.
407 foreach my $rdg ( keys %$groupings ) {
408 my $rdgobj = $c->reading( $rdg );
409 # Count '(omitted)' as a reading at this rank
410 $thisrank{$rdg} = 1 if !$rdgobj || $rdgobj->rank == $rank;
411 map { push( @{$seen_wits{$_}}, $rdg ) } @{$groupings->{$rdg}};
413 # Our work is done if we have no witness belonging to more than one
415 my @doubled = grep { scalar @{$seen_wits{$_}} > 1 } keys %seen_wits;
416 return unless @doubled;
417 # If we have a symmetric related transposition, drop the non-rank readings.
418 if( @doubled == scalar keys %seen_wits ) {
419 foreach my $rdg ( keys %$groupings ) {
420 if( !$thisrank{$rdg} ) {
421 my $groupstr = wit_stringify( $groupings->{$rdg} );
422 my ( $matched ) = grep { $groupstr eq wit_stringify( $groupings->{$_} ) }
424 delete $groupings->{$rdg};
425 # If we found a group match, assume there is a symmetry happening.
426 # TODO think more about this
427 # print STDERR "*** Deleting symmetric reading $rdg\n";
429 delete $transposed->{$rdg};
430 warn "Found problem in evident symmetry with reading $rdg";
434 # Otherwise 'unhook' the transposed reading(s) that have duplicates.
436 foreach my $dup ( @doubled ) {
437 foreach my $rdg ( @{$seen_wits{$dup}} ) {
438 next if $thisrank{$rdg};
439 next unless exists $groupings->{$rdg};
440 # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
441 delete $groupings->{$rdg};
442 delete $transposed->{$rdg};
445 # and put any now-orphaned readings into an 'omitted' reading.
446 foreach my $wit ( keys %seen_wits ) {
447 unless( grep { exists $groupings->{$_} } @{$seen_wits{$wit}} ) {
448 $groupings->{'(omitted)'} = Set::Scalar->new()
449 unless exists $groupings->{'(omitted)'};
450 _add_to_witlist( $wit, $groupings->{'(omitted)'}, $c->ac_label );
456 # For the given grouping, return its situation graph based on the stemma.
457 sub _graph_for_grouping {
458 my( $stemma, $grouping, $lacunose, $aclabel ) = @_;
461 foreach my $gs ( values %$grouping ) {
463 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
464 push( @$acwits, $1 ) unless $lacunose->has( $1 );
466 $extant->{$_} = 1 unless $lacunose->has( $_ );
472 # contig contains all extant wits and all hypothetical wits
473 # needed to make up the groups.
474 $graph = $stemma->situation_graph( $extant, $acwits );
475 } catch ( Text::Tradition::Error $e ) {
476 die "Could not extend graph with given extant and a.c. witnesses: "
479 die "Could not extend graph with a.c. witnesses @$acwits";
484 =head2 solve_variants( $calcdir, @groups )
486 Looks up the set of groups in the answers provided by the external graph solver
487 service and returns a cleaned-up answer, adding the rank IDs back where they belong.
489 The JSON has the form
490 { "graph": [ stemmagraph DOT string without newlines ],
491 "groupings": [ array of arrays of groups, one per rank ] }
493 The answer has the form
494 { "variants" => [ array of variant location structures ],
495 "variant_count" => total,
496 "conflict_count" => number of conflicts detected,
497 "genealogical_count" => number of solutions found }
502 my( $dir, @groups ) = @_;
504 ## For each graph/group combo, look it up in the DB.
505 ## Witness map is a HACK to get around limitations in node names from IDP
506 my $witness_map = {};
507 ## Variables to store answers as they come back
509 my $genealogical = 0; # counter
510 foreach my $graphproblem ( @groups ) {
511 # Initialize the result structure for this graph problem
512 my $vstruct = { readings => [] };
513 push( @$variants, $vstruct );
515 # Construct the calc result key and look up its answer
516 my $reqkey = _get_calc_key( $graphproblem );
517 my $scope = $dir->new_scope;
518 my $answer = $dir->lookup( $reqkey );
520 #warn "No answer found for graph problem $reqkey, moving on";
521 # Record the unsolved problem so that we can go get a solution
522 _save_problem( $graphproblem );
523 # Put just the request, with no real result, into vstruct
524 foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
525 push( @{$vstruct->{readings}}, { readingid => $rid,
526 group => [ $graphproblem->{grouping}->{$rid}->members ] } );
531 ## The answer is a Text::Tradition::Analysis::Result containing a bunch
532 ## of information about this variant set. Record the information therein.
534 # 1. Did the group evaluate as genealogical?
535 $vstruct->{genealogical} = $answer->is_genealogical;
536 $genealogical++ if $answer->is_genealogical;
538 # 2. What are the calculated minimum groupings for each variant loc?
539 foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
540 my $inputset = $graphproblem->{grouping}->{$rid};
541 my $minset = $answer->minimum_grouping_for( $inputset );
542 push( @{$vstruct->{readings}}, { readingid => $rid, group => $minset } );
545 # 3. What are the sources and classes calculated for each witness?
546 $vstruct->{witcopy_types} = { $answer->classes };
547 $vstruct->{reading_roots} = {};
548 map { $vstruct->{reading_roots}->{$_} = 1 } $answer->sources;
552 # Spit out any unsolved problems we encountered
555 return { 'variants' => $variants,
556 'variant_count' => scalar @$variants,
557 'genealogical_count' => $genealogical };
561 my( $graphproblem ) = @_;
562 my $graph = $graphproblem->{graph};
563 my $grouping = [ values %{$graphproblem->{grouping}} ];
564 my $key = Text::Tradition::Analysis::Result::string_from_graph_problem(
566 return md5_hex( encode_utf8( $key ) );
570 my( $graphproblem ) = @_;
571 my $graphstr = Text::Tradition::Stemma::editable_graph(
572 $graphproblem->{graph}, { 'linesep' => ' ' } );
573 unless( exists $unsolved_problems->{$graphstr} ) {
574 $unsolved_problems->{$graphstr} = {};
577 foreach my $set ( sort { Text::Tradition::Analysis::Result::by_size_and_alpha( $a, $b ) } values %{$graphproblem->{grouping}} ) {
578 push( @$grouping, [ sort $set->members ] );
580 $unsolved_problems->{$graphstr}->{wit_stringify( $grouping )} = $grouping;
584 #say STDERR "Problems needing a solution:";
585 foreach my $graphstr ( keys %$unsolved_problems ) {
586 my $struct = { graph => $graphstr, groupings => [] };
587 foreach my $gp ( values %{$unsolved_problems->{$graphstr}} ) {
588 push( @{$struct->{groupings}}, $gp );
590 my $json = to_json( $struct );
595 =head2 analyze_location ( $tradition, $graph, $location_hash )
597 Given the tradition, its stemma graph, and the solution from the graph solver,
598 work out the rest of the information we want. For each reading we need missing,
599 conflict, reading_parents, independent_occurrence, followed, not_followed,
600 and follow_unknown. Alters the location_hash in place.
604 sub analyze_location {
605 my ( $tradition, $graph, $variant_row, $lacunose ) = @_;
606 my $c = $tradition->collation;
608 # Make a hash of all known node memberships, and make the subgraphs.
610 my $reading_roots = {};
612 my $acstr = $c->ac_label;
616 if( exists $variant_row->{'reading_roots'} ) {
617 $reading_roots = delete $variant_row->{'reading_roots'};
619 warn "No reading source information from IDP - proceed at your own risk";
622 my $classinfo = delete $variant_row->{'witcopy_types'};
624 # Note which witnesses positively belong to which group. This information
625 # comes ultimately from the IDP solver.
626 # Also make a note of the reading's roots.
627 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
628 my $rid = $rdghash->{'readingid'};
630 foreach my $wit ( @{$rdghash->{'group'}} ) {
631 $contig->{$wit} = $rid;
632 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
635 if( exists $reading_roots->{$wit} && $reading_roots->{$wit} ) {
636 push( @roots, $wit );
639 $rdghash->{'independent_occurrence'} = \@roots;
642 # Now that we have all the node group memberships, calculate followed/
643 # non-followed/unknown values for each reading. Also figure out the
644 # reading's evident parent(s).
645 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
646 my $rid = $rdghash->{'readingid'};
647 my $rdg = $c->reading( $rid );
648 my @roots = @{$rdghash->{'independent_occurrence'}};
651 @reversions = grep { $classinfo->{$_} eq 'revert' }
652 $rdghash->{'group'}->members;
654 my @group = @{$rdghash->{'group'}};
656 # Start figuring things out.
657 $rdghash->{'followed'} = scalar( @group )
658 - ( scalar( @roots ) + scalar( @reversions ) );
659 # Find the parent readings, if any, of this reading.
660 my $sourceparents = _find_reading_parents( $rid, $graph, $contig, @roots );
661 my $revertparents = _find_reading_parents( $rid, $graph, $contig, @reversions );
662 # Work out relationships between readings and their non-followed parent.
663 _resolve_parent_relationships( $c, $rid, $rdg, $sourceparents );
664 _resolve_parent_relationships( $c, $rid, $rdg, $revertparents );
666 $rdghash->{'reading_parents'} = $sourceparents;
667 $rdghash->{'reversion_parents'} = $revertparents;
669 # Find the number of times this reading was altered, and the number of
670 # times we're not sure.
671 my( %nofollow, %unknownfollow );
672 foreach my $wit ( @{$rdghash->{'group'}} ) {
673 foreach my $wchild ( $graph->successors( $wit ) ) {
674 if( $reading_roots->{$wchild} && $contig->{$wchild}
675 && $contig->{$wchild} ne $rid ) {
676 # It definitely changed here.
677 $nofollow{$wchild} = 1;
678 } elsif( !($contig->{$wchild}) ) {
679 # The child is a hypothetical node not definitely in
680 # any group. Answer is unknown.
681 $unknownfollow{$wchild} = 1;
682 } # else it is either in our group, or it is a non-root node in a
683 # known group and therefore is presumed to have its reading from
684 # its group, not this link.
687 $rdghash->{'not_followed'} = keys %nofollow;
688 $rdghash->{'follow_unknown'} = keys %unknownfollow;
690 # Now say whether this reading represents a conflict.
691 unless( $variant_row->{'genealogical'} ) {
694 # We have tested for reversions. Use the information.
696 $rdghash->{'reversions'} = \@reversions if @reversions;
698 $rdghash->{'is_conflict'} = @roots != 1;
699 $rdghash->{'is_reverted'} = !!@reversions;
704 sub _find_reading_parents {
705 my( $rid, $graph, $contig, @list ) = @_;
707 foreach my $wit ( @list ) {
708 # Look in the stemma graph to find this witness's extant or known-reading
709 # immediate ancestor(s), and look up the reading that each ancestor holds.
710 my @check = $graph->predecessors( $wit );
713 foreach my $wparent( @check ) {
714 my $preading = $contig->{$wparent};
715 if( $preading && $preading ne $rid ) {
716 $parenthash->{$preading} = 1;
718 push( @next, $graph->predecessors( $wparent ) );
727 sub _resolve_parent_relationships {
728 my( $c, $rid, $rdg, $rdgparents ) = @_;
729 foreach my $p ( keys %$rdgparents ) {
730 # Resolve the relationship of the parent to the reading, and
731 # save it in our hash.
732 my $pobj = $c->reading( $p );
733 my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
734 my $phash = { 'label' => $prep };
736 my $rel = $c->get_relationship( $p, $rid );
738 _add_to_hash( $rel, $phash );
740 # First check for a transposed relationship
741 if( $rdg->rank != $pobj->rank ) {
742 foreach my $ti ( $rdg->related_readings( 'transposition' ) ) {
743 next unless $ti->text eq $rdg->text;
744 $rel = $c->get_relationship( $ti, $pobj );
746 _add_to_hash( $rel, $phash, 1 );
751 foreach my $ti ( $pobj->related_readings( 'transposition' ) ) {
752 next unless $ti->text eq $pobj->text;
753 $rel = $c->get_relationship( $ti, $rdg );
755 _add_to_hash( $rel, $phash, 1 );
762 # and then check for sheer word similarity.
763 my $rtext = $rdg->text;
764 my $ptext = $pobj->text;
765 if( similar( $rtext, $ptext ) ) {
766 # say STDERR "Words $rtext and $ptext judged similar";
767 $phash->{relation} = { type => 'wordsimilar' };
771 $phash->{relation} = { type => 'deletion' };
773 # Get the attributes of the parent object while we are here
774 $phash->{'text'} = $pobj->text if $pobj;
775 $phash->{'is_nonsense'} = $pobj->is_nonsense;
776 $phash->{'is_ungrammatical'} = $pobj->grammar_invalid;
777 } elsif( $p eq '(omitted)' ) {
778 $phash->{relation} = { type => 'addition' };
781 $rdgparents->{$p} = $phash;
786 my( $rel, $phash, $is_transposed ) = @_;
787 $phash->{relation} = { type => $rel->type };
788 $phash->{relation}->{transposed} = 1 if $is_transposed;
789 $phash->{relation}->{annotation} = $rel->annotation
790 if $rel->has_annotation;
793 =head2 similar( $word1, $word2 )
795 Use Algorithm::Diff to get a sense of how close the words are to each other.
796 This will hopefully handle substitutions a bit more nicely than Levenshtein.
803 my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_;
804 my @let1 = split( '', lc( $word1 ) );
805 my @let2 = split( '', lc( $word2 ) );
806 my $diff = Algorithm::Diff->new( \@let1, \@let2 );
808 while( $diff->Next ) {
810 # Take off points for longer strings
811 my $cs = $diff->Range(1) - 2;
814 } elsif( !$diff->Items(1) ) {
815 $mag += $diff->Range(2);
816 } elsif( !$diff->Items(2) ) {
817 $mag += $diff->Range(1);
819 # Split the difference for substitutions
820 my $c1 = $diff->Range(1) || 1;
821 my $c2 = $diff->Range(2) || 1;
822 my $cd = ( $c1 + $c2 ) / 2;
826 return ( $mag <= length( $word1 ) / 2 );
830 my( $group, $graph ) = @_;
832 # Record the existence of the vertices in the group
833 map { $relevant->{$_} = 1 } @$group;
835 my $subgraph = $graph->deep_copy;
836 map { $subgraph->delete_vertex( $_ ) unless $relevant->{$_} }
838 # Now prune and return the remaining vertices.
839 _prune_subtree( $subgraph );
840 # Return the list of vertices and the list of roots.
841 my $pruned_group = [ sort $subgraph->vertices ];
842 my $pruned_roots = [ $subgraph->predecessorless_vertices ];
843 return( $pruned_group, $pruned_roots );
849 # Delete lacunose witnesses that have no successors
850 my @orphan_hypotheticals;
853 die "Infinite loop on leaves" if $ctr > 100;
854 @orphan_hypotheticals =
855 grep { $tree->get_vertex_attribute( $_, 'class' ) eq 'hypothetical' }
856 $tree->successorless_vertices;
857 $tree->delete_vertices( @orphan_hypotheticals );
859 } while( @orphan_hypotheticals );
861 # Delete lacunose roots that have a single successor
865 die "Infinite loop on roots" if $ctr > 100;
867 grep { $tree->get_vertex_attribute( $_, 'class' ) eq 'hypothetical'
868 && $tree->successors( $_ ) == 1 }
869 $tree->predecessorless_vertices;
870 $tree->delete_vertices( @redundant_root );
872 } while( @redundant_root );
875 sub _useful_variant {
876 my( $rankgroup, $rankgraph, $acstr ) = @_;
878 # Sort by group size and return
880 foreach my $rdg ( keys %$rankgroup ) {
881 my @wits = $rankgroup->{$rdg}->members;
885 $is_useful++ unless( $rankgraph->is_sink_vertex( $wits[0] )
886 || $wits[0] =~ /\Q$acstr\E$/ );
889 return $is_useful > 1;
892 =head2 wit_stringify( $groups )
894 Takes an array of witness groupings and produces a string like
895 ['A','B'] / ['C','D','E'] / ['F']
902 # If we were passed an array of witnesses instead of an array of
903 # groupings, then "group" the witnesses first.
904 unless( ref( $groups->[0] ) ) {
905 my $mkgrp = [ $groups ];
908 foreach my $g ( @$groups ) {
909 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
911 return join( ' / ', @gst );
918 This package is free software and is provided "as is" without express
919 or implied warranty. You can redistribute it and/or modify it under
920 the same terms as Perl itself.
924 Tara L Andrews E<lt>aurum@cpan.orgE<gt>