1 package Text::Tradition::Analysis;
5 use Algorithm::Diff; # for word similarity measure
7 use Digest::MD5 qw/ md5_hex /;
8 use Encode qw/ encode_utf8 /;
11 use JSON qw/ to_json /;
14 use Text::Tradition::Analysis::Result;
15 use Text::Tradition::Directory;
16 use Text::Tradition::Stemma;
19 use vars qw/ @EXPORT_OK /;
20 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
22 my $unsolved_problems = {};
26 Text::Tradition::Analysis - functions for stemma analysis of a tradition
31 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
32 my $t = Text::Tradition->new(
33 'name' => 'this is a text',
35 'file' => '/path/to/tei_parallel_seg_file.xml' );
36 $t->add_stemma( 'dotfile' => $stemmafile );
38 my $variant_data = run_analysis( $tradition );
39 # Recalculate rank $n treating all orthographic variants as equivalent
40 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
44 Text::Tradition is a library for representation and analysis of collated
45 texts, particularly medieval ones. The Collation is the central feature of
46 a Tradition, where the text, its sequence of readings, and its relationships
47 between readings are actually kept.
51 =head2 run_analysis( $tradition, %opts )
53 Runs the analysis described in analyze_variant_location on every location in the
54 collation of the given tradition, with the given options. These include:
58 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
59 is 0 (i.e. the first).
61 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
63 =item * merge_types - Specify a list of relationship types, where related readings
64 should be treated as identical for the purposes of analysis.
66 =item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
73 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
75 my $datafile = 't/data/florilegium_tei_ps.xml';
76 my $tradition = Text::Tradition->new( 'input' => 'TEI',
78 'file' => $datafile );
79 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
80 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
82 my %expected_genealogical = (
113 my $data = run_analysis( $tradition, calcdsn => 'dbi:SQLite:dbname=t/data/analysis.db' );
114 my $c = $tradition->collation;
115 foreach my $row ( @{$data->{'variants'}} ) {
116 # Account for rows that used to be "not useful"
117 unless( exists $expected_genealogical{$row->{'id'}} ) {
118 $expected_genealogical{$row->{'id'}} = 1;
120 my $gen_bool = $row->{'genealogical'} ? 1 : 0;
121 is( $gen_bool, $expected_genealogical{$row->{'id'}},
122 "Got correct genealogical flag for row " . $row->{'id'} );
123 # Check that we have the right row with the right groups
124 my $rank = $row->{'id'};
125 foreach my $rdghash ( @{$row->{'readings'}} ) {
126 # Skip 'readings' that aren't really
127 next unless $c->reading( $rdghash->{'readingid'} );
129 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
130 "Got correct reading rank" );
131 # Check the witnesses
132 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
133 my @sgrp = sort @{$rdghash->{'group'}};
134 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
137 is( $data->{'variant_count'}, 58, "Got right total variant number" );
138 # TODO Make something meaningful of conflict count, maybe test other bits
145 my( $tradition, %opts ) = @_;
146 my $c = $tradition->collation;
147 my $aclabel = $c->ac_label;
149 my $stemma_id = $opts{'stemma_id'} || 0;
150 my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
151 my $collapse = Set::Scalar->new();
152 if( $opts{'merge_types'} && ref( $opts{'merge_types'} ) eq 'ARRAY' ) {
153 $collapse->insert( @{$opts{'merge_types'}} );
154 } elsif( $opts{'merge_types'} ) {
155 $collapse->insert( $opts{'merge_types'} );
158 # Make sure we have a lookup DB for graph calculation results; this will die
159 # if calcdir or calcdsn isn't passed.
160 my $dir = $opts{'calcdir'} ? delete $opts{'calcdir'}
161 : Text::Tradition::Directory->new( dsn => $opts{'calcdsn'} );
164 my $stemma = $tradition->stemma( $stemma_id );
166 # Figure out which witnesses we are working with - that is, the ones that
167 # appear both in the stemma and in the tradition. All others are 'lacunose'
169 my $lacunose = Set::Scalar->new( $stemma->hypotheticals );
170 my $stemma_wits = Set::Scalar->new( $stemma->witnesses );
171 my $tradition_wits = Set::Scalar->new( map { $_->sigil } $tradition->witnesses );
172 $lacunose->insert( $stemma_wits->symmetric_difference( $tradition_wits )->members );
174 # Find and mark 'common' ranks for exclusion, unless they were
175 # explicitly specified.
178 foreach my $rdg ( $c->common_readings ) {
179 $common_rank{$rdg->rank} = 1;
181 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
184 # Group the variants to send to the solver
189 foreach my $rank ( @ranks ) {
190 my $missing = $lacunose->clone();
191 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, $collapse );
192 # Filter out any empty rankgroups
193 # (e.g. from the later rank for a transposition)
194 next unless keys %$rankgroup;
195 # Get the graph for this rankgroup
196 my $rankgraph = _graph_for_grouping( $stemma, $rankgroup, $missing, $aclabel );
197 if( $opts{'exclude_type1'} ) {
198 # Check to see whether this is a "useful" group.
199 next unless _useful_variant( $rankgroup, $rankgraph, $aclabel );
201 push( @use_ranks, $rank );
202 push( @groups, { grouping => $rankgroup, graph => $rankgraph } );
203 $lacunae{$rank} = $missing;
206 my $answer = solve_variants( $dir, @groups );
208 # Do further analysis on the answer
209 my $conflict_count = 0;
210 my $reversion_count = 0;
211 foreach my $idx ( 0 .. $#use_ranks ) {
212 my $location = $answer->{'variants'}->[$idx];
213 # Add the rank back in
214 my $rank = $use_ranks[$idx];
215 $location->{'id'} = $rank;
216 # Note what our lacunae are
218 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
219 $location->{'missing'} = [ keys %lmiss ];
221 # Run the extra analysis we need.
222 ## TODO We run through all the variants in this call, so
223 ## why not add the reading data there instead of here below?
224 my $graph = $groups[$idx]->{graph};
225 analyze_location( $tradition, $graph, $location, \%lmiss );
228 # Do the final post-analysis tidying up of the data.
229 foreach my $rdghash ( @{$location->{'readings'}} ) {
230 $conflict_count++ if $rdghash->{'is_conflict'};
231 $reversion_count++ if $rdghash->{'is_reverted'};
232 # Add the reading text back in, setting display value as needed
233 my $rdg = $c->reading( $rdghash->{'readingid'} );
235 $rdghash->{'text'} = $rdg->text .
236 ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
237 $rdghash->{'is_ungrammatical'} = $rdg->grammar_invalid;
238 $rdghash->{'is_nonsense'} = $rdg->is_nonsense;
240 # Remove lacunose witnesses from this reading's list now that the
243 map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
244 $rdghash->{'group'} = \@realgroup;
245 # Note any layered witnesses that appear in this group
246 foreach( @realgroup ) {
247 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
248 push( @layerwits, $1 );
252 $location->{'layerwits'} = \@layerwits if @layerwits;
254 $answer->{'conflict_count'} = $conflict_count;
255 $answer->{'reversion_count'} = $reversion_count;
260 =head2 group_variants( $tradition, $rank, $lacunose, $transposed, $merge_relationship_types )
262 Groups the variants at the given $rank of the collation, treating any
263 relationships in the set $merge_relationship_types as equivalent.
264 $lacunose should be a reference to an array, to which the sigla of lacunose
265 witnesses at this rank will be appended; $transposed should be a reference
266 to a hash, wherein the identities of transposed readings and their
267 relatives will be stored.
269 Returns a hash $group_readings where $rdg is attested by the witnesses listed
270 in $group_readings->{$rdg}.
274 # Return group_readings, groups, lacunose
276 my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
277 my $c = $tradition->collation;
278 my $aclabel = $c->ac_label;
279 my $table = $c->alignment_table;
280 # Get the alignment table readings
281 my %readings_at_rank;
282 my $check_for_gaps = Set::Scalar->new();
284 my $has_transposition;
285 foreach my $tablewit ( @{$table->{'alignment'}} ) {
286 my $rdg = $tablewit->{'tokens'}->[$rank-1];
287 my $wit = $tablewit->{'witness'};
288 # Exclude the witness if it is "lacunose" which if we got here
289 # means "not in the stemma".
290 next if _is_lacunose( $wit, $lacunose, $aclabel );
291 # Note if the witness is actually in a lacuna
292 if( $rdg && $rdg->{'t'}->is_lacuna ) {
293 _add_to_witlist( $wit, $lacunose, $aclabel );
294 # Otherwise the witness either has a positive reading...
296 # If the reading has been counted elsewhere as a transposition, ignore it.
297 if( $transposed->{$rdg->{'t'}->id} ) {
298 # TODO Does this cope with three-way transpositions?
299 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
302 # Otherwise, record it...
303 $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
304 # ...and grab any transpositions, and their relations.
305 my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings();
306 foreach my $trdg ( @transp ) {
307 next if exists $readings_at_rank{$trdg->id};
308 $has_transposition = 1;
309 my @affected_wits = _table_witnesses(
310 $table, $trdg, $lacunose, $aclabel );
311 next unless @affected_wits;
312 map { $moved_wits{$_} = 1 } @affected_wits;
313 $transposed->{$trdg->id} =
314 [ _table_witnesses( $table, $rdg->{'t'}, $lacunose, $aclabel ) ];
315 $readings_at_rank{$trdg->id} = $trdg;
317 # ...or it is empty, ergo a gap.
319 _add_to_witlist( $wit, $check_for_gaps, $aclabel );
322 my $gap_wits = Set::Scalar->new();
323 map { _add_to_witlist( $_, $gap_wits, $aclabel )
324 unless $moved_wits{$_} } $check_for_gaps->members;
326 # Group the readings, collapsing groups by relationship if needed.
327 my $grouped_readings = {};
328 foreach my $rdg ( values %readings_at_rank ) {
329 # Skip readings that have been collapsed into others.
330 next if exists $grouped_readings->{$rdg->id}
331 && $grouped_readings->{$rdg->id} eq 'COLLAPSE';
332 # Get the witness list, including from readings collapsed into this one.
333 my @wits = _table_witnesses( $table, $rdg, $lacunose, $aclabel );
334 if( $collapse && $collapse->size ) {
335 my $filter = sub { $collapse->has( $_[0]->type ) };
336 foreach my $other ( $rdg->related_readings( $filter ) ) {
337 my @otherwits = _table_witnesses( $table, $other, $lacunose, $aclabel );
338 push( @wits, @otherwits );
339 $grouped_readings->{$other->id} = 'COLLAPSE';
342 $grouped_readings->{$rdg->id} = Set::Scalar->new( @wits );
344 if( $gap_wits->members ) {
345 $grouped_readings->{'(omitted)'} = $gap_wits;
348 # Get rid of our collapsed readings
349 map { delete $grouped_readings->{$_} if(
350 $grouped_readings->{$_} eq 'COLLAPSE'
351 || $grouped_readings->{$_}->is_empty ) }
352 keys %$grouped_readings;
354 # If something was transposed, check the groups for doubled-up readings
355 if( $has_transposition ) {
356 # print STDERR "Group for rank $rank:\n";
357 # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" }
358 # keys %$grouped_readings;
359 _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings );
363 return $grouped_readings;
366 # Helper function to query the alignment table for all witnesses (a.c. included)
367 # that have a given reading at its rank.
368 sub _table_witnesses {
369 my( $table, $trdg, $lacunose, $aclabel ) = @_;
370 my $tableidx = $trdg->rank - 1;
371 my $has_reading = Set::Scalar->new();
372 foreach my $row ( @{$table->{'alignment'}} ) {
373 my $wit = $row->{'witness'};
374 next if _is_lacunose( $wit, $lacunose, $aclabel );
375 my $rdg = $row->{'tokens'}->[$tableidx];
376 next unless exists $rdg->{'t'} && defined $rdg->{'t'};
377 _add_to_witlist( $wit, $has_reading, $aclabel )
378 if $rdg->{'t'}->id eq $trdg->id;
380 return $has_reading->members;
383 # Helper function to see if a witness is lacunose even if we are asking about
386 my ( $wit, $lac, $acstr ) = @_;
387 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
390 return $lac->has( $wit );
393 # Helper function to ensure that X and X a.c. never appear in the same list.
394 sub _add_to_witlist {
395 my( $wit, $list, $acstr ) = @_;
396 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
397 # Don't add X a.c. if we already have X
398 return if $list->has( $1 );
400 # Delete X a.c. if we are about to add X
401 $list->delete( $wit.$acstr );
403 $list->insert( $wit );
406 sub _check_transposed_consistency {
407 my( $c, $rank, $transposed, $groupings ) = @_;
410 # Note which readings are actually at this rank, and which witnesses
411 # belong to which reading.
412 foreach my $rdg ( keys %$groupings ) {
413 my $rdgobj = $c->reading( $rdg );
414 # Count '(omitted)' as a reading at this rank
415 $thisrank{$rdg} = 1 if !$rdgobj || $rdgobj->rank == $rank;
416 map { push( @{$seen_wits{$_}}, $rdg ) } @{$groupings->{$rdg}};
418 # Our work is done if we have no witness belonging to more than one
420 my @doubled = grep { scalar @{$seen_wits{$_}} > 1 } keys %seen_wits;
421 return unless @doubled;
422 # If we have a symmetric related transposition, drop the non-rank readings.
423 if( @doubled == scalar keys %seen_wits ) {
424 foreach my $rdg ( keys %$groupings ) {
425 if( !$thisrank{$rdg} ) {
426 my $groupstr = wit_stringify( $groupings->{$rdg} );
427 my ( $matched ) = grep { $groupstr eq wit_stringify( $groupings->{$_} ) }
429 delete $groupings->{$rdg};
430 # If we found a group match, assume there is a symmetry happening.
431 # TODO think more about this
432 # print STDERR "*** Deleting symmetric reading $rdg\n";
434 delete $transposed->{$rdg};
435 warn "Found problem in evident symmetry with reading $rdg";
439 # Otherwise 'unhook' the transposed reading(s) that have duplicates.
441 foreach my $dup ( @doubled ) {
442 foreach my $rdg ( @{$seen_wits{$dup}} ) {
443 next if $thisrank{$rdg};
444 next unless exists $groupings->{$rdg};
445 # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
446 delete $groupings->{$rdg};
447 delete $transposed->{$rdg};
450 # and put any now-orphaned readings into an 'omitted' reading.
451 foreach my $wit ( keys %seen_wits ) {
452 unless( grep { exists $groupings->{$_} } @{$seen_wits{$wit}} ) {
453 $groupings->{'(omitted)'} = Set::Scalar->new()
454 unless exists $groupings->{'(omitted)'};
455 _add_to_witlist( $wit, $groupings->{'(omitted)'}, $c->ac_label );
461 # For the given grouping, return its situation graph based on the stemma.
462 sub _graph_for_grouping {
463 my( $stemma, $grouping, $lacunose, $aclabel ) = @_;
466 foreach my $gs ( values %$grouping ) {
468 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
469 push( @$acwits, $1 ) unless $lacunose->has( $1 );
471 $extant->{$_} = 1 unless $lacunose->has( $_ );
477 # contig contains all extant wits and all hypothetical wits
478 # needed to make up the groups.
479 $graph = $stemma->situation_graph( $extant, $acwits );
480 } catch ( Text::Tradition::Error $e ) {
481 die "Could not extend graph with given extant and a.c. witnesses: "
484 die "Could not extend graph with a.c. witnesses @$acwits";
489 =head2 solve_variants( $calcdir, @groups )
491 Looks up the set of groups in the answers provided by the external graph solver
492 service and returns a cleaned-up answer, adding the rank IDs back where they belong.
494 The JSON has the form
495 { "graph": [ stemmagraph DOT string without newlines ],
496 "groupings": [ array of arrays of groups, one per rank ] }
498 The answer has the form
499 { "variants" => [ array of variant location structures ],
500 "variant_count" => total,
501 "conflict_count" => number of conflicts detected,
502 "genealogical_count" => number of solutions found }
507 my( $dir, @groups ) = @_;
509 ## For each graph/group combo, look it up in the DB.
510 ## Witness map is a HACK to get around limitations in node names from IDP
511 my $witness_map = {};
512 ## Variables to store answers as they come back
514 my $genealogical = 0; # counter
515 foreach my $graphproblem ( @groups ) {
516 # Initialize the result structure for this graph problem
517 my $vstruct = { readings => [] };
518 push( @$variants, $vstruct );
520 # Construct the calc result key and look up its answer
521 my $reqkey = _get_calc_key( $graphproblem );
522 my $scope = $dir->new_scope;
523 my $answer = $dir->lookup( $reqkey );
525 #warn "No answer found for graph problem $reqkey, moving on";
526 # Record the unsolved problem so that we can go get a solution
527 _save_problem( $graphproblem );
528 # Put just the request, with no real result, into vstruct
529 foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
530 push( @{$vstruct->{readings}}, { readingid => $rid,
531 group => [ $graphproblem->{grouping}->{$rid}->members ] } );
536 ## The answer is a Text::Tradition::Analysis::Result containing a bunch
537 ## of information about this variant set. Record the information therein.
539 # 1. Did the group evaluate as genealogical?
540 $vstruct->{genealogical} = $answer->is_genealogical;
541 $genealogical++ if $answer->is_genealogical;
543 # 2. What are the calculated minimum groupings for each variant loc?
544 foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
545 my $inputset = $graphproblem->{grouping}->{$rid};
546 my $minset = $answer->minimum_grouping_for( $inputset );
547 push( @{$vstruct->{readings}}, { readingid => $rid, group => $minset } );
550 # 3. What are the sources and classes calculated for each witness?
551 $vstruct->{witcopy_types} = { $answer->classes };
552 $vstruct->{reading_roots} = {};
553 map { $vstruct->{reading_roots}->{$_} = 1 } $answer->sources;
557 # Spit out any unsolved problems we encountered
560 return { 'variants' => $variants,
561 'variant_count' => scalar @$variants,
562 'genealogical_count' => $genealogical };
566 my( $graphproblem ) = @_;
567 my $graph = $graphproblem->{graph};
568 my $grouping = [ values %{$graphproblem->{grouping}} ];
569 my $key = Text::Tradition::Analysis::Result::string_from_graph_problem(
571 return md5_hex( encode_utf8( $key ) );
575 my( $graphproblem ) = @_;
576 my $problem = Text::Tradition::Analysis::Result->new(
577 graph => $graphproblem->{graph},
578 setlist => [ values %{$graphproblem->{grouping}} ]
580 my $key = _get_calc_key( $graphproblem );
581 my( $str ) = $problem->problem_json;
582 say STDERR "Stashing unsolved problem $str at key $key";
583 $unsolved_problems->{$key} = $problem;
587 #say STDERR "Problems needing a solution:";
588 my @problems = values %$unsolved_problems;
589 return unless @problems;
590 my $first = shift @problems;
591 map { say STDERR $_ } $first->problem_json( @problems );
594 =head2 analyze_location ( $tradition, $graph, $location_hash )
596 Given the tradition, its stemma graph, and the solution from the graph solver,
597 work out the rest of the information we want. For each reading we need missing,
598 conflict, reading_parents, independent_occurrence, followed, not_followed,
599 and follow_unknown. Alters the location_hash in place.
603 sub analyze_location {
604 my ( $tradition, $graph, $variant_row, $lacunose ) = @_;
605 my $c = $tradition->collation;
607 # Make a hash of all known node memberships, and make the subgraphs.
609 my $reading_roots = {};
611 my $acstr = $c->ac_label;
615 if( exists $variant_row->{'reading_roots'} ) {
616 $reading_roots = delete $variant_row->{'reading_roots'};
618 warn "No reading source information from IDP - proceed at your own risk";
621 my $classinfo = delete $variant_row->{'witcopy_types'};
623 # Note which witnesses positively belong to which group. This information
624 # comes ultimately from the IDP solver.
625 # Also make a note of the reading's roots.
626 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
627 my $rid = $rdghash->{'readingid'};
629 foreach my $wit ( @{$rdghash->{'group'}} ) {
630 $contig->{$wit} = $rid;
631 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
634 if( exists $reading_roots->{$wit} && $reading_roots->{$wit} ) {
635 push( @roots, $wit );
638 $rdghash->{'independent_occurrence'} = \@roots;
641 # Now that we have all the node group memberships, calculate followed/
642 # non-followed/unknown values for each reading. Also figure out the
643 # reading's evident parent(s).
644 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
645 my $rid = $rdghash->{'readingid'};
646 my $rdg = $c->reading( $rid );
647 my @roots = @{$rdghash->{'independent_occurrence'}};
650 @reversions = grep { $classinfo->{$_} eq 'revert' }
651 $rdghash->{'group'}->members;
653 my @group = @{$rdghash->{'group'}};
655 # Start figuring things out.
656 $rdghash->{'followed'} = scalar( @group )
657 - ( scalar( @roots ) + scalar( @reversions ) );
658 # Find the parent readings, if any, of this reading.
659 my $sourceparents = _find_reading_parents( $rid, $graph, $contig, @roots );
660 # Work out relationships between readings and their non-followed parent.
661 _resolve_parent_relationships( $c, $rid, $rdg, $sourceparents );
662 $rdghash->{'reading_parents'} = $sourceparents;
665 my $revparents = _find_reading_parents( $rid, $graph, $contig, @reversions );
666 _resolve_parent_relationships( $c, $rid, $rdg, $revparents );
667 $rdghash->{'reversion_parents'} = $revparents;
670 # Find the number of times this reading was altered, and the number of
671 # times we're not sure.
672 my( %nofollow, %unknownfollow );
673 foreach my $wit ( @{$rdghash->{'group'}} ) {
674 foreach my $wchild ( $graph->successors( $wit ) ) {
675 if( $reading_roots->{$wchild} && $contig->{$wchild}
676 && $contig->{$wchild} ne $rid ) {
677 # It definitely changed here.
678 $nofollow{$wchild} = 1;
679 } elsif( !($contig->{$wchild}) ) {
680 # The child is a hypothetical node not definitely in
681 # any group. Answer is unknown.
682 $unknownfollow{$wchild} = 1;
683 } # else it is either in our group, or it is a non-root node in a
684 # known group and therefore is presumed to have its reading from
685 # its group, not this link.
688 $rdghash->{'not_followed'} = keys %nofollow;
689 $rdghash->{'follow_unknown'} = keys %unknownfollow;
691 # Now say whether this reading represents a conflict.
692 unless( $variant_row->{'genealogical'} ) {
693 $rdghash->{'is_conflict'} = @roots != 1;
694 $rdghash->{'is_reverted'} = scalar @reversions;
699 sub _find_reading_parents {
700 my( $rid, $graph, $contig, @list ) = @_;
702 foreach my $wit ( @list ) {
703 # Look in the stemma graph to find this witness's extant or known-reading
704 # immediate ancestor(s), and look up the reading that each ancestor holds.
705 my @check = $graph->predecessors( $wit );
708 foreach my $wparent( @check ) {
709 my $preading = $contig->{$wparent};
710 if( $preading && $preading ne $rid ) {
711 $parenthash->{$preading} = 1;
713 push( @next, $graph->predecessors( $wparent ) );
722 sub _resolve_parent_relationships {
723 my( $c, $rid, $rdg, $rdgparents ) = @_;
724 foreach my $p ( keys %$rdgparents ) {
725 # Resolve the relationship of the parent to the reading, and
726 # save it in our hash.
727 my $pobj = $c->reading( $p );
728 my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
729 my $phash = { 'label' => $prep };
731 my $rel = $c->get_relationship( $p, $rid );
733 _add_to_hash( $rel, $phash );
735 # First check for a transposed relationship
736 if( $rdg->rank != $pobj->rank ) {
737 foreach my $ti ( $rdg->related_readings( 'transposition' ) ) {
738 next unless $ti->text eq $rdg->text;
739 $rel = $c->get_relationship( $ti, $pobj );
741 _add_to_hash( $rel, $phash, 1 );
746 foreach my $ti ( $pobj->related_readings( 'transposition' ) ) {
747 next unless $ti->text eq $pobj->text;
748 $rel = $c->get_relationship( $ti, $rdg );
750 _add_to_hash( $rel, $phash, 1 );
757 # and then check for sheer word similarity.
758 my $rtext = $rdg->text;
759 my $ptext = $pobj->text;
760 if( similar( $rtext, $ptext ) ) {
761 # say STDERR "Words $rtext and $ptext judged similar";
762 $phash->{relation} = { type => 'wordsimilar' };
766 $phash->{relation} = { type => 'deletion' };
768 # Get the attributes of the parent object while we are here
769 $phash->{'text'} = $pobj->text if $pobj;
770 $phash->{'is_nonsense'} = $pobj->is_nonsense;
771 $phash->{'is_ungrammatical'} = $pobj->grammar_invalid;
772 } elsif( $p eq '(omitted)' ) {
773 $phash->{relation} = { type => 'addition' };
776 $rdgparents->{$p} = $phash;
781 my( $rel, $phash, $is_transposed ) = @_;
782 $phash->{relation} = { type => $rel->type };
783 $phash->{relation}->{transposed} = 1 if $is_transposed;
784 $phash->{relation}->{annotation} = $rel->annotation
785 if $rel->has_annotation;
788 =head2 similar( $word1, $word2 )
790 Use Algorithm::Diff to get a sense of how close the words are to each other.
791 This will hopefully handle substitutions a bit more nicely than Levenshtein.
798 my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_;
799 my @let1 = split( '', lc( $word1 ) );
800 my @let2 = split( '', lc( $word2 ) );
801 my $diff = Algorithm::Diff->new( \@let1, \@let2 );
803 while( $diff->Next ) {
805 # Take off points for longer strings
806 my $cs = $diff->Range(1) - 2;
809 } elsif( !$diff->Items(1) ) {
810 $mag += $diff->Range(2);
811 } elsif( !$diff->Items(2) ) {
812 $mag += $diff->Range(1);
814 # Split the difference for substitutions
815 my $c1 = $diff->Range(1) || 1;
816 my $c2 = $diff->Range(2) || 1;
817 my $cd = ( $c1 + $c2 ) / 2;
821 return ( $mag <= length( $word1 ) / 2 );
825 my( $group, $graph ) = @_;
827 # Record the existence of the vertices in the group
828 map { $relevant->{$_} = 1 } @$group;
830 my $subgraph = $graph->deep_copy;
831 map { $subgraph->delete_vertex( $_ ) unless $relevant->{$_} }
833 # Now prune and return the remaining vertices.
834 _prune_subtree( $subgraph );
835 # Return the list of vertices and the list of roots.
836 my $pruned_group = [ sort $subgraph->vertices ];
837 my $pruned_roots = [ $subgraph->predecessorless_vertices ];
838 return( $pruned_group, $pruned_roots );
844 # Delete lacunose witnesses that have no successors
845 my @orphan_hypotheticals;
848 die "Infinite loop on leaves" if $ctr > 100;
849 @orphan_hypotheticals =
850 grep { $tree->get_vertex_attribute( $_, 'class' ) eq 'hypothetical' }
851 $tree->successorless_vertices;
852 $tree->delete_vertices( @orphan_hypotheticals );
854 } while( @orphan_hypotheticals );
856 # Delete lacunose roots that have a single successor
860 die "Infinite loop on roots" if $ctr > 100;
862 grep { $tree->get_vertex_attribute( $_, 'class' ) eq 'hypothetical'
863 && $tree->successors( $_ ) == 1 }
864 $tree->predecessorless_vertices;
865 $tree->delete_vertices( @redundant_root );
867 } while( @redundant_root );
870 sub _useful_variant {
871 my( $rankgroup, $rankgraph, $acstr ) = @_;
873 # Sort by group size and return
875 foreach my $rdg ( keys %$rankgroup ) {
876 my @wits = $rankgroup->{$rdg}->members;
880 $is_useful++ unless( $rankgraph->is_sink_vertex( $wits[0] )
881 || $wits[0] =~ /\Q$acstr\E$/ );
884 return $is_useful > 1;
887 =head2 wit_stringify( $groups )
889 Takes an array of witness groupings and produces a string like
890 ['A','B'] / ['C','D','E'] / ['F']
897 # If we were passed an array of witnesses instead of an array of
898 # groupings, then "group" the witnesses first.
899 unless( ref( $groups->[0] ) ) {
900 my $mkgrp = [ $groups ];
903 foreach my $g ( @$groups ) {
904 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
906 return join( ' / ', @gst );
913 This package is free software and is provided "as is" without express
914 or implied warranty. You can redistribute it and/or modify it under
915 the same terms as Perl itself.
919 Tara L Andrews E<lt>aurum@cpan.orgE<gt>