1 package Text::Tradition::Analysis;
5 use Algorithm::Diff; # for word similarity measure
6 use Encode qw/ decode_utf8 encode_utf8 /;
9 use JSON qw/ to_json decode_json /;
12 use Text::Tradition::Analysis::Result;
13 use Text::Tradition::Directory;
14 use Text::Tradition::Stemma;
17 use vars qw/ @EXPORT_OK /;
18 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
20 my $SOLVER_URL = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
21 my $unsolved_problems = {};
25 Text::Tradition::Analysis - functions for stemma analysis of a tradition
30 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
31 my $t = Text::Tradition->new(
32 'name' => 'this is a text',
34 'file' => '/path/to/tei_parallel_seg_file.xml' );
35 $t->add_stemma( 'dotfile' => $stemmafile );
37 my $variant_data = run_analysis( $tradition );
38 # Recalculate rank $n treating all orthographic variants as equivalent
39 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
43 Text::Tradition is a library for representation and analysis of collated
44 texts, particularly medieval ones. The Collation is the central feature of
45 a Tradition, where the text, its sequence of readings, and its relationships
46 between readings are actually kept.
50 =head2 run_analysis( $tradition, %opts )
52 Runs the analysis described in analyze_variant_location on every location in the
53 collation of the given tradition, with the given options. These include:
57 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
58 is 0 (i.e. the first).
60 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
62 =item * merge_types - Specify a list of relationship types, where related readings
63 should be treated as identical for the purposes of analysis.
65 =item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
72 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
74 my $datafile = 't/data/florilegium_tei_ps.xml';
75 my $tradition = Text::Tradition->new( 'input' => 'TEI',
77 'file' => $datafile );
78 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
79 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
81 my %expected_genealogical = (
112 my $data = run_analysis( $tradition, calcdsn => 'dbi:SQLite:dbname=t/data/analysis.db' );
113 my $c = $tradition->collation;
114 foreach my $row ( @{$data->{'variants'}} ) {
115 # Account for rows that used to be "not useful"
116 unless( exists $expected_genealogical{$row->{'id'}} ) {
117 $expected_genealogical{$row->{'id'}} = 1;
119 my $gen_bool = $row->{'genealogical'} ? 1 : 0;
120 is( $gen_bool, $expected_genealogical{$row->{'id'}},
121 "Got correct genealogical flag for row " . $row->{'id'} );
122 # Check that we have the right row with the right groups
123 my $rank = $row->{'id'};
124 foreach my $rdghash ( @{$row->{'readings'}} ) {
125 # Skip 'readings' that aren't really
126 next unless $c->reading( $rdghash->{'readingid'} );
128 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
129 "Got correct reading rank" );
130 # Check the witnesses
131 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
132 my @sgrp = sort @{$rdghash->{'group'}};
133 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
136 is( $data->{'variant_count'}, 58, "Got right total variant number" );
137 # TODO Make something meaningful of conflict count, maybe test other bits
144 my( $tradition, %opts ) = @_;
145 my $c = $tradition->collation;
146 my $aclabel = $c->ac_label;
148 my $stemma_id = $opts{'stemma_id'} || 0;
149 my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
150 my $collapse = Set::Scalar->new();
151 if( $opts{'merge_types'} && ref( $opts{'merge_types'} ) eq 'ARRAY' ) {
152 $collapse->insert( @{$opts{'merge_types'}} );
153 } elsif( $opts{'merge_types'} ) {
154 $collapse->insert( $opts{'merge_types'} );
157 # Make sure we have a lookup DB for graph calculation results; this will die
158 # if calcdir or calcdsn isn't passed.
160 if( exists $opts{'calcdir'} ) {
161 $dir = delete $opts{'calcdir'}
162 } elsif ( exists $opts{'calcdsn'} ) {
163 $dir = Text::Tradition::Directory->new( dsn => $opts{'calcdsn'} );
167 my $stemma = $tradition->stemma( $stemma_id );
169 # Figure out which witnesses we are working with - that is, the ones that
170 # appear both in the stemma and in the tradition. All others are 'lacunose'
172 my $lacunose = Set::Scalar->new( $stemma->hypotheticals );
173 my $stemma_wits = Set::Scalar->new( $stemma->witnesses );
174 my $tradition_wits = Set::Scalar->new( map { $_->sigil } $tradition->witnesses );
175 $lacunose->insert( $stemma_wits->symmetric_difference( $tradition_wits )->members );
177 # Find and mark 'common' ranks for exclusion, unless they were
178 # explicitly specified.
181 foreach my $rdg ( $c->common_readings ) {
182 $common_rank{$rdg->rank} = 1;
184 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
187 # Group the variants to send to the solver
192 foreach my $rank ( @ranks ) {
193 my $missing = $lacunose->clone();
194 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, $collapse );
195 # Filter out any empty rankgroups
196 # (e.g. from the later rank for a transposition)
197 next unless keys %$rankgroup;
198 # Get the graph for this rankgroup
199 my $rankgraph = _graph_for_grouping( $stemma, $rankgroup, $missing, $aclabel );
200 if( $opts{'exclude_type1'} ) {
201 # Check to see whether this is a "useful" group.
202 next unless _useful_variant( $rankgroup, $rankgraph, $aclabel );
204 push( @use_ranks, $rank );
205 push( @groups, { grouping => $rankgroup, graph => $rankgraph } );
206 $lacunae{$rank} = $missing;
211 $answer = solve_variants( $dir, @groups );
212 } catch ( Text::Tradition::Error $e ) {
213 if( $e->message =~ /IDP/ ) {
214 # Something is wrong with the solver; make the variants table anyway
215 $answer->{'variants'} = [];
216 map { push( @{$answer->{'variants'}}, _init_unsolved( $_, 'IDP error' ) ) }
219 # Something else is wrong; error out.
224 # Do further analysis on the answer
225 my $conflict_count = 0;
226 my $reversion_count = 0;
227 foreach my $idx ( 0 .. $#use_ranks ) {
228 my $location = $answer->{'variants'}->[$idx];
229 # Add the rank back in
230 my $rank = $use_ranks[$idx];
231 $location->{'id'} = $rank;
232 # Note what our lacunae are
234 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
235 $location->{'missing'} = [ keys %lmiss ];
237 # Run the extra analysis we need.
238 ## TODO We run through all the variants in this call, so
239 ## why not add the reading data there instead of here below?
240 my $graph = $groups[$idx]->{graph};
241 analyze_location( $tradition, $graph, $location, \%lmiss );
244 # Do the final post-analysis tidying up of the data.
245 foreach my $rdghash ( @{$location->{'readings'}} ) {
246 $conflict_count++ if $rdghash->{'is_conflict'};
247 $reversion_count++ if $rdghash->{'is_reverted'};
248 # Add the reading text back in, setting display value as needed
249 my $rdg = $c->reading( $rdghash->{'readingid'} );
251 $rdghash->{'text'} = $rdg->text .
252 ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
253 $rdghash->{'is_ungrammatical'} = $rdg->grammar_invalid;
254 $rdghash->{'is_nonsense'} = $rdg->is_nonsense;
256 # Remove lacunose witnesses from this reading's list now that the
259 map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
260 $rdghash->{'group'} = \@realgroup;
261 # Note any layered witnesses that appear in this group
262 foreach( @realgroup ) {
263 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
264 push( @layerwits, $1 );
268 $location->{'layerwits'} = \@layerwits if @layerwits;
270 $answer->{'conflict_count'} = $conflict_count;
271 $answer->{'reversion_count'} = $reversion_count;
276 =head2 group_variants( $tradition, $rank, $lacunose, $transposed, $merge_relationship_types )
278 Groups the variants at the given $rank of the collation, treating any
279 relationships in the set $merge_relationship_types as equivalent.
280 $lacunose should be a reference to an array, to which the sigla of lacunose
281 witnesses at this rank will be appended; $transposed should be a reference
282 to a hash, wherein the identities of transposed readings and their
283 relatives will be stored.
285 Returns a hash $group_readings where $rdg is attested by the witnesses listed
286 in $group_readings->{$rdg}.
290 # Return group_readings, groups, lacunose
292 my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
293 my $c = $tradition->collation;
294 my $aclabel = $c->ac_label;
295 my $table = $c->alignment_table;
296 # Get the alignment table readings
297 my %readings_at_rank;
298 my $check_for_gaps = Set::Scalar->new();
300 my $has_transposition;
301 foreach my $tablewit ( @{$table->{'alignment'}} ) {
302 my $rdg = $tablewit->{'tokens'}->[$rank-1];
303 my $wit = $tablewit->{'witness'};
304 # Exclude the witness if it is "lacunose" which if we got here
305 # means "not in the stemma".
306 next if _is_lacunose( $wit, $lacunose, $aclabel );
307 # Note if the witness is actually in a lacuna
308 if( $rdg && $rdg->{'t'}->is_lacuna ) {
309 _add_to_witlist( $wit, $lacunose, $aclabel );
310 # Otherwise the witness either has a positive reading...
312 # If the reading has been counted elsewhere as a transposition, ignore it.
313 if( $transposed->{$rdg->{'t'}->id} ) {
314 # TODO Does this cope with three-way transpositions?
315 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
318 # Otherwise, record it...
319 $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
320 # ...and grab any transpositions, and their relations.
321 my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings();
322 foreach my $trdg ( @transp ) {
323 next if exists $readings_at_rank{$trdg->id};
324 $has_transposition = 1;
325 my @affected_wits = _table_witnesses(
326 $table, $trdg, $lacunose, $aclabel );
327 next unless @affected_wits;
328 map { $moved_wits{$_} = 1 } @affected_wits;
329 $transposed->{$trdg->id} =
330 [ _table_witnesses( $table, $rdg->{'t'}, $lacunose, $aclabel ) ];
331 $readings_at_rank{$trdg->id} = $trdg;
333 # ...or it is empty, ergo a gap.
335 _add_to_witlist( $wit, $check_for_gaps, $aclabel );
338 my $gap_wits = Set::Scalar->new();
339 map { _add_to_witlist( $_, $gap_wits, $aclabel )
340 unless $moved_wits{$_} } $check_for_gaps->members;
342 # Group the readings, collapsing groups by relationship if needed.
343 my $grouped_readings = {};
344 foreach my $rdg ( values %readings_at_rank ) {
345 # Skip readings that have been collapsed into others.
346 next if exists $grouped_readings->{$rdg->id}
347 && $grouped_readings->{$rdg->id} eq 'COLLAPSE';
348 # Get the witness list, including from readings collapsed into this one.
349 my @wits = _table_witnesses( $table, $rdg, $lacunose, $aclabel );
350 if( $collapse && $collapse->size ) {
351 my $filter = sub { $collapse->has( $_[0]->type ) };
352 foreach my $other ( $rdg->related_readings( $filter ) ) {
353 my @otherwits = _table_witnesses( $table, $other, $lacunose, $aclabel );
354 push( @wits, @otherwits );
355 $grouped_readings->{$other->id} = 'COLLAPSE';
358 $grouped_readings->{$rdg->id} = Set::Scalar->new( @wits );
360 if( $gap_wits->members ) {
361 $grouped_readings->{'(omitted)'} = $gap_wits;
364 # Get rid of our collapsed readings
365 map { delete $grouped_readings->{$_} if(
366 $grouped_readings->{$_} eq 'COLLAPSE'
367 || $grouped_readings->{$_}->is_empty ) }
368 keys %$grouped_readings;
370 # If something was transposed, check the groups for doubled-up readings
371 if( $has_transposition ) {
372 # print STDERR "Group for rank $rank:\n";
373 # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" }
374 # keys %$grouped_readings;
375 _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings );
379 return $grouped_readings;
382 # Helper function to query the alignment table for all witnesses (a.c. included)
383 # that have a given reading at its rank.
384 sub _table_witnesses {
385 my( $table, $trdg, $lacunose, $aclabel ) = @_;
386 my $tableidx = $trdg->rank - 1;
387 my $has_reading = Set::Scalar->new();
388 foreach my $row ( @{$table->{'alignment'}} ) {
389 my $wit = $row->{'witness'};
390 next if _is_lacunose( $wit, $lacunose, $aclabel );
391 my $rdg = $row->{'tokens'}->[$tableidx];
392 next unless exists $rdg->{'t'} && defined $rdg->{'t'};
393 _add_to_witlist( $wit, $has_reading, $aclabel )
394 if $rdg->{'t'}->id eq $trdg->id;
396 return $has_reading->members;
399 # Helper function to see if a witness is lacunose even if we are asking about
402 my ( $wit, $lac, $acstr ) = @_;
403 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
406 return $lac->has( $wit );
409 # Helper function to ensure that X and X a.c. never appear in the same list.
410 sub _add_to_witlist {
411 my( $wit, $list, $acstr ) = @_;
412 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
413 # Don't add X a.c. if we already have X
414 return if $list->has( $1 );
416 # Delete X a.c. if we are about to add X
417 $list->delete( $wit.$acstr );
419 $list->insert( $wit );
422 sub _check_transposed_consistency {
423 my( $c, $rank, $transposed, $groupings ) = @_;
426 # Note which readings are actually at this rank, and which witnesses
427 # belong to which reading.
428 foreach my $rdg ( keys %$groupings ) {
429 my $rdgobj = $c->reading( $rdg );
430 # Count '(omitted)' as a reading at this rank
431 $thisrank{$rdg} = 1 if !$rdgobj || $rdgobj->rank == $rank;
432 map { push( @{$seen_wits{$_}}, $rdg ) } @{$groupings->{$rdg}};
434 # Our work is done if we have no witness belonging to more than one
436 my @doubled = grep { scalar @{$seen_wits{$_}} > 1 } keys %seen_wits;
437 return unless @doubled;
438 # If we have a symmetric related transposition, drop the non-rank readings.
439 if( @doubled == scalar keys %seen_wits ) {
440 foreach my $rdg ( keys %$groupings ) {
441 if( !$thisrank{$rdg} ) {
442 my $groupstr = wit_stringify( $groupings->{$rdg} );
443 my ( $matched ) = grep { $groupstr eq wit_stringify( $groupings->{$_} ) }
445 delete $groupings->{$rdg};
446 # If we found a group match, assume there is a symmetry happening.
447 # TODO think more about this
448 # print STDERR "*** Deleting symmetric reading $rdg\n";
450 delete $transposed->{$rdg};
451 warn "Found problem in evident symmetry with reading $rdg";
455 # Otherwise 'unhook' the transposed reading(s) that have duplicates.
457 foreach my $dup ( @doubled ) {
458 foreach my $rdg ( @{$seen_wits{$dup}} ) {
459 next if $thisrank{$rdg};
460 next unless exists $groupings->{$rdg};
461 # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
462 delete $groupings->{$rdg};
463 delete $transposed->{$rdg};
466 # and put any now-orphaned readings into an 'omitted' reading.
467 foreach my $wit ( keys %seen_wits ) {
468 unless( grep { exists $groupings->{$_} } @{$seen_wits{$wit}} ) {
469 $groupings->{'(omitted)'} = Set::Scalar->new()
470 unless exists $groupings->{'(omitted)'};
471 _add_to_witlist( $wit, $groupings->{'(omitted)'}, $c->ac_label );
477 # For the given grouping, return its situation graph based on the stemma.
478 sub _graph_for_grouping {
479 my( $stemma, $grouping, $lacunose, $aclabel ) = @_;
482 foreach my $gs ( values %$grouping ) {
484 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
485 push( @$acwits, $1 ) unless $lacunose->has( $1 );
487 $extant->{$_} = 1 unless $lacunose->has( $_ );
493 # contig contains all extant wits and all hypothetical wits
494 # needed to make up the groups.
495 $graph = $stemma->situation_graph( $extant, $acwits, $aclabel );
496 } catch ( Text::Tradition::Error $e ) {
497 throw( "Could not extend graph with given extant and a.c. witnesses: "
500 throw( "Could not extend graph with a.c. witnesses @$acwits" );
505 =head2 solve_variants( $calcdir, @groups )
507 Looks up the set of groups in the answers provided by the external graph solver
508 service and returns a cleaned-up answer, adding the rank IDs back where they belong.
510 The answer has the form
511 { "variants" => [ array of variant location structures ],
512 "variant_count" => total,
513 "conflict_count" => number of conflicts detected,
514 "genealogical_count" => number of solutions found }
521 # Are we using a local result directory, or did we pass an empty value
524 unless( ref( $groups[0] ) eq 'HASH' ) {
525 $dir = shift @groups;
528 ## For each graph/group combo, make a Text::Tradition::Analysis::Result
529 ## object so that we can send it off for IDP lookup.
531 my $genealogical = 0; # counter
532 # TODO Optimize for unique graph problems
534 foreach my $graphproblem ( @groups ) {
535 # Construct the calc result key and look up its answer
536 my $problem = Text::Tradition::Analysis::Result->new(
537 graph => $graphproblem->{'graph'},
538 setlist => [ values %{$graphproblem->{'grouping'}} ] );
539 if( exists $problems{$problem->object_key} ) {
540 $problem = $problems{$problem->object_key};
542 $problems{$problem->object_key} = $problem;
544 $graphproblem->{'object'} = $problem;
549 my $scope = $dir->new_scope;
550 map { $results{$_} = $dir->lookup( $_ ) || $problems{$_} } keys %problems;
552 my $json = JSON->new->allow_blessed->convert_blessed->utf8->encode(
553 [ values %problems ] );
554 # Send it off and get the result
555 # print STDERR "Sending request: " . decode_utf8( $json ) . "\n";
556 my $ua = LWP::UserAgent->new();
557 my $resp = $ua->post( $SOLVER_URL, 'Content-Type' => 'application/json',
558 'Content' => $json );
560 if( $resp->is_success ) {
561 $answer = decode_json( $resp->content );
562 throw( "Unexpected answer from IDP: $answer" ) unless ref( $answer ) eq 'ARRAY';
564 throw( "IDP solver returned " . $resp->status_line . " / " . $resp->content
565 . "; cannot run graph analysis" );
567 # One more sanity check
568 throw( "Something went wrong with answer symmetricity" )
569 unless keys( %problems ) == @$answer;
570 # Convert the results
571 foreach my $a ( @$answer ) {
572 my $r = Text::Tradition::Analysis::Result->new( $a );
573 $results{$r->object_key} = $r;
577 # We now have a single JSON-encoded Result object per problem sent. Fold its
578 # answers into our variant info structure.
579 foreach my $graphproblem ( @groups ) {
580 my $result = $results{$graphproblem->{'object'}->object_key}
581 || $graphproblem->{'object'};
583 # Initialize the result structure for this graph problem
585 if( $result->status eq 'OK' ) {
586 $vstruct = { readings => [] };
587 push( @$variants, $vstruct );
589 push( @$variants, _init_unsolved( $graphproblem, $result->status ) );
593 # 1. Did the group evaluate as genealogical?
594 $vstruct->{genealogical} = $result->is_genealogical;
595 $genealogical++ if $result->is_genealogical;
597 # 2. What are the calculated minimum groupings for each variant loc?
598 foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
599 my $inputset = $graphproblem->{grouping}->{$rid};
600 my $minset = $result->minimum_grouping_for( $inputset );
601 push( @{$vstruct->{readings}}, { readingid => $rid, group => $minset } );
604 # 3. What are the sources and classes calculated for each witness?
605 $vstruct->{witcopy_types} = { $result->classes };
606 $vstruct->{reading_roots} = {};
607 map { $vstruct->{reading_roots}->{$_} = 1 } $result->sources;
611 return { 'variants' => $variants,
612 'variant_count' => scalar @$variants,
613 'genealogical_count' => $genealogical };
617 my( $graphproblem, $status ) = @_;
618 my $vstruct = { 'readings' => [] };
619 $vstruct->{'unsolved'} = $status;
620 foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
621 push( @{$vstruct->{readings}}, { readingid => $rid,
622 group => [ $graphproblem->{grouping}->{$rid}->members ] } );
627 =head2 analyze_location ( $tradition, $graph, $location_hash )
629 Given the tradition, its stemma graph, and the solution from the graph solver,
630 work out the rest of the information we want. For each reading we need missing,
631 conflict, reading_parents, independent_occurrence, followed, not_followed,
632 and follow_unknown. Alters the location_hash in place.
636 sub analyze_location {
637 my ( $tradition, $graph, $variant_row, $lacunose ) = @_;
638 my $c = $tradition->collation;
640 if( exists $variant_row->{'unsolved'} ) {
643 my $reading_roots = delete $variant_row->{'reading_roots'};
644 my $classinfo = delete $variant_row->{'witcopy_types'};
646 # Make a hash of all known node memberships, and make the subgraphs.
649 my $acstr = $c->ac_label;
652 # Note which witnesses positively belong to which group. This information
653 # comes ultimately from the IDP solver.
654 # Also make a note of the reading's roots.
655 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
656 my $rid = $rdghash->{'readingid'};
658 foreach my $wit ( @{$rdghash->{'group'}} ) {
659 $contig->{$wit} = $rid;
660 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
663 if( exists $reading_roots->{$wit} && $reading_roots->{$wit} ) {
664 push( @roots, $wit );
667 $rdghash->{'independent_occurrence'} = \@roots;
670 # Now that we have all the node group memberships, calculate followed/
671 # non-followed/unknown values for each reading. Also figure out the
672 # reading's evident parent(s).
673 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
674 my $rid = $rdghash->{'readingid'};
675 my $rdg = $c->reading( $rid );
676 my @roots = @{$rdghash->{'independent_occurrence'}};
679 @reversions = grep { $classinfo->{$_} eq 'revert' }
680 $rdghash->{'group'}->members;
681 $rdghash->{'reversions'} = \@reversions;
683 my @group = @{$rdghash->{'group'}};
685 # Start figuring things out.
686 $rdghash->{'followed'} = scalar( @group )
687 - ( scalar( @roots ) + scalar( @reversions ) );
688 # Find the parent readings, if any, of this reading.
689 my $sourceparents = _find_reading_parents( $rid, $graph, $contig, @roots );
690 # Work out relationships between readings and their non-followed parent.
691 _resolve_parent_relationships( $c, $rid, $rdg, $sourceparents );
692 $rdghash->{'source_parents'} = $sourceparents;
695 my $revparents = _find_reading_parents( $rid, $graph, $contig, @reversions );
696 _resolve_parent_relationships( $c, $rid, $rdg, $revparents );
697 $rdghash->{'reversion_parents'} = $revparents;
700 # Find the number of times this reading was altered, and the number of
701 # times we're not sure.
702 my( %nofollow, %unknownfollow );
703 foreach my $wit ( @{$rdghash->{'group'}} ) {
704 foreach my $wchild ( $graph->successors( $wit ) ) {
705 if( $reading_roots->{$wchild} && $contig->{$wchild}
706 && $contig->{$wchild} ne $rid ) {
707 # It definitely changed here.
708 $nofollow{$wchild} = 1;
709 } elsif( !($contig->{$wchild}) ) {
710 # The child is a hypothetical node not definitely in
711 # any group. Answer is unknown.
712 $unknownfollow{$wchild} = 1;
713 } # else it is either in our group, or it is a non-root node in a
714 # known group and therefore is presumed to have its reading from
715 # its group, not this link.
718 $rdghash->{'not_followed'} = keys %nofollow;
719 $rdghash->{'follow_unknown'} = keys %unknownfollow;
721 # Now say whether this reading represents a conflict.
722 unless( $variant_row->{'genealogical'} ) {
723 $rdghash->{'is_conflict'} = @roots != 1;
724 $rdghash->{'is_reverted'} = scalar @reversions;
729 sub _find_reading_parents {
730 my( $rid, $graph, $contig, @list ) = @_;
732 foreach my $wit ( @list ) {
733 # Look in the stemma graph to find this witness's extant or known-reading
734 # immediate ancestor(s), and look up the reading that each ancestor holds.
735 my @check = $graph->predecessors( $wit );
738 foreach my $wparent( @check ) {
739 my $preading = $contig->{$wparent};
740 if( $preading && $preading ne $rid ) {
741 $parenthash->{$preading} = 1;
743 push( @next, $graph->predecessors( $wparent ) );
752 sub _resolve_parent_relationships {
753 my( $c, $rid, $rdg, $rdgparents ) = @_;
754 foreach my $p ( keys %$rdgparents ) {
755 # Resolve the relationship of the parent to the reading, and
756 # save it in our hash.
757 my $pobj = $c->reading( $p );
758 my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
759 my $phash = { 'label' => $prep };
761 my $rel = $c->get_relationship( $p, $rid );
763 _add_to_hash( $rel, $phash );
765 # First check for a transposed relationship
766 if( $rdg->rank != $pobj->rank ) {
767 foreach my $ti ( $rdg->related_readings( 'transposition' ) ) {
768 next unless $ti->text eq $rdg->text;
769 $rel = $c->get_relationship( $ti, $pobj );
771 _add_to_hash( $rel, $phash, 1 );
776 foreach my $ti ( $pobj->related_readings( 'transposition' ) ) {
777 next unless $ti->text eq $pobj->text;
778 $rel = $c->get_relationship( $ti, $rdg );
780 _add_to_hash( $rel, $phash, 1 );
787 # and then check for sheer word similarity.
788 my $rtext = $rdg->text;
789 my $ptext = $pobj->text;
790 if( similar( $rtext, $ptext ) ) {
791 # say STDERR "Words $rtext and $ptext judged similar";
792 $phash->{relation} = { type => 'wordsimilar' };
796 $phash->{relation} = { type => 'deletion' };
798 # Get the attributes of the parent object while we are here
799 $phash->{'text'} = $pobj->text if $pobj;
800 $phash->{'is_nonsense'} = $pobj->is_nonsense;
801 $phash->{'is_ungrammatical'} = $pobj->grammar_invalid;
802 } elsif( $p eq '(omitted)' ) {
803 $phash->{relation} = { type => 'addition' };
806 $rdgparents->{$p} = $phash;
811 my( $rel, $phash, $is_transposed ) = @_;
812 $phash->{relation} = { type => $rel->type };
813 $phash->{relation}->{transposed} = 1 if $is_transposed;
814 $phash->{relation}->{annotation} = $rel->annotation
815 if $rel->has_annotation;
818 =head2 similar( $word1, $word2 )
820 Use Algorithm::Diff to get a sense of how close the words are to each other.
821 This will hopefully handle substitutions a bit more nicely than Levenshtein.
828 my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_;
829 my @let1 = split( '', lc( $word1 ) );
830 my @let2 = split( '', lc( $word2 ) );
831 my $diff = Algorithm::Diff->new( \@let1, \@let2 );
833 while( $diff->Next ) {
835 # Take off points for longer strings
836 my $cs = $diff->Range(1) - 2;
839 } elsif( !$diff->Items(1) ) {
840 $mag += $diff->Range(2);
841 } elsif( !$diff->Items(2) ) {
842 $mag += $diff->Range(1);
844 # Split the difference for substitutions
845 my $c1 = $diff->Range(1) || 1;
846 my $c2 = $diff->Range(2) || 1;
847 my $cd = ( $c1 + $c2 ) / 2;
851 return ( $mag <= length( $word1 ) / 2 );
854 sub _useful_variant {
855 my( $rankgroup, $rankgraph, $acstr ) = @_;
857 # Sort by group size and return
859 foreach my $rdg ( keys %$rankgroup ) {
860 my @wits = $rankgroup->{$rdg}->members;
864 $is_useful++ unless( $rankgraph->is_sink_vertex( $wits[0] )
865 || $wits[0] =~ /\Q$acstr\E$/ );
868 return $is_useful > 1;
871 =head2 wit_stringify( $groups )
873 Takes an array of witness groupings and produces a string like
874 ['A','B'] / ['C','D','E'] / ['F']
881 # If we were passed an array of witnesses instead of an array of
882 # groupings, then "group" the witnesses first.
883 unless( ref( $groups->[0] ) ) {
884 my $mkgrp = [ $groups ];
887 foreach my $g ( @$groups ) {
888 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
890 return join( ' / ', @gst );
896 Text::Tradition::Error->throw(
897 'ident' => 'Analysis error',
904 This package is free software and is provided "as is" without express
905 or implied warranty. You can redistribute it and/or modify it under
906 the same terms as Perl itself.
910 Tara L Andrews E<lt>aurum@cpan.orgE<gt>