1 package Text::Tradition::Analysis;
5 use Algorithm::Diff; # for word similarity measure
6 use Encode qw/ decode_utf8 encode_utf8 /;
9 use JSON qw/ to_json decode_json /;
12 use Text::Tradition::Analysis::Result;
13 use Text::Tradition::Directory;
14 use Text::Tradition::Stemma;
17 use vars qw/ @EXPORT_OK /;
18 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
20 my $SOLVER_URL = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
21 my $unsolved_problems = {};
25 Text::Tradition::Analysis - functions for stemma analysis of a tradition
30 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
31 my $t = Text::Tradition->new(
32 'name' => 'this is a text',
34 'file' => '/path/to/tei_parallel_seg_file.xml' );
35 $t->add_stemma( 'dotfile' => $stemmafile );
37 my $variant_data = run_analysis( $tradition );
38 # Recalculate rank $n treating all orthographic variants as equivalent
39 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
43 Text::Tradition is a library for representation and analysis of collated
44 texts, particularly medieval ones. The Collation is the central feature of
45 a Tradition, where the text, its sequence of readings, and its relationships
46 between readings are actually kept.
50 =head2 run_analysis( $tradition, %opts )
52 Runs the analysis described in analyze_variant_location on every location in the
53 collation of the given tradition, with the given options. These include:
57 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
58 is 0 (i.e. the first).
60 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
62 =item * merge_types - Specify a list of relationship types, where related readings
63 should be treated as identical for the purposes of analysis.
65 =item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
72 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
74 my $datafile = 't/data/florilegium_tei_ps.xml';
75 my $tradition = Text::Tradition->new( 'input' => 'TEI',
77 'file' => $datafile );
78 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
79 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
81 my %expected_genealogical = (
112 my $data = run_analysis( $tradition, calcdsn => 'dbi:SQLite:dbname=t/data/analysis.db' );
113 my $c = $tradition->collation;
114 foreach my $row ( @{$data->{'variants'}} ) {
115 # Account for rows that used to be "not useful"
116 unless( exists $expected_genealogical{$row->{'id'}} ) {
117 $expected_genealogical{$row->{'id'}} = 1;
119 my $gen_bool = $row->{'genealogical'} ? 1 : 0;
120 is( $gen_bool, $expected_genealogical{$row->{'id'}},
121 "Got correct genealogical flag for row " . $row->{'id'} );
122 # Check that we have the right row with the right groups
123 my $rank = $row->{'id'};
124 foreach my $rdghash ( @{$row->{'readings'}} ) {
125 # Skip 'readings' that aren't really
126 next unless $c->reading( $rdghash->{'readingid'} );
128 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
129 "Got correct reading rank" );
130 # Check the witnesses
131 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
132 my @sgrp = sort @{$rdghash->{'group'}};
133 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
136 is( $data->{'variant_count'}, 58, "Got right total variant number" );
137 # TODO Make something meaningful of conflict count, maybe test other bits
144 my( $tradition, %opts ) = @_;
145 my $c = $tradition->collation;
146 my $aclabel = $c->ac_label;
148 my $stemma_id = $opts{'stemma_id'} || 0;
149 my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
150 my $collapse = Set::Scalar->new();
151 if( $opts{'merge_types'} && ref( $opts{'merge_types'} ) eq 'ARRAY' ) {
152 $collapse->insert( @{$opts{'merge_types'}} );
153 } elsif( $opts{'merge_types'} ) {
154 $collapse->insert( $opts{'merge_types'} );
157 # Make sure we have a lookup DB for graph calculation results; this will die
158 # if calcdir or calcdsn isn't passed.
160 if( exists $opts{'calcdir'} ) {
161 $dir = delete $opts{'calcdir'}
162 } elsif ( exists $opts{'calcdsn'} ) {
163 $dir = Text::Tradition::Directory->new( dsn => $opts{'calcdsn'} );
167 my $stemma = $tradition->stemma( $stemma_id );
169 # Figure out which witnesses we are working with - that is, the ones that
170 # appear both in the stemma and in the tradition. All others are 'lacunose'
172 my $lacunose = Set::Scalar->new( $stemma->hypotheticals );
173 my $stemma_wits = Set::Scalar->new( $stemma->witnesses );
174 my $tradition_wits = Set::Scalar->new( map { $_->sigil } $tradition->witnesses );
175 $lacunose->insert( $stemma_wits->symmetric_difference( $tradition_wits )->members );
177 # Find and mark 'common' ranks for exclusion, unless they were
178 # explicitly specified.
181 foreach my $rdg ( $c->common_readings ) {
182 $common_rank{$rdg->rank} = 1;
184 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
187 # Group the variants to send to the solver
192 foreach my $rank ( @ranks ) {
193 my $missing = $lacunose->clone();
194 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, $collapse );
195 # Filter out any empty rankgroups
196 # (e.g. from the later rank for a transposition)
197 next unless keys %$rankgroup;
198 # Get the graph for this rankgroup
199 my $rankgraph = _graph_for_grouping( $stemma, $rankgroup, $missing, $aclabel );
200 if( $opts{'exclude_type1'} ) {
201 # Check to see whether this is a "useful" group.
202 next unless _useful_variant( $rankgroup, $rankgraph, $aclabel );
204 push( @use_ranks, $rank );
205 push( @groups, { grouping => $rankgroup, graph => $rankgraph } );
206 $lacunae{$rank} = $missing;
209 my $answer = solve_variants( $dir, @groups );
211 # Do further analysis on the answer
212 my $conflict_count = 0;
213 my $reversion_count = 0;
214 foreach my $idx ( 0 .. $#use_ranks ) {
215 my $location = $answer->{'variants'}->[$idx];
216 # Add the rank back in
217 my $rank = $use_ranks[$idx];
218 $location->{'id'} = $rank;
219 # Note what our lacunae are
221 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
222 $location->{'missing'} = [ keys %lmiss ];
224 # Run the extra analysis we need.
225 ## TODO We run through all the variants in this call, so
226 ## why not add the reading data there instead of here below?
227 my $graph = $groups[$idx]->{graph};
228 analyze_location( $tradition, $graph, $location, \%lmiss );
231 # Do the final post-analysis tidying up of the data.
232 foreach my $rdghash ( @{$location->{'readings'}} ) {
233 $conflict_count++ if $rdghash->{'is_conflict'};
234 $reversion_count++ if $rdghash->{'is_reverted'};
235 # Add the reading text back in, setting display value as needed
236 my $rdg = $c->reading( $rdghash->{'readingid'} );
238 $rdghash->{'text'} = $rdg->text .
239 ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
240 $rdghash->{'is_ungrammatical'} = $rdg->grammar_invalid;
241 $rdghash->{'is_nonsense'} = $rdg->is_nonsense;
243 # Remove lacunose witnesses from this reading's list now that the
246 map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
247 $rdghash->{'group'} = \@realgroup;
248 # Note any layered witnesses that appear in this group
249 foreach( @realgroup ) {
250 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
251 push( @layerwits, $1 );
255 $location->{'layerwits'} = \@layerwits if @layerwits;
257 $answer->{'conflict_count'} = $conflict_count;
258 $answer->{'reversion_count'} = $reversion_count;
263 =head2 group_variants( $tradition, $rank, $lacunose, $transposed, $merge_relationship_types )
265 Groups the variants at the given $rank of the collation, treating any
266 relationships in the set $merge_relationship_types as equivalent.
267 $lacunose should be a reference to an array, to which the sigla of lacunose
268 witnesses at this rank will be appended; $transposed should be a reference
269 to a hash, wherein the identities of transposed readings and their
270 relatives will be stored.
272 Returns a hash $group_readings where $rdg is attested by the witnesses listed
273 in $group_readings->{$rdg}.
277 # Return group_readings, groups, lacunose
279 my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
280 my $c = $tradition->collation;
281 my $aclabel = $c->ac_label;
282 my $table = $c->alignment_table;
283 # Get the alignment table readings
284 my %readings_at_rank;
285 my $check_for_gaps = Set::Scalar->new();
287 my $has_transposition;
288 foreach my $tablewit ( @{$table->{'alignment'}} ) {
289 my $rdg = $tablewit->{'tokens'}->[$rank-1];
290 my $wit = $tablewit->{'witness'};
291 # Exclude the witness if it is "lacunose" which if we got here
292 # means "not in the stemma".
293 next if _is_lacunose( $wit, $lacunose, $aclabel );
294 # Note if the witness is actually in a lacuna
295 if( $rdg && $rdg->{'t'}->is_lacuna ) {
296 _add_to_witlist( $wit, $lacunose, $aclabel );
297 # Otherwise the witness either has a positive reading...
299 # If the reading has been counted elsewhere as a transposition, ignore it.
300 if( $transposed->{$rdg->{'t'}->id} ) {
301 # TODO Does this cope with three-way transpositions?
302 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
305 # Otherwise, record it...
306 $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
307 # ...and grab any transpositions, and their relations.
308 my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings();
309 foreach my $trdg ( @transp ) {
310 next if exists $readings_at_rank{$trdg->id};
311 $has_transposition = 1;
312 my @affected_wits = _table_witnesses(
313 $table, $trdg, $lacunose, $aclabel );
314 next unless @affected_wits;
315 map { $moved_wits{$_} = 1 } @affected_wits;
316 $transposed->{$trdg->id} =
317 [ _table_witnesses( $table, $rdg->{'t'}, $lacunose, $aclabel ) ];
318 $readings_at_rank{$trdg->id} = $trdg;
320 # ...or it is empty, ergo a gap.
322 _add_to_witlist( $wit, $check_for_gaps, $aclabel );
325 my $gap_wits = Set::Scalar->new();
326 map { _add_to_witlist( $_, $gap_wits, $aclabel )
327 unless $moved_wits{$_} } $check_for_gaps->members;
329 # Group the readings, collapsing groups by relationship if needed.
330 my $grouped_readings = {};
331 foreach my $rdg ( values %readings_at_rank ) {
332 # Skip readings that have been collapsed into others.
333 next if exists $grouped_readings->{$rdg->id}
334 && $grouped_readings->{$rdg->id} eq 'COLLAPSE';
335 # Get the witness list, including from readings collapsed into this one.
336 my @wits = _table_witnesses( $table, $rdg, $lacunose, $aclabel );
337 if( $collapse && $collapse->size ) {
338 my $filter = sub { $collapse->has( $_[0]->type ) };
339 foreach my $other ( $rdg->related_readings( $filter ) ) {
340 my @otherwits = _table_witnesses( $table, $other, $lacunose, $aclabel );
341 push( @wits, @otherwits );
342 $grouped_readings->{$other->id} = 'COLLAPSE';
345 $grouped_readings->{$rdg->id} = Set::Scalar->new( @wits );
347 if( $gap_wits->members ) {
348 $grouped_readings->{'(omitted)'} = $gap_wits;
351 # Get rid of our collapsed readings
352 map { delete $grouped_readings->{$_} if(
353 $grouped_readings->{$_} eq 'COLLAPSE'
354 || $grouped_readings->{$_}->is_empty ) }
355 keys %$grouped_readings;
357 # If something was transposed, check the groups for doubled-up readings
358 if( $has_transposition ) {
359 # print STDERR "Group for rank $rank:\n";
360 # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" }
361 # keys %$grouped_readings;
362 _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings );
366 return $grouped_readings;
369 # Helper function to query the alignment table for all witnesses (a.c. included)
370 # that have a given reading at its rank.
371 sub _table_witnesses {
372 my( $table, $trdg, $lacunose, $aclabel ) = @_;
373 my $tableidx = $trdg->rank - 1;
374 my $has_reading = Set::Scalar->new();
375 foreach my $row ( @{$table->{'alignment'}} ) {
376 my $wit = $row->{'witness'};
377 next if _is_lacunose( $wit, $lacunose, $aclabel );
378 my $rdg = $row->{'tokens'}->[$tableidx];
379 next unless exists $rdg->{'t'} && defined $rdg->{'t'};
380 _add_to_witlist( $wit, $has_reading, $aclabel )
381 if $rdg->{'t'}->id eq $trdg->id;
383 return $has_reading->members;
386 # Helper function to see if a witness is lacunose even if we are asking about
389 my ( $wit, $lac, $acstr ) = @_;
390 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
393 return $lac->has( $wit );
396 # Helper function to ensure that X and X a.c. never appear in the same list.
397 sub _add_to_witlist {
398 my( $wit, $list, $acstr ) = @_;
399 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
400 # Don't add X a.c. if we already have X
401 return if $list->has( $1 );
403 # Delete X a.c. if we are about to add X
404 $list->delete( $wit.$acstr );
406 $list->insert( $wit );
409 sub _check_transposed_consistency {
410 my( $c, $rank, $transposed, $groupings ) = @_;
413 # Note which readings are actually at this rank, and which witnesses
414 # belong to which reading.
415 foreach my $rdg ( keys %$groupings ) {
416 my $rdgobj = $c->reading( $rdg );
417 # Count '(omitted)' as a reading at this rank
418 $thisrank{$rdg} = 1 if !$rdgobj || $rdgobj->rank == $rank;
419 map { push( @{$seen_wits{$_}}, $rdg ) } @{$groupings->{$rdg}};
421 # Our work is done if we have no witness belonging to more than one
423 my @doubled = grep { scalar @{$seen_wits{$_}} > 1 } keys %seen_wits;
424 return unless @doubled;
425 # If we have a symmetric related transposition, drop the non-rank readings.
426 if( @doubled == scalar keys %seen_wits ) {
427 foreach my $rdg ( keys %$groupings ) {
428 if( !$thisrank{$rdg} ) {
429 my $groupstr = wit_stringify( $groupings->{$rdg} );
430 my ( $matched ) = grep { $groupstr eq wit_stringify( $groupings->{$_} ) }
432 delete $groupings->{$rdg};
433 # If we found a group match, assume there is a symmetry happening.
434 # TODO think more about this
435 # print STDERR "*** Deleting symmetric reading $rdg\n";
437 delete $transposed->{$rdg};
438 warn "Found problem in evident symmetry with reading $rdg";
442 # Otherwise 'unhook' the transposed reading(s) that have duplicates.
444 foreach my $dup ( @doubled ) {
445 foreach my $rdg ( @{$seen_wits{$dup}} ) {
446 next if $thisrank{$rdg};
447 next unless exists $groupings->{$rdg};
448 # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
449 delete $groupings->{$rdg};
450 delete $transposed->{$rdg};
453 # and put any now-orphaned readings into an 'omitted' reading.
454 foreach my $wit ( keys %seen_wits ) {
455 unless( grep { exists $groupings->{$_} } @{$seen_wits{$wit}} ) {
456 $groupings->{'(omitted)'} = Set::Scalar->new()
457 unless exists $groupings->{'(omitted)'};
458 _add_to_witlist( $wit, $groupings->{'(omitted)'}, $c->ac_label );
464 # For the given grouping, return its situation graph based on the stemma.
465 sub _graph_for_grouping {
466 my( $stemma, $grouping, $lacunose, $aclabel ) = @_;
469 foreach my $gs ( values %$grouping ) {
471 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
472 push( @$acwits, $1 ) unless $lacunose->has( $1 );
474 $extant->{$_} = 1 unless $lacunose->has( $_ );
480 # contig contains all extant wits and all hypothetical wits
481 # needed to make up the groups.
482 $graph = $stemma->situation_graph( $extant, $acwits );
483 } catch ( Text::Tradition::Error $e ) {
484 throw( "Could not extend graph with given extant and a.c. witnesses: "
487 throw( "Could not extend graph with a.c. witnesses @$acwits" );
492 =head2 solve_variants( $calcdir, @groups )
494 Looks up the set of groups in the answers provided by the external graph solver
495 service and returns a cleaned-up answer, adding the rank IDs back where they belong.
497 The answer has the form
498 { "variants" => [ array of variant location structures ],
499 "variant_count" => total,
500 "conflict_count" => number of conflicts detected,
501 "genealogical_count" => number of solutions found }
508 # Are we using a local result directory, or did we pass an empty value
511 unless( ref( $groups[0] ) eq 'HASH' ) {
512 $dir = shift @groups;
515 ## For each graph/group combo, make a Text::Tradition::Analysis::Result
516 ## object so that we can send it off for IDP lookup.
518 my $genealogical = 0; # counter
519 # TODO Optimize for unique graph problems
521 foreach my $graphproblem ( @groups ) {
522 # Construct the calc result key and look up its answer
523 my $problem = Text::Tradition::Analysis::Result->new(
524 graph => $graphproblem->{'graph'},
525 setlist => [ values %{$graphproblem->{'grouping'}} ] );
526 push( @problems, $problem );
531 my $scope = $dir->new_scope;
532 @results = map { $dir->lookup( $_->object_key ) || $_ } @problems;
534 my $json = JSON->new->allow_blessed->convert_blessed->utf8->encode( \@problems );
535 # Send it off and get the result
536 # print STDERR "Sending request: " . decode_utf8( $json ) . "\n";
537 my $ua = LWP::UserAgent->new();
538 my $resp = $ua->post( $SOLVER_URL, 'Content-Type' => 'application/json',
539 'Content' => $json );
541 if( $resp->is_success ) {
542 $answer = decode_json( $resp->content );
543 throw( "Unexpected answer from IDP: $answer" ) unless ref( $answer ) eq 'ARRAY';
545 throw( "IDP solver returned " . $resp->status_line . " / " . $resp->content
546 . "; cannot run graph analysis" );
548 # One more sanity check
549 throw( "Something went wrong with answer symmetricity" )
550 unless @groups == @$answer;
551 # Convert the results
552 @results = map { Text::Tradition::Analysis::Result->new( $_ ) } @$answer;
555 # We now have a single JSON-encoded Result object per problem sent. Fold its
556 # answers into our variant info structure.
557 foreach my $idx ( 0 .. $#groups ) {
558 my $graphproblem = $groups[$idx];
559 my $result = $results[$idx];
561 # Initialize the result structure for this graph problem
562 my $vstruct = { readings => [] };
563 push( @$variants, $vstruct );
565 # 0. Do we have a calculated result at all?
566 unless( $result->status eq 'OK' ) {
567 $vstruct->{'unsolved'} = $result->status;
568 foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
569 push( @{$vstruct->{readings}}, { readingid => $rid,
570 group => [ $graphproblem->{grouping}->{$rid}->members ] } );
575 # 1. Did the group evaluate as genealogical?
576 $vstruct->{genealogical} = $result->is_genealogical;
577 $genealogical++ if $result->is_genealogical;
579 # 2. What are the calculated minimum groupings for each variant loc?
580 foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
581 my $inputset = $graphproblem->{grouping}->{$rid};
582 my $minset = $result->minimum_grouping_for( $inputset );
583 push( @{$vstruct->{readings}}, { readingid => $rid, group => $minset } );
586 # 3. What are the sources and classes calculated for each witness?
587 $vstruct->{witcopy_types} = { $result->classes };
588 $vstruct->{reading_roots} = {};
589 map { $vstruct->{reading_roots}->{$_} = 1 } $result->sources;
593 return { 'variants' => $variants,
594 'variant_count' => scalar @$variants,
595 'genealogical_count' => $genealogical };
598 =head2 analyze_location ( $tradition, $graph, $location_hash )
600 Given the tradition, its stemma graph, and the solution from the graph solver,
601 work out the rest of the information we want. For each reading we need missing,
602 conflict, reading_parents, independent_occurrence, followed, not_followed,
603 and follow_unknown. Alters the location_hash in place.
607 sub analyze_location {
608 my ( $tradition, $graph, $variant_row, $lacunose ) = @_;
609 my $c = $tradition->collation;
611 if( exists $variant_row->{'unsolved'} ) {
614 my $reading_roots = delete $variant_row->{'reading_roots'};
615 my $classinfo = delete $variant_row->{'witcopy_types'};
617 # Make a hash of all known node memberships, and make the subgraphs.
620 my $acstr = $c->ac_label;
623 # Note which witnesses positively belong to which group. This information
624 # comes ultimately from the IDP solver.
625 # Also make a note of the reading's roots.
626 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
627 my $rid = $rdghash->{'readingid'};
629 foreach my $wit ( @{$rdghash->{'group'}} ) {
630 $contig->{$wit} = $rid;
631 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
634 if( exists $reading_roots->{$wit} && $reading_roots->{$wit} ) {
635 push( @roots, $wit );
638 $rdghash->{'independent_occurrence'} = \@roots;
641 # Now that we have all the node group memberships, calculate followed/
642 # non-followed/unknown values for each reading. Also figure out the
643 # reading's evident parent(s).
644 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
645 my $rid = $rdghash->{'readingid'};
646 my $rdg = $c->reading( $rid );
647 my @roots = @{$rdghash->{'independent_occurrence'}};
650 @reversions = grep { $classinfo->{$_} eq 'revert' }
651 $rdghash->{'group'}->members;
652 $rdghash->{'reversions'} = \@reversions;
654 my @group = @{$rdghash->{'group'}};
656 # Start figuring things out.
657 $rdghash->{'followed'} = scalar( @group )
658 - ( scalar( @roots ) + scalar( @reversions ) );
659 # Find the parent readings, if any, of this reading.
660 my $sourceparents = _find_reading_parents( $rid, $graph, $contig, @roots );
661 # Work out relationships between readings and their non-followed parent.
662 _resolve_parent_relationships( $c, $rid, $rdg, $sourceparents );
663 $rdghash->{'source_parents'} = $sourceparents;
666 my $revparents = _find_reading_parents( $rid, $graph, $contig, @reversions );
667 _resolve_parent_relationships( $c, $rid, $rdg, $revparents );
668 $rdghash->{'reversion_parents'} = $revparents;
671 # Find the number of times this reading was altered, and the number of
672 # times we're not sure.
673 my( %nofollow, %unknownfollow );
674 foreach my $wit ( @{$rdghash->{'group'}} ) {
675 foreach my $wchild ( $graph->successors( $wit ) ) {
676 if( $reading_roots->{$wchild} && $contig->{$wchild}
677 && $contig->{$wchild} ne $rid ) {
678 # It definitely changed here.
679 $nofollow{$wchild} = 1;
680 } elsif( !($contig->{$wchild}) ) {
681 # The child is a hypothetical node not definitely in
682 # any group. Answer is unknown.
683 $unknownfollow{$wchild} = 1;
684 } # else it is either in our group, or it is a non-root node in a
685 # known group and therefore is presumed to have its reading from
686 # its group, not this link.
689 $rdghash->{'not_followed'} = keys %nofollow;
690 $rdghash->{'follow_unknown'} = keys %unknownfollow;
692 # Now say whether this reading represents a conflict.
693 unless( $variant_row->{'genealogical'} ) {
694 $rdghash->{'is_conflict'} = @roots != 1;
695 $rdghash->{'is_reverted'} = scalar @reversions;
700 sub _find_reading_parents {
701 my( $rid, $graph, $contig, @list ) = @_;
703 foreach my $wit ( @list ) {
704 # Look in the stemma graph to find this witness's extant or known-reading
705 # immediate ancestor(s), and look up the reading that each ancestor holds.
706 my @check = $graph->predecessors( $wit );
709 foreach my $wparent( @check ) {
710 my $preading = $contig->{$wparent};
711 if( $preading && $preading ne $rid ) {
712 $parenthash->{$preading} = 1;
714 push( @next, $graph->predecessors( $wparent ) );
723 sub _resolve_parent_relationships {
724 my( $c, $rid, $rdg, $rdgparents ) = @_;
725 foreach my $p ( keys %$rdgparents ) {
726 # Resolve the relationship of the parent to the reading, and
727 # save it in our hash.
728 my $pobj = $c->reading( $p );
729 my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
730 my $phash = { 'label' => $prep };
732 my $rel = $c->get_relationship( $p, $rid );
734 _add_to_hash( $rel, $phash );
736 # First check for a transposed relationship
737 if( $rdg->rank != $pobj->rank ) {
738 foreach my $ti ( $rdg->related_readings( 'transposition' ) ) {
739 next unless $ti->text eq $rdg->text;
740 $rel = $c->get_relationship( $ti, $pobj );
742 _add_to_hash( $rel, $phash, 1 );
747 foreach my $ti ( $pobj->related_readings( 'transposition' ) ) {
748 next unless $ti->text eq $pobj->text;
749 $rel = $c->get_relationship( $ti, $rdg );
751 _add_to_hash( $rel, $phash, 1 );
758 # and then check for sheer word similarity.
759 my $rtext = $rdg->text;
760 my $ptext = $pobj->text;
761 if( similar( $rtext, $ptext ) ) {
762 # say STDERR "Words $rtext and $ptext judged similar";
763 $phash->{relation} = { type => 'wordsimilar' };
767 $phash->{relation} = { type => 'deletion' };
769 # Get the attributes of the parent object while we are here
770 $phash->{'text'} = $pobj->text if $pobj;
771 $phash->{'is_nonsense'} = $pobj->is_nonsense;
772 $phash->{'is_ungrammatical'} = $pobj->grammar_invalid;
773 } elsif( $p eq '(omitted)' ) {
774 $phash->{relation} = { type => 'addition' };
777 $rdgparents->{$p} = $phash;
782 my( $rel, $phash, $is_transposed ) = @_;
783 $phash->{relation} = { type => $rel->type };
784 $phash->{relation}->{transposed} = 1 if $is_transposed;
785 $phash->{relation}->{annotation} = $rel->annotation
786 if $rel->has_annotation;
789 =head2 similar( $word1, $word2 )
791 Use Algorithm::Diff to get a sense of how close the words are to each other.
792 This will hopefully handle substitutions a bit more nicely than Levenshtein.
799 my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_;
800 my @let1 = split( '', lc( $word1 ) );
801 my @let2 = split( '', lc( $word2 ) );
802 my $diff = Algorithm::Diff->new( \@let1, \@let2 );
804 while( $diff->Next ) {
806 # Take off points for longer strings
807 my $cs = $diff->Range(1) - 2;
810 } elsif( !$diff->Items(1) ) {
811 $mag += $diff->Range(2);
812 } elsif( !$diff->Items(2) ) {
813 $mag += $diff->Range(1);
815 # Split the difference for substitutions
816 my $c1 = $diff->Range(1) || 1;
817 my $c2 = $diff->Range(2) || 1;
818 my $cd = ( $c1 + $c2 ) / 2;
822 return ( $mag <= length( $word1 ) / 2 );
826 my( $group, $graph ) = @_;
828 # Record the existence of the vertices in the group
829 map { $relevant->{$_} = 1 } @$group;
831 my $subgraph = $graph->deep_copy;
832 map { $subgraph->delete_vertex( $_ ) unless $relevant->{$_} }
834 # Now prune and return the remaining vertices.
835 _prune_subtree( $subgraph );
836 # Return the list of vertices and the list of roots.
837 my $pruned_group = [ sort $subgraph->vertices ];
838 my $pruned_roots = [ $subgraph->predecessorless_vertices ];
839 return( $pruned_group, $pruned_roots );
845 # Delete lacunose witnesses that have no successors
846 my @orphan_hypotheticals;
849 throw( "Infinite loop on leaves" ) if $ctr > 100;
850 @orphan_hypotheticals =
851 grep { $tree->get_vertex_attribute( $_, 'class' ) eq 'hypothetical' }
852 $tree->successorless_vertices;
853 $tree->delete_vertices( @orphan_hypotheticals );
855 } while( @orphan_hypotheticals );
857 # Delete lacunose roots that have a single successor
861 throw( "Infinite loop on roots" ) if $ctr > 100;
863 grep { $tree->get_vertex_attribute( $_, 'class' ) eq 'hypothetical'
864 && $tree->successors( $_ ) == 1 }
865 $tree->predecessorless_vertices;
866 $tree->delete_vertices( @redundant_root );
868 } while( @redundant_root );
871 sub _useful_variant {
872 my( $rankgroup, $rankgraph, $acstr ) = @_;
874 # Sort by group size and return
876 foreach my $rdg ( keys %$rankgroup ) {
877 my @wits = $rankgroup->{$rdg}->members;
881 $is_useful++ unless( $rankgraph->is_sink_vertex( $wits[0] )
882 || $wits[0] =~ /\Q$acstr\E$/ );
885 return $is_useful > 1;
888 =head2 wit_stringify( $groups )
890 Takes an array of witness groupings and produces a string like
891 ['A','B'] / ['C','D','E'] / ['F']
898 # If we were passed an array of witnesses instead of an array of
899 # groupings, then "group" the witnesses first.
900 unless( ref( $groups->[0] ) ) {
901 my $mkgrp = [ $groups ];
904 foreach my $g ( @$groups ) {
905 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
907 return join( ' / ', @gst );
913 Text::Tradition::Error->throw(
914 'ident' => 'Analysis error',
921 This package is free software and is provided "as is" without express
922 or implied warranty. You can redistribute it and/or modify it under
923 the same terms as Perl itself.
927 Tara L Andrews E<lt>aurum@cpan.orgE<gt>