1 package Text::Tradition::Analysis;
5 use Algorithm::Diff; # for word similarity measure
6 use Encode qw/ decode_utf8 encode_utf8 /;
9 use JSON qw/ to_json decode_json /;
12 use Text::Tradition::Analysis::Result;
13 use Text::Tradition::Directory;
14 use Text::Tradition::Stemma;
17 use vars qw/ @EXPORT_OK $VERSION /;
18 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
22 my $SOLVER_URL = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
23 my $unsolved_problems = {};
27 Text::Tradition::Analysis - functions for stemma analysis of a tradition
32 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
33 my $t = Text::Tradition->new(
34 'name' => 'this is a text',
36 'file' => '/path/to/tei_parallel_seg_file.xml' );
37 $t->add_stemma( 'dotfile' => $stemmafile );
39 my $variant_data = run_analysis( $tradition );
40 # Recalculate rank $n treating all orthographic variants as equivalent
41 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
45 Text::Tradition is a library for representation and analysis of collated
46 texts, particularly medieval ones. Where the Collation is the central feature of
47 a Tradition, it may also have one or more temmata associated with it, and these stemmata may be analyzed. This package provides the following modules:
51 =item * L<Text::Tradition::HasStemma> - a role that can be composed into Text::Tradition objects, providing the ability for Text::Tradition::Stemma objects to be associated with them.
53 =item * L<Text::Tradition::Stemma> - an object class that represents stemma hypotheses, both rooted (with a single archetype) and unrooted (e.g. phylogenetic trees).
55 =item * Text::Tradition::Analysis (this package). Provides functions for the analysis of a given stemma against the collation within a given Tradition.
61 =head2 run_analysis( $tradition, %opts )
63 Runs the analysis described in analyze_variant_location on every location in the
64 collation of the given tradition, with the given options. These include:
68 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
69 is 0 (i.e. the first).
71 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
73 =item * merge_types - Specify a list of relationship types, where related readings
74 should be treated as identical for the purposes of analysis.
76 =item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
83 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
85 my $datafile = 't/data/florilegium_tei_ps.xml';
86 my $tradition = Text::Tradition->new( 'input' => 'TEI',
88 'file' => $datafile );
89 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
90 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
92 my %expected_genealogical = (
123 my $data = run_analysis( $tradition, calcdsn => 'dbi:SQLite:dbname=t/data/analysis.db' );
124 my $c = $tradition->collation;
125 foreach my $row ( @{$data->{'variants'}} ) {
126 # Account for rows that used to be "not useful"
127 unless( exists $expected_genealogical{$row->{'id'}} ) {
128 $expected_genealogical{$row->{'id'}} = 1;
130 my $gen_bool = $row->{'genealogical'} ? 1 : 0;
131 is( $gen_bool, $expected_genealogical{$row->{'id'}},
132 "Got correct genealogical flag for row " . $row->{'id'} );
133 # Check that we have the right row with the right groups
134 my $rank = $row->{'id'};
135 foreach my $rdghash ( @{$row->{'readings'}} ) {
136 # Skip 'readings' that aren't really
137 next unless $c->reading( $rdghash->{'readingid'} );
139 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
140 "Got correct reading rank" );
141 # Check the witnesses
142 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
143 my @sgrp = sort @{$rdghash->{'group'}};
144 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
147 is( $data->{'variant_count'}, 58, "Got right total variant number" );
148 # TODO Make something meaningful of conflict count, maybe test other bits
155 my( $tradition, %opts ) = @_;
156 my $c = $tradition->collation;
157 my $aclabel = $c->ac_label;
159 my $stemma_id = $opts{'stemma_id'} || 0;
160 my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
161 my $collapse = Set::Scalar->new();
162 if( $opts{'merge_types'} && ref( $opts{'merge_types'} ) eq 'ARRAY' ) {
163 $collapse->insert( @{$opts{'merge_types'}} );
164 } elsif( $opts{'merge_types'} ) {
165 $collapse->insert( $opts{'merge_types'} );
168 # Make sure we have a lookup DB for graph calculation results; this will die
169 # if calcdir or calcdsn isn't passed.
171 if( exists $opts{'calcdir'} ) {
172 $dir = delete $opts{'calcdir'}
173 } elsif ( exists $opts{'calcdsn'} ) {
174 $dir = Text::Tradition::Directory->new( dsn => $opts{'calcdsn'} );
178 my $stemma = $tradition->stemma( $stemma_id );
180 # Figure out which witnesses we are working with - that is, the ones that
181 # appear both in the stemma and in the tradition. All others are 'lacunose'
183 my $lacunose = Set::Scalar->new( $stemma->hypotheticals );
184 my $stemma_wits = Set::Scalar->new( $stemma->witnesses );
185 my $tradition_wits = Set::Scalar->new( map { $_->sigil } $tradition->witnesses );
186 $lacunose->insert( $stemma_wits->symmetric_difference( $tradition_wits )->members );
188 # Find and mark 'common' ranks for exclusion, unless they were
189 # explicitly specified.
192 foreach my $rdg ( $c->common_readings ) {
193 $common_rank{$rdg->rank} = 1;
195 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
198 # Group the variants to send to the solver
203 foreach my $rank ( @ranks ) {
204 my $missing = $lacunose->clone();
205 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, $collapse );
206 # Filter out any empty rankgroups
207 # (e.g. from the later rank for a transposition)
208 next unless keys %$rankgroup;
209 # Get the graph for this rankgroup
210 my $rankgraph = _graph_for_grouping( $stemma, $rankgroup, $missing, $aclabel );
211 if( $opts{'exclude_type1'} ) {
212 # Check to see whether this is a "useful" group.
213 next unless _useful_variant( $rankgroup, $rankgraph, $aclabel );
215 push( @use_ranks, $rank );
216 push( @groups, { grouping => $rankgroup, graph => $rankgraph } );
217 $lacunae{$rank} = $missing;
222 $answer = solve_variants( $dir, @groups );
223 } catch ( Text::Tradition::Error $e ) {
224 if( $e->message =~ /IDP/ ) {
225 # Something is wrong with the solver; make the variants table anyway
226 $answer->{'variants'} = [];
227 map { push( @{$answer->{'variants'}}, _init_unsolved( $_, 'IDP error' ) ) }
230 # Something else is wrong; error out.
235 # Do further analysis on the answer
236 my $conflict_count = 0;
237 my $reversion_count = 0;
238 foreach my $idx ( 0 .. $#use_ranks ) {
239 my $location = $answer->{'variants'}->[$idx];
240 # Add the rank back in
241 my $rank = $use_ranks[$idx];
242 $location->{'id'} = $rank;
243 # Note what our lacunae are
245 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
246 $location->{'missing'} = [ keys %lmiss ];
248 # Run the extra analysis we need.
249 ## TODO We run through all the variants in this call, so
250 ## why not add the reading data there instead of here below?
251 my $graph = $groups[$idx]->{graph};
252 analyze_location( $tradition, $graph, $location, \%lmiss );
255 # Do the final post-analysis tidying up of the data.
256 foreach my $rdghash ( @{$location->{'readings'}} ) {
257 $conflict_count++ if $rdghash->{'is_conflict'};
258 $reversion_count++ if $rdghash->{'is_reverted'};
259 # Add the reading text back in, setting display value as needed
260 my $rdg = $c->reading( $rdghash->{'readingid'} );
262 $rdghash->{'text'} = $rdg->text .
263 ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
264 if( $rdg->does( 'Text::Tradition::Morphology' ) ) {
265 $rdghash->{'is_ungrammatical'} = $rdg->grammar_invalid;
266 $rdghash->{'is_nonsense'} = $rdg->is_nonsense;
269 # Remove lacunose witnesses from this reading's list now that the
272 map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
273 $rdghash->{'group'} = \@realgroup;
274 # Note any layered witnesses that appear in this group
275 foreach( @realgroup ) {
276 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
277 push( @layerwits, $1 );
281 $location->{'layerwits'} = \@layerwits if @layerwits;
283 $answer->{'conflict_count'} = $conflict_count;
284 $answer->{'reversion_count'} = $reversion_count;
289 =head2 group_variants( $tradition, $rank, $lacunose, $transposed, $merge_relationship_types )
291 Groups the variants at the given $rank of the collation, treating any
292 relationships in the set $merge_relationship_types as equivalent.
293 $lacunose should be a reference to an array, to which the sigla of lacunose
294 witnesses at this rank will be appended; $transposed should be a reference
295 to a hash, wherein the identities of transposed readings and their
296 relatives will be stored.
298 Returns a hash $group_readings where $rdg is attested by the witnesses listed
299 in $group_readings->{$rdg}.
303 # Return group_readings, groups, lacunose
305 my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
306 my $c = $tradition->collation;
307 my $aclabel = $c->ac_label;
308 my $table = $c->alignment_table;
309 # Get the alignment table readings
310 my %readings_at_rank;
311 my $check_for_gaps = Set::Scalar->new();
313 my $has_transposition;
314 foreach my $tablewit ( @{$table->{'alignment'}} ) {
315 my $rdg = $tablewit->{'tokens'}->[$rank-1];
316 my $wit = $tablewit->{'witness'};
317 # Exclude the witness if it is "lacunose" which if we got here
318 # means "not in the stemma".
319 next if _is_lacunose( $wit, $lacunose, $aclabel );
320 # Note if the witness is actually in a lacuna
321 if( $rdg && $rdg->{'t'}->is_lacuna ) {
322 _add_to_witlist( $wit, $lacunose, $aclabel );
323 # Otherwise the witness either has a positive reading...
325 # If the reading has been counted elsewhere as a transposition, ignore it.
326 if( $transposed->{$rdg->{'t'}->id} ) {
327 # TODO Does this cope with three-way transpositions?
328 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
331 # Otherwise, record it...
332 $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
333 # ...and grab any transpositions, and their relations.
334 my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings();
335 foreach my $trdg ( @transp ) {
336 next if exists $readings_at_rank{$trdg->id};
337 $has_transposition = 1;
338 my @affected_wits = _table_witnesses(
339 $table, $trdg, $lacunose, $aclabel );
340 next unless @affected_wits;
341 map { $moved_wits{$_} = 1 } @affected_wits;
342 $transposed->{$trdg->id} =
343 [ _table_witnesses( $table, $rdg->{'t'}, $lacunose, $aclabel ) ];
344 $readings_at_rank{$trdg->id} = $trdg;
346 # ...or it is empty, ergo a gap.
348 _add_to_witlist( $wit, $check_for_gaps, $aclabel );
351 my $gap_wits = Set::Scalar->new();
352 map { _add_to_witlist( $_, $gap_wits, $aclabel )
353 unless $moved_wits{$_} } $check_for_gaps->members;
355 # Group the readings, collapsing groups by relationship if needed.
356 my $grouped_readings = {};
357 foreach my $rdg ( values %readings_at_rank ) {
358 # Skip readings that have been collapsed into others.
359 next if exists $grouped_readings->{$rdg->id}
360 && $grouped_readings->{$rdg->id} eq 'COLLAPSE';
361 # Get the witness list, including from readings collapsed into this one.
362 my @wits = _table_witnesses( $table, $rdg, $lacunose, $aclabel );
363 if( $collapse && $collapse->size ) {
364 my $filter = sub { $collapse->has( $_[0]->type ) };
365 foreach my $other ( $rdg->related_readings( $filter ) ) {
366 my @otherwits = _table_witnesses( $table, $other, $lacunose, $aclabel );
367 push( @wits, @otherwits );
368 $grouped_readings->{$other->id} = 'COLLAPSE';
371 $grouped_readings->{$rdg->id} = Set::Scalar->new( @wits );
373 if( $gap_wits->members ) {
374 $grouped_readings->{'(omitted)'} = $gap_wits;
377 # Get rid of our collapsed readings
378 map { delete $grouped_readings->{$_} if(
379 $grouped_readings->{$_} eq 'COLLAPSE'
380 || $grouped_readings->{$_}->is_empty ) }
381 keys %$grouped_readings;
383 # If something was transposed, check the groups for doubled-up readings
384 if( $has_transposition ) {
385 # print STDERR "Group for rank $rank:\n";
386 # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" }
387 # keys %$grouped_readings;
388 _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings );
392 return $grouped_readings;
395 # Helper function to query the alignment table for all witnesses (a.c. included)
396 # that have a given reading at its rank.
397 sub _table_witnesses {
398 my( $table, $trdg, $lacunose, $aclabel ) = @_;
399 my $tableidx = $trdg->rank - 1;
400 my $has_reading = Set::Scalar->new();
401 foreach my $row ( @{$table->{'alignment'}} ) {
402 my $wit = $row->{'witness'};
403 next if _is_lacunose( $wit, $lacunose, $aclabel );
404 my $rdg = $row->{'tokens'}->[$tableidx];
405 next unless exists $rdg->{'t'} && defined $rdg->{'t'};
406 _add_to_witlist( $wit, $has_reading, $aclabel )
407 if $rdg->{'t'}->id eq $trdg->id;
409 return $has_reading->members;
412 # Helper function to see if a witness is lacunose even if we are asking about
415 my ( $wit, $lac, $acstr ) = @_;
416 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
419 return $lac->has( $wit );
422 # Helper function to ensure that X and X a.c. never appear in the same list.
423 sub _add_to_witlist {
424 my( $wit, $list, $acstr ) = @_;
425 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
426 # Don't add X a.c. if we already have X
427 return if $list->has( $1 );
429 # Delete X a.c. if we are about to add X
430 $list->delete( $wit.$acstr );
432 $list->insert( $wit );
435 sub _check_transposed_consistency {
436 my( $c, $rank, $transposed, $groupings ) = @_;
439 # Note which readings are actually at this rank, and which witnesses
440 # belong to which reading.
441 foreach my $rdg ( keys %$groupings ) {
442 my $rdgobj = $c->reading( $rdg );
443 # Count '(omitted)' as a reading at this rank
444 $thisrank{$rdg} = 1 if !$rdgobj || $rdgobj->rank == $rank;
445 map { push( @{$seen_wits{$_}}, $rdg ) } @{$groupings->{$rdg}};
447 # Our work is done if we have no witness belonging to more than one
449 my @doubled = grep { scalar @{$seen_wits{$_}} > 1 } keys %seen_wits;
450 return unless @doubled;
451 # If we have a symmetric related transposition, drop the non-rank readings.
452 if( @doubled == scalar keys %seen_wits ) {
453 foreach my $rdg ( keys %$groupings ) {
454 if( !$thisrank{$rdg} ) {
455 my $groupstr = wit_stringify( $groupings->{$rdg} );
456 my ( $matched ) = grep { $groupstr eq wit_stringify( $groupings->{$_} ) }
458 delete $groupings->{$rdg};
459 # If we found a group match, assume there is a symmetry happening.
460 # TODO think more about this
461 # print STDERR "*** Deleting symmetric reading $rdg\n";
463 delete $transposed->{$rdg};
464 warn "Found problem in evident symmetry with reading $rdg";
468 # Otherwise 'unhook' the transposed reading(s) that have duplicates.
470 foreach my $dup ( @doubled ) {
471 foreach my $rdg ( @{$seen_wits{$dup}} ) {
472 next if $thisrank{$rdg};
473 next unless exists $groupings->{$rdg};
474 # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
475 delete $groupings->{$rdg};
476 delete $transposed->{$rdg};
479 # and put any now-orphaned readings into an 'omitted' reading.
480 foreach my $wit ( keys %seen_wits ) {
481 unless( grep { exists $groupings->{$_} } @{$seen_wits{$wit}} ) {
482 $groupings->{'(omitted)'} = Set::Scalar->new()
483 unless exists $groupings->{'(omitted)'};
484 _add_to_witlist( $wit, $groupings->{'(omitted)'}, $c->ac_label );
490 # For the given grouping, return its situation graph based on the stemma.
491 sub _graph_for_grouping {
492 my( $stemma, $grouping, $lacunose, $aclabel ) = @_;
495 foreach my $gs ( values %$grouping ) {
497 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
498 push( @$acwits, $1 ) unless $lacunose->has( $1 );
500 $extant->{$_} = 1 unless $lacunose->has( $_ );
506 # contig contains all extant wits and all hypothetical wits
507 # needed to make up the groups.
508 $graph = $stemma->situation_graph( $extant, $acwits, $aclabel );
509 } catch ( Text::Tradition::Error $e ) {
510 throw( "Could not extend graph with given extant and a.c. witnesses: "
513 throw( "Could not extend graph with a.c. witnesses @$acwits" );
518 =head2 solve_variants( $calcdir, @groups )
520 Looks up the set of groups in the answers provided by the external graph solver
521 service and returns a cleaned-up answer, adding the rank IDs back where they belong.
523 The answer has the form
524 { "variants" => [ array of variant location structures ],
525 "variant_count" => total,
526 "conflict_count" => number of conflicts detected,
527 "genealogical_count" => number of solutions found }
534 # Are we using a local result directory, or did we pass an empty value
537 unless( ref( $groups[0] ) eq 'HASH' ) {
538 $dir = shift @groups;
541 ## For each graph/group combo, make a Text::Tradition::Analysis::Result
542 ## object so that we can send it off for IDP lookup.
544 my $genealogical = 0; # counter
545 # TODO Optimize for unique graph problems
547 foreach my $graphproblem ( @groups ) {
548 # Construct the calc result key and look up its answer
549 my $problem = Text::Tradition::Analysis::Result->new(
550 graph => $graphproblem->{'graph'},
551 setlist => [ values %{$graphproblem->{'grouping'}} ] );
552 if( exists $problems{$problem->object_key} ) {
553 $problem = $problems{$problem->object_key};
555 $problems{$problem->object_key} = $problem;
557 $graphproblem->{'object'} = $problem;
562 my $scope = $dir->new_scope;
563 map { $results{$_} = $dir->lookup( $_ ) || $problems{$_} } keys %problems;
565 my $json = JSON->new->allow_blessed->convert_blessed->utf8->encode(
566 [ values %problems ] );
567 # Send it off and get the result
568 # print STDERR "Sending request: " . decode_utf8( $json ) . "\n";
569 my $ua = LWP::UserAgent->new();
570 my $resp = $ua->post( $SOLVER_URL, 'Content-Type' => 'application/json',
571 'Content' => $json );
573 if( $resp->is_success ) {
574 $answer = decode_json( $resp->content );
575 throw( "Unexpected answer from IDP: $answer" ) unless ref( $answer ) eq 'ARRAY';
577 throw( "IDP solver returned " . $resp->status_line . " / " . $resp->content
578 . "; cannot run graph analysis" );
580 # One more sanity check
581 throw( "Something went wrong with answer symmetricity" )
582 unless keys( %problems ) == @$answer;
583 # Convert the results
584 foreach my $a ( @$answer ) {
585 my $r = Text::Tradition::Analysis::Result->new( $a );
586 $results{$r->object_key} = $r;
590 # We now have a single JSON-encoded Result object per problem sent. Fold its
591 # answers into our variant info structure.
592 foreach my $graphproblem ( @groups ) {
593 my $result = $results{$graphproblem->{'object'}->object_key}
594 || $graphproblem->{'object'};
596 # Initialize the result structure for this graph problem
598 if( $result->status eq 'OK' ) {
599 $vstruct = { readings => [] };
600 push( @$variants, $vstruct );
602 push( @$variants, _init_unsolved( $graphproblem, $result->status ) );
606 # 1. Did the group evaluate as genealogical?
607 $vstruct->{genealogical} = $result->is_genealogical;
608 $genealogical++ if $result->is_genealogical;
610 # 2. What are the calculated minimum groupings for each variant loc?
611 foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
612 my $inputset = $graphproblem->{grouping}->{$rid};
613 my $minset = $result->minimum_grouping_for( $inputset );
614 push( @{$vstruct->{readings}}, { readingid => $rid, group => $minset } );
617 # 3. What are the sources and classes calculated for each witness?
618 $vstruct->{witcopy_types} = { $result->classes };
619 $vstruct->{reading_roots} = {};
620 map { $vstruct->{reading_roots}->{$_} = 1 } $result->sources;
624 return { 'variants' => $variants,
625 'variant_count' => scalar @$variants,
626 'genealogical_count' => $genealogical };
630 my( $graphproblem, $status ) = @_;
631 my $vstruct = { 'readings' => [] };
632 $vstruct->{'unsolved'} = $status;
633 foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
634 push( @{$vstruct->{readings}}, { readingid => $rid,
635 group => [ $graphproblem->{grouping}->{$rid}->members ] } );
640 =head2 analyze_location ( $tradition, $graph, $location_hash )
642 Given the tradition, its stemma graph, and the solution from the graph solver,
643 work out the rest of the information we want. For each reading we need missing,
644 conflict, reading_parents, independent_occurrence, followed, not_followed,
645 and follow_unknown. Alters the location_hash in place.
649 sub analyze_location {
650 my ( $tradition, $graph, $variant_row, $lacunose ) = @_;
651 my $c = $tradition->collation;
653 if( exists $variant_row->{'unsolved'} ) {
656 my $reading_roots = delete $variant_row->{'reading_roots'};
657 my $classinfo = delete $variant_row->{'witcopy_types'};
659 # Make a hash of all known node memberships, and make the subgraphs.
662 my $acstr = $c->ac_label;
665 # Note which witnesses positively belong to which group. This information
666 # comes ultimately from the IDP solver.
667 # Also make a note of the reading's roots.
668 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
669 my $rid = $rdghash->{'readingid'};
671 foreach my $wit ( @{$rdghash->{'group'}} ) {
672 $contig->{$wit} = $rid;
673 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
676 if( exists $reading_roots->{$wit} && $reading_roots->{$wit} ) {
677 push( @roots, $wit );
680 $rdghash->{'independent_occurrence'} = \@roots;
683 # Now that we have all the node group memberships, calculate followed/
684 # non-followed/unknown values for each reading. Also figure out the
685 # reading's evident parent(s).
686 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
687 my $rid = $rdghash->{'readingid'};
688 my $rdg = $c->reading( $rid );
689 my @roots = @{$rdghash->{'independent_occurrence'}};
692 @reversions = grep { $classinfo->{$_} eq 'revert' }
693 $rdghash->{'group'}->members;
694 $rdghash->{'reversions'} = \@reversions;
696 my @group = @{$rdghash->{'group'}};
698 # Start figuring things out.
699 $rdghash->{'followed'} = scalar( @group )
700 - ( scalar( @roots ) + scalar( @reversions ) );
701 # Find the parent readings, if any, of this reading.
702 my $sourceparents = _find_reading_parents( $rid, $graph, $contig, @roots );
703 # Work out relationships between readings and their non-followed parent.
704 _resolve_parent_relationships( $c, $rid, $rdg, $sourceparents );
705 $rdghash->{'source_parents'} = $sourceparents;
708 my $revparents = _find_reading_parents( $rid, $graph, $contig, @reversions );
709 _resolve_parent_relationships( $c, $rid, $rdg, $revparents );
710 $rdghash->{'reversion_parents'} = $revparents;
713 # Find the number of times this reading was altered, and the number of
714 # times we're not sure.
715 my( %nofollow, %unknownfollow );
716 foreach my $wit ( @{$rdghash->{'group'}} ) {
717 foreach my $wchild ( $graph->successors( $wit ) ) {
718 if( $reading_roots->{$wchild} && $contig->{$wchild}
719 && $contig->{$wchild} ne $rid ) {
720 # It definitely changed here.
721 $nofollow{$wchild} = 1;
722 } elsif( !($contig->{$wchild}) ) {
723 # The child is a hypothetical node not definitely in
724 # any group. Answer is unknown.
725 $unknownfollow{$wchild} = 1;
726 } # else it is either in our group, or it is a non-root node in a
727 # known group and therefore is presumed to have its reading from
728 # its group, not this link.
731 $rdghash->{'not_followed'} = keys %nofollow;
732 $rdghash->{'follow_unknown'} = keys %unknownfollow;
734 # Now say whether this reading represents a conflict.
735 unless( $variant_row->{'genealogical'} ) {
736 $rdghash->{'is_conflict'} = @roots != 1;
737 $rdghash->{'is_reverted'} = scalar @reversions;
742 sub _find_reading_parents {
743 my( $rid, $graph, $contig, @list ) = @_;
745 foreach my $wit ( @list ) {
746 # Look in the stemma graph to find this witness's extant or known-reading
747 # immediate ancestor(s), and look up the reading that each ancestor holds.
748 my @check = $graph->predecessors( $wit );
751 foreach my $wparent( @check ) {
752 my $preading = $contig->{$wparent};
753 if( $preading && $preading ne $rid ) {
754 $parenthash->{$preading} = 1;
756 push( @next, $graph->predecessors( $wparent ) );
765 sub _resolve_parent_relationships {
766 my( $c, $rid, $rdg, $rdgparents ) = @_;
767 foreach my $p ( keys %$rdgparents ) {
768 # Resolve the relationship of the parent to the reading, and
769 # save it in our hash.
770 my $pobj = $c->reading( $p );
771 my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
772 my $phash = { 'label' => $prep };
774 my $rel = $c->get_relationship( $p, $rid );
776 _add_to_hash( $rel, $phash );
778 # First check for a transposed relationship
779 if( $rdg->rank != $pobj->rank ) {
780 foreach my $ti ( $rdg->related_readings( 'transposition' ) ) {
781 next unless $ti->text eq $rdg->text;
782 $rel = $c->get_relationship( $ti, $pobj );
784 _add_to_hash( $rel, $phash, 1 );
789 foreach my $ti ( $pobj->related_readings( 'transposition' ) ) {
790 next unless $ti->text eq $pobj->text;
791 $rel = $c->get_relationship( $ti, $rdg );
793 _add_to_hash( $rel, $phash, 1 );
800 # and then check for sheer word similarity.
801 my $rtext = $rdg->text;
802 my $ptext = $pobj->text;
803 if( similar( $rtext, $ptext ) ) {
804 # say STDERR "Words $rtext and $ptext judged similar";
805 $phash->{relation} = { type => 'wordsimilar' };
809 $phash->{relation} = { type => 'deletion' };
811 # Get the attributes of the parent object while we are here
812 $phash->{'text'} = $pobj->text if $pobj;
813 if( $pobj && $pobj->does('Text::Tradition::Morphology') ) {
814 $phash->{'is_nonsense'} = $pobj->is_nonsense;
815 $phash->{'is_ungrammatical'} = $pobj->grammar_invalid;
817 } elsif( $p eq '(omitted)' ) {
818 $phash->{relation} = { type => 'addition' };
821 $rdgparents->{$p} = $phash;
826 my( $rel, $phash, $is_transposed ) = @_;
827 $phash->{relation} = { type => $rel->type };
828 $phash->{relation}->{transposed} = 1 if $is_transposed;
829 $phash->{relation}->{annotation} = $rel->annotation
830 if $rel->has_annotation;
833 =head2 similar( $word1, $word2 )
835 Use Algorithm::Diff to get a sense of how close the words are to each other.
836 This will hopefully handle substitutions a bit more nicely than Levenshtein.
843 my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_;
844 my @let1 = split( '', lc( $word1 ) );
845 my @let2 = split( '', lc( $word2 ) );
846 my $diff = Algorithm::Diff->new( \@let1, \@let2 );
848 while( $diff->Next ) {
850 # Take off points for longer strings
851 my $cs = $diff->Range(1) - 2;
854 } elsif( !$diff->Items(1) ) {
855 $mag += $diff->Range(2);
856 } elsif( !$diff->Items(2) ) {
857 $mag += $diff->Range(1);
859 # Split the difference for substitutions
860 my $c1 = $diff->Range(1) || 1;
861 my $c2 = $diff->Range(2) || 1;
862 my $cd = ( $c1 + $c2 ) / 2;
866 return ( $mag <= length( $word1 ) / 2 );
869 sub _useful_variant {
870 my( $rankgroup, $rankgraph, $acstr ) = @_;
872 # Sort by group size and return
874 foreach my $rdg ( keys %$rankgroup ) {
875 my @wits = $rankgroup->{$rdg}->members;
879 $is_useful++ unless( $rankgraph->is_sink_vertex( $wits[0] )
880 || $wits[0] =~ /\Q$acstr\E$/ );
883 return $is_useful > 1;
886 =head2 wit_stringify( $groups )
888 Takes an array of witness groupings and produces a string like
889 ['A','B'] / ['C','D','E'] / ['F']
896 # If we were passed an array of witnesses instead of an array of
897 # groupings, then "group" the witnesses first.
898 unless( ref( $groups->[0] ) ) {
899 my $mkgrp = [ $groups ];
902 foreach my $g ( @$groups ) {
903 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
905 return join( ' / ', @gst );
911 Text::Tradition::Error->throw(
912 'ident' => 'Analysis error',
919 This package is free software and is provided "as is" without express
920 or implied warranty. You can redistribute it and/or modify it under
921 the same terms as Perl itself.
925 Tara L Andrews E<lt>aurum@cpan.orgE<gt>