1 package Text::Tradition::Analysis;
5 use Algorithm::Diff; # for word similarity measure
6 use Encode qw/ decode_utf8 encode_utf8 /;
9 use JSON qw/ to_json decode_json /;
12 use Text::Tradition::Analysis::Result;
13 use Text::Tradition::Directory;
14 use Text::Tradition::Stemma;
17 use vars qw/ @EXPORT_OK $VERSION /;
18 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
22 my $SOLVER_URL = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
23 my $unsolved_problems = {};
27 Text::Tradition::Analysis - functions for stemma analysis of a tradition
32 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
33 my $t = Text::Tradition->new(
34 'name' => 'this is a text',
36 'file' => '/path/to/tei_parallel_seg_file.xml' );
38 $t->add_stemma( 'dotfile' => $stemmafile );
40 my $variant_data = run_analysis( $tradition );
41 # Recalculate rank $n treating all orthographic variants as equivalent
42 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
46 Text::Tradition is a library for representation and analysis of collated
47 texts, particularly medieval ones. Where the Collation is the central feature of
48 a Tradition, it may also have one or more temmata associated with it, and these stemmata may be analyzed. This package provides the following modules:
52 =item * L<Text::Tradition::HasStemma> - a role that can be composed into Text::Tradition objects, providing the ability for Text::Tradition::Stemma objects to be associated with them.
54 =item * L<Text::Tradition::Stemma> - an object class that represents stemma hypotheses, both rooted (with a single archetype) and unrooted (e.g. phylogenetic trees).
56 =item * Text::Tradition::Analysis (this package). Provides functions for the analysis of a given stemma against the collation within a given Tradition.
62 =head2 run_analysis( $tradition, %opts )
64 Runs the analysis described in analyze_variant_location on every location in the
65 collation of the given tradition, with the given options. These include:
69 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
70 is 0 (i.e. the first).
72 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
74 =item * merge_types - Specify a list of relationship types, where related readings
75 should be treated as identical for the purposes of analysis.
77 =item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
84 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
86 my $datafile = 't/data/florilegium_tei_ps.xml';
87 my $tradition = Text::Tradition->new( 'input' => 'TEI',
89 'file' => $datafile );
90 $tradition->enable_stemmata;
91 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
92 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
94 my %expected_genealogical = (
125 my $data = run_analysis( $tradition, calcdsn => 'dbi:SQLite:dbname=t/data/analysis.db' );
126 my $c = $tradition->collation;
127 foreach my $row ( @{$data->{'variants'}} ) {
128 # Account for rows that used to be "not useful"
129 unless( exists $expected_genealogical{$row->{'id'}} ) {
130 $expected_genealogical{$row->{'id'}} = 1;
132 my $gen_bool = $row->{'genealogical'} ? 1 : 0;
133 is( $gen_bool, $expected_genealogical{$row->{'id'}},
134 "Got correct genealogical flag for row " . $row->{'id'} );
135 # Check that we have the right row with the right groups
136 my $rank = $row->{'id'};
137 foreach my $rdghash ( @{$row->{'readings'}} ) {
138 # Skip 'readings' that aren't really
139 next unless $c->reading( $rdghash->{'readingid'} );
141 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
142 "Got correct reading rank" );
143 # Check the witnesses
144 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
145 my @sgrp = sort @{$rdghash->{'group'}};
146 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
149 is( $data->{'variant_count'}, 58, "Got right total variant number" );
150 # TODO Make something meaningful of conflict count, maybe test other bits
157 my( $tradition, %opts ) = @_;
158 my $c = $tradition->collation;
159 my $aclabel = $c->ac_label;
161 my $stemma_id = $opts{'stemma_id'} || 0;
162 my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
163 my $collapse = Set::Scalar->new();
164 if( $opts{'merge_types'} && ref( $opts{'merge_types'} ) eq 'ARRAY' ) {
165 $collapse->insert( @{$opts{'merge_types'}} );
166 } elsif( $opts{'merge_types'} ) {
167 $collapse->insert( $opts{'merge_types'} );
170 # Make sure we have a lookup DB for graph calculation results; this will die
171 # if calcdir or calcdsn isn't passed.
173 if( exists $opts{'calcdir'} ) {
174 $dir = delete $opts{'calcdir'}
175 } elsif ( exists $opts{'calcdsn'} ) {
176 $dir = Text::Tradition::Directory->new( dsn => $opts{'calcdsn'} );
180 my $stemma = $tradition->stemma( $stemma_id );
182 # Figure out which witnesses we are working with - that is, the ones that
183 # appear both in the stemma and in the tradition. All others are 'lacunose'
185 my $lacunose = Set::Scalar->new( $stemma->hypotheticals );
186 my $stemma_wits = Set::Scalar->new( $stemma->witnesses );
187 my $tradition_wits = Set::Scalar->new( map { $_->sigil } $tradition->witnesses );
188 $lacunose->insert( $stemma_wits->symmetric_difference( $tradition_wits )->members );
190 # Find and mark 'common' ranks for exclusion, unless they were
191 # explicitly specified.
194 foreach my $rdg ( $c->common_readings ) {
195 $common_rank{$rdg->rank} = 1;
197 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
200 # Group the variants to send to the solver
205 foreach my $rank ( @ranks ) {
206 my $missing = $lacunose->clone();
207 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, $collapse );
208 # Filter out any empty rankgroups
209 # (e.g. from the later rank for a transposition)
210 next unless keys %$rankgroup;
211 # Get the graph for this rankgroup
212 my $rankgraph = _graph_for_grouping( $stemma, $rankgroup, $missing, $aclabel );
213 if( $opts{'exclude_type1'} ) {
214 # Check to see whether this is a "useful" group.
215 next unless _useful_variant( $rankgroup, $rankgraph, $aclabel );
217 push( @use_ranks, $rank );
218 push( @groups, { grouping => $rankgroup, graph => $rankgraph } );
219 $lacunae{$rank} = $missing;
224 $answer = solve_variants( $dir, @groups );
225 } catch ( Text::Tradition::Error $e ) {
226 if( $e->message =~ /IDP/ ) {
227 # Something is wrong with the solver; make the variants table anyway
228 $answer->{'variants'} = [];
229 map { push( @{$answer->{'variants'}}, _init_unsolved( $_, 'IDP error' ) ) }
232 # Something else is wrong; error out.
237 # Do further analysis on the answer
238 my $conflict_count = 0;
239 my $reversion_count = 0;
240 foreach my $idx ( 0 .. $#use_ranks ) {
241 my $location = $answer->{'variants'}->[$idx];
242 # Add the rank back in
243 my $rank = $use_ranks[$idx];
244 $location->{'id'} = $rank;
245 # Note what our lacunae are
247 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
248 $location->{'missing'} = [ keys %lmiss ];
250 # Run the extra analysis we need.
251 ## TODO We run through all the variants in this call, so
252 ## why not add the reading data there instead of here below?
253 my $graph = $groups[$idx]->{graph};
254 analyze_location( $tradition, $graph, $location, \%lmiss );
257 # Do the final post-analysis tidying up of the data.
258 foreach my $rdghash ( @{$location->{'readings'}} ) {
259 $conflict_count++ if $rdghash->{'is_conflict'};
260 $reversion_count++ if $rdghash->{'is_reverted'};
261 # Add the reading text back in, setting display value as needed
262 my $rdg = $c->reading( $rdghash->{'readingid'} );
264 $rdghash->{'text'} = $rdg->text .
265 ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
266 if( $rdg->does( 'Text::Tradition::Morphology' ) ) {
267 $rdghash->{'is_ungrammatical'} = $rdg->grammar_invalid;
268 $rdghash->{'is_nonsense'} = $rdg->is_nonsense;
271 # Remove lacunose witnesses from this reading's list now that the
274 map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
275 $rdghash->{'group'} = \@realgroup;
276 # Note any layered witnesses that appear in this group
277 foreach( @realgroup ) {
278 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
279 push( @layerwits, $1 );
283 $location->{'layerwits'} = \@layerwits if @layerwits;
285 $answer->{'conflict_count'} = $conflict_count;
286 $answer->{'reversion_count'} = $reversion_count;
291 =head2 group_variants( $tradition, $rank, $lacunose, $transposed, $merge_relationship_types )
293 Groups the variants at the given $rank of the collation, treating any
294 relationships in the set $merge_relationship_types as equivalent.
295 $lacunose should be a reference to an array, to which the sigla of lacunose
296 witnesses at this rank will be appended; $transposed should be a reference
297 to a hash, wherein the identities of transposed readings and their
298 relatives will be stored.
300 Returns a hash $group_readings where $rdg is attested by the witnesses listed
301 in $group_readings->{$rdg}.
305 # Return group_readings, groups, lacunose
307 my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
308 my $c = $tradition->collation;
309 my $aclabel = $c->ac_label;
310 my $table = $c->alignment_table;
311 # Get the alignment table readings
312 my %readings_at_rank;
313 my $check_for_gaps = Set::Scalar->new();
315 my $has_transposition;
316 foreach my $tablewit ( @{$table->{'alignment'}} ) {
317 my $rdg = $tablewit->{'tokens'}->[$rank-1];
318 my $wit = $tablewit->{'witness'};
319 # Exclude the witness if it is "lacunose" which if we got here
320 # means "not in the stemma".
321 next if _is_lacunose( $wit, $lacunose, $aclabel );
322 # Note if the witness is actually in a lacuna
323 if( $rdg && $rdg->{'t'}->is_lacuna ) {
324 _add_to_witlist( $wit, $lacunose, $aclabel );
325 # Otherwise the witness either has a positive reading...
327 # If the reading has been counted elsewhere as a transposition, ignore it.
328 if( $transposed->{$rdg->{'t'}->id} ) {
329 # TODO Does this cope with three-way transpositions?
330 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
333 # Otherwise, record it...
334 $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
335 # ...and grab any transpositions, and their relations.
336 my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings();
337 foreach my $trdg ( @transp ) {
338 next if exists $readings_at_rank{$trdg->id};
339 $has_transposition = 1;
340 my @affected_wits = _table_witnesses(
341 $table, $trdg, $lacunose, $aclabel );
342 next unless @affected_wits;
343 map { $moved_wits{$_} = 1 } @affected_wits;
344 $transposed->{$trdg->id} =
345 [ _table_witnesses( $table, $rdg->{'t'}, $lacunose, $aclabel ) ];
346 $readings_at_rank{$trdg->id} = $trdg;
348 # ...or it is empty, ergo a gap.
350 _add_to_witlist( $wit, $check_for_gaps, $aclabel );
353 my $gap_wits = Set::Scalar->new();
354 map { _add_to_witlist( $_, $gap_wits, $aclabel )
355 unless $moved_wits{$_} } $check_for_gaps->members;
357 # Group the readings, collapsing groups by relationship if needed.
358 my $grouped_readings = {};
359 foreach my $rdg ( values %readings_at_rank ) {
360 # Skip readings that have been collapsed into others.
361 next if exists $grouped_readings->{$rdg->id}
362 && $grouped_readings->{$rdg->id} eq 'COLLAPSE';
363 # Get the witness list, including from readings collapsed into this one.
364 my @wits = _table_witnesses( $table, $rdg, $lacunose, $aclabel );
365 if( $collapse && $collapse->size ) {
366 my $filter = sub { $collapse->has( $_[0]->type ) };
367 foreach my $other ( $rdg->related_readings( $filter ) ) {
368 my @otherwits = _table_witnesses( $table, $other, $lacunose, $aclabel );
369 push( @wits, @otherwits );
370 $grouped_readings->{$other->id} = 'COLLAPSE';
373 $grouped_readings->{$rdg->id} = Set::Scalar->new( @wits );
375 if( $gap_wits->members ) {
376 $grouped_readings->{'(omitted)'} = $gap_wits;
379 # Get rid of our collapsed readings
380 map { delete $grouped_readings->{$_} if(
381 $grouped_readings->{$_} eq 'COLLAPSE'
382 || $grouped_readings->{$_}->is_empty ) }
383 keys %$grouped_readings;
385 # If something was transposed, check the groups for doubled-up readings
386 if( $has_transposition ) {
387 # print STDERR "Group for rank $rank:\n";
388 # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" }
389 # keys %$grouped_readings;
390 _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings );
394 return $grouped_readings;
397 # Helper function to query the alignment table for all witnesses (a.c. included)
398 # that have a given reading at its rank.
399 sub _table_witnesses {
400 my( $table, $trdg, $lacunose, $aclabel ) = @_;
401 my $tableidx = $trdg->rank - 1;
402 my $has_reading = Set::Scalar->new();
403 foreach my $row ( @{$table->{'alignment'}} ) {
404 my $wit = $row->{'witness'};
405 next if _is_lacunose( $wit, $lacunose, $aclabel );
406 my $rdg = $row->{'tokens'}->[$tableidx];
407 next unless exists $rdg->{'t'} && defined $rdg->{'t'};
408 _add_to_witlist( $wit, $has_reading, $aclabel )
409 if $rdg->{'t'}->id eq $trdg->id;
411 return $has_reading->members;
414 # Helper function to see if a witness is lacunose even if we are asking about
417 my ( $wit, $lac, $acstr ) = @_;
418 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
421 return $lac->has( $wit );
424 # Helper function to ensure that X and X a.c. never appear in the same list.
425 sub _add_to_witlist {
426 my( $wit, $list, $acstr ) = @_;
427 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
428 # Don't add X a.c. if we already have X
429 return if $list->has( $1 );
431 # Delete X a.c. if we are about to add X
432 $list->delete( $wit.$acstr );
434 $list->insert( $wit );
437 sub _check_transposed_consistency {
438 my( $c, $rank, $transposed, $groupings ) = @_;
441 # Note which readings are actually at this rank, and which witnesses
442 # belong to which reading.
443 foreach my $rdg ( keys %$groupings ) {
444 my $rdgobj = $c->reading( $rdg );
445 # Count '(omitted)' as a reading at this rank
446 $thisrank{$rdg} = 1 if !$rdgobj || $rdgobj->rank == $rank;
447 map { push( @{$seen_wits{$_}}, $rdg ) } @{$groupings->{$rdg}};
449 # Our work is done if we have no witness belonging to more than one
451 my @doubled = grep { scalar @{$seen_wits{$_}} > 1 } keys %seen_wits;
452 return unless @doubled;
453 # If we have a symmetric related transposition, drop the non-rank readings.
454 if( @doubled == scalar keys %seen_wits ) {
455 foreach my $rdg ( keys %$groupings ) {
456 if( !$thisrank{$rdg} ) {
457 my $groupstr = wit_stringify( $groupings->{$rdg} );
458 my ( $matched ) = grep { $groupstr eq wit_stringify( $groupings->{$_} ) }
460 delete $groupings->{$rdg};
461 # If we found a group match, assume there is a symmetry happening.
462 # TODO think more about this
463 # print STDERR "*** Deleting symmetric reading $rdg\n";
465 delete $transposed->{$rdg};
466 warn "Found problem in evident symmetry with reading $rdg";
470 # Otherwise 'unhook' the transposed reading(s) that have duplicates.
472 foreach my $dup ( @doubled ) {
473 foreach my $rdg ( @{$seen_wits{$dup}} ) {
474 next if $thisrank{$rdg};
475 next unless exists $groupings->{$rdg};
476 # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
477 delete $groupings->{$rdg};
478 delete $transposed->{$rdg};
481 # and put any now-orphaned readings into an 'omitted' reading.
482 foreach my $wit ( keys %seen_wits ) {
483 unless( grep { exists $groupings->{$_} } @{$seen_wits{$wit}} ) {
484 $groupings->{'(omitted)'} = Set::Scalar->new()
485 unless exists $groupings->{'(omitted)'};
486 _add_to_witlist( $wit, $groupings->{'(omitted)'}, $c->ac_label );
492 # For the given grouping, return its situation graph based on the stemma.
493 sub _graph_for_grouping {
494 my( $stemma, $grouping, $lacunose, $aclabel ) = @_;
497 foreach my $gs ( values %$grouping ) {
499 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
500 push( @$acwits, $1 ) unless $lacunose->has( $1 );
502 $extant->{$_} = 1 unless $lacunose->has( $_ );
508 # contig contains all extant wits and all hypothetical wits
509 # needed to make up the groups.
510 $graph = $stemma->situation_graph( $extant, $acwits, $aclabel );
511 } catch ( Text::Tradition::Error $e ) {
512 throw( "Could not extend graph with given extant and a.c. witnesses: "
515 throw( "Could not extend graph with a.c. witnesses @$acwits" );
520 =head2 solve_variants( $calcdir, @groups )
522 Looks up the set of groups in the answers provided by the external graph solver
523 service and returns a cleaned-up answer, adding the rank IDs back where they belong.
525 The answer has the form
526 { "variants" => [ array of variant location structures ],
527 "variant_count" => total,
528 "conflict_count" => number of conflicts detected,
529 "genealogical_count" => number of solutions found }
536 # Are we using a local result directory, or did we pass an empty value
539 unless( ref( $groups[0] ) eq 'HASH' ) {
540 $dir = shift @groups;
543 ## For each graph/group combo, make a Text::Tradition::Analysis::Result
544 ## object so that we can send it off for IDP lookup.
546 my $genealogical = 0; # counter
547 # TODO Optimize for unique graph problems
549 foreach my $graphproblem ( @groups ) {
550 # Construct the calc result key and look up its answer
551 my $problem = Text::Tradition::Analysis::Result->new(
552 graph => $graphproblem->{'graph'},
553 setlist => [ values %{$graphproblem->{'grouping'}} ] );
554 if( exists $problems{$problem->object_key} ) {
555 $problem = $problems{$problem->object_key};
557 $problems{$problem->object_key} = $problem;
559 $graphproblem->{'object'} = $problem;
564 my $scope = $dir->new_scope;
565 map { $results{$_} = $dir->lookup( $_ ) || $problems{$_} } keys %problems;
567 my $json = JSON->new->allow_blessed->convert_blessed->utf8->encode(
568 [ values %problems ] );
569 # Send it off and get the result
570 # print STDERR "Sending request: " . decode_utf8( $json ) . "\n";
571 my $ua = LWP::UserAgent->new();
572 my $resp = $ua->post( $SOLVER_URL, 'Content-Type' => 'application/json',
573 'Content' => $json );
575 if( $resp->is_success ) {
576 $answer = decode_json( $resp->content );
577 throw( "Unexpected answer from IDP: $answer" ) unless ref( $answer ) eq 'ARRAY';
579 throw( "IDP solver returned " . $resp->status_line . " / " . $resp->content
580 . "; cannot run graph analysis" );
582 # One more sanity check
583 throw( "Something went wrong with answer symmetricity" )
584 unless keys( %problems ) == @$answer;
585 # Convert the results
586 foreach my $a ( @$answer ) {
587 my $r = Text::Tradition::Analysis::Result->new( $a );
588 $results{$r->object_key} = $r;
592 # We now have a single JSON-encoded Result object per problem sent. Fold its
593 # answers into our variant info structure.
594 foreach my $graphproblem ( @groups ) {
595 my $result = $results{$graphproblem->{'object'}->object_key}
596 || $graphproblem->{'object'};
598 # Initialize the result structure for this graph problem
600 if( $result->status eq 'OK' ) {
601 $vstruct = { readings => [] };
602 push( @$variants, $vstruct );
604 push( @$variants, _init_unsolved( $graphproblem, $result->status ) );
608 # 1. Did the group evaluate as genealogical?
609 $vstruct->{genealogical} = $result->is_genealogical;
610 $genealogical++ if $result->is_genealogical;
612 # 2. What are the calculated minimum groupings for each variant loc?
613 foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
614 my $inputset = $graphproblem->{grouping}->{$rid};
615 my $minset = $result->minimum_grouping_for( $inputset );
616 push( @{$vstruct->{readings}}, { readingid => $rid, group => $minset } );
619 # 3. What are the sources and classes calculated for each witness?
620 $vstruct->{witcopy_types} = { $result->classes };
621 $vstruct->{reading_roots} = {};
622 map { $vstruct->{reading_roots}->{$_} = 1 } $result->sources;
626 return { 'variants' => $variants,
627 'variant_count' => scalar @$variants,
628 'genealogical_count' => $genealogical };
632 my( $graphproblem, $status ) = @_;
633 my $vstruct = { 'readings' => [] };
634 $vstruct->{'unsolved'} = $status;
635 foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
636 push( @{$vstruct->{readings}}, { readingid => $rid,
637 group => [ $graphproblem->{grouping}->{$rid}->members ] } );
642 =head2 analyze_location ( $tradition, $graph, $location_hash )
644 Given the tradition, its stemma graph, and the solution from the graph solver,
645 work out the rest of the information we want. For each reading we need missing,
646 conflict, reading_parents, independent_occurrence, followed, not_followed,
647 and follow_unknown. Alters the location_hash in place.
651 sub analyze_location {
652 my ( $tradition, $graph, $variant_row, $lacunose ) = @_;
653 my $c = $tradition->collation;
655 if( exists $variant_row->{'unsolved'} ) {
658 my $reading_roots = delete $variant_row->{'reading_roots'};
659 my $classinfo = delete $variant_row->{'witcopy_types'};
661 # Make a hash of all known node memberships, and make the subgraphs.
664 my $acstr = $c->ac_label;
667 # Note which witnesses positively belong to which group. This information
668 # comes ultimately from the IDP solver.
669 # Also make a note of the reading's roots.
670 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
671 my $rid = $rdghash->{'readingid'};
673 foreach my $wit ( @{$rdghash->{'group'}} ) {
674 $contig->{$wit} = $rid;
675 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
678 if( exists $reading_roots->{$wit} && $reading_roots->{$wit} ) {
679 push( @roots, $wit );
682 $rdghash->{'independent_occurrence'} = \@roots;
685 # Now that we have all the node group memberships, calculate followed/
686 # non-followed/unknown values for each reading. Also figure out the
687 # reading's evident parent(s).
688 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
689 my $rid = $rdghash->{'readingid'};
690 my $rdg = $c->reading( $rid );
691 my @roots = @{$rdghash->{'independent_occurrence'}};
694 @reversions = grep { $classinfo->{$_} eq 'revert' }
695 $rdghash->{'group'}->members;
696 $rdghash->{'reversions'} = \@reversions;
698 my @group = @{$rdghash->{'group'}};
700 # Start figuring things out.
701 $rdghash->{'followed'} = scalar( @group )
702 - ( scalar( @roots ) + scalar( @reversions ) );
703 # Find the parent readings, if any, of this reading.
704 my $sourceparents = _find_reading_parents( $rid, $graph, $contig, @roots );
705 # Work out relationships between readings and their non-followed parent.
706 _resolve_parent_relationships( $c, $rid, $rdg, $sourceparents );
707 $rdghash->{'source_parents'} = $sourceparents;
710 my $revparents = _find_reading_parents( $rid, $graph, $contig, @reversions );
711 _resolve_parent_relationships( $c, $rid, $rdg, $revparents );
712 $rdghash->{'reversion_parents'} = $revparents;
715 # Find the number of times this reading was altered, and the number of
716 # times we're not sure.
717 my( %nofollow, %unknownfollow );
718 foreach my $wit ( @{$rdghash->{'group'}} ) {
719 foreach my $wchild ( $graph->successors( $wit ) ) {
720 if( $reading_roots->{$wchild} && $contig->{$wchild}
721 && $contig->{$wchild} ne $rid ) {
722 # It definitely changed here.
723 $nofollow{$wchild} = 1;
724 } elsif( !($contig->{$wchild}) ) {
725 # The child is a hypothetical node not definitely in
726 # any group. Answer is unknown.
727 $unknownfollow{$wchild} = 1;
728 } # else it is either in our group, or it is a non-root node in a
729 # known group and therefore is presumed to have its reading from
730 # its group, not this link.
733 $rdghash->{'not_followed'} = keys %nofollow;
734 $rdghash->{'follow_unknown'} = keys %unknownfollow;
736 # Now say whether this reading represents a conflict.
737 unless( $variant_row->{'genealogical'} ) {
738 $rdghash->{'is_conflict'} = @roots != 1;
739 $rdghash->{'is_reverted'} = scalar @reversions;
744 sub _find_reading_parents {
745 my( $rid, $graph, $contig, @list ) = @_;
747 foreach my $wit ( @list ) {
748 # Look in the stemma graph to find this witness's extant or known-reading
749 # immediate ancestor(s), and look up the reading that each ancestor holds.
750 my @check = $graph->predecessors( $wit );
753 foreach my $wparent( @check ) {
754 my $preading = $contig->{$wparent};
755 if( $preading && $preading ne $rid ) {
756 $parenthash->{$preading} = 1;
758 push( @next, $graph->predecessors( $wparent ) );
767 sub _resolve_parent_relationships {
768 my( $c, $rid, $rdg, $rdgparents ) = @_;
769 foreach my $p ( keys %$rdgparents ) {
770 # Resolve the relationship of the parent to the reading, and
771 # save it in our hash.
772 my $pobj = $c->reading( $p );
773 my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
774 my $phash = { 'label' => $prep };
776 my $rel = $c->get_relationship( $p, $rid );
778 _add_to_hash( $rel, $phash );
780 # First check for a transposed relationship
781 if( $rdg->rank != $pobj->rank ) {
782 foreach my $ti ( $rdg->related_readings( 'transposition' ) ) {
783 next unless $ti->text eq $rdg->text;
784 $rel = $c->get_relationship( $ti, $pobj );
786 _add_to_hash( $rel, $phash, 1 );
791 foreach my $ti ( $pobj->related_readings( 'transposition' ) ) {
792 next unless $ti->text eq $pobj->text;
793 $rel = $c->get_relationship( $ti, $rdg );
795 _add_to_hash( $rel, $phash, 1 );
802 # and then check for sheer word similarity.
803 my $rtext = $rdg->text;
804 my $ptext = $pobj->text;
805 if( similar( $rtext, $ptext ) ) {
806 # say STDERR "Words $rtext and $ptext judged similar";
807 $phash->{relation} = { type => 'wordsimilar' };
811 $phash->{relation} = { type => 'deletion' };
813 # Get the attributes of the parent object while we are here
814 $phash->{'text'} = $pobj->text if $pobj;
815 if( $pobj && $pobj->does('Text::Tradition::Morphology') ) {
816 $phash->{'is_nonsense'} = $pobj->is_nonsense;
817 $phash->{'is_ungrammatical'} = $pobj->grammar_invalid;
819 } elsif( $p eq '(omitted)' ) {
820 $phash->{relation} = { type => 'addition' };
823 $rdgparents->{$p} = $phash;
828 my( $rel, $phash, $is_transposed ) = @_;
829 $phash->{relation} = { type => $rel->type };
830 $phash->{relation}->{transposed} = 1 if $is_transposed;
831 $phash->{relation}->{annotation} = $rel->annotation
832 if $rel->has_annotation;
835 =head2 similar( $word1, $word2 )
837 Use Algorithm::Diff to get a sense of how close the words are to each other.
838 This will hopefully handle substitutions a bit more nicely than Levenshtein.
845 my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_;
846 my @let1 = split( '', lc( $word1 ) );
847 my @let2 = split( '', lc( $word2 ) );
848 my $diff = Algorithm::Diff->new( \@let1, \@let2 );
850 while( $diff->Next ) {
852 # Take off points for longer strings
853 my $cs = $diff->Range(1) - 2;
856 } elsif( !$diff->Items(1) ) {
857 $mag += $diff->Range(2);
858 } elsif( !$diff->Items(2) ) {
859 $mag += $diff->Range(1);
861 # Split the difference for substitutions
862 my $c1 = $diff->Range(1) || 1;
863 my $c2 = $diff->Range(2) || 1;
864 my $cd = ( $c1 + $c2 ) / 2;
868 return ( $mag <= length( $word1 ) / 2 );
871 sub _useful_variant {
872 my( $rankgroup, $rankgraph, $acstr ) = @_;
874 # Sort by group size and return
876 foreach my $rdg ( keys %$rankgroup ) {
877 my @wits = $rankgroup->{$rdg}->members;
881 $is_useful++ unless( $rankgraph->is_sink_vertex( $wits[0] )
882 || $wits[0] =~ /\Q$acstr\E$/ );
885 return $is_useful > 1;
888 =head2 wit_stringify( $groups )
890 Takes an array of witness groupings and produces a string like
891 ['A','B'] / ['C','D','E'] / ['F']
898 # If we were passed an array of witnesses instead of an array of
899 # groupings, then "group" the witnesses first.
900 unless( ref( $groups->[0] ) ) {
901 my $mkgrp = [ $groups ];
904 foreach my $g ( @$groups ) {
905 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
907 return join( ' / ', @gst );
913 Text::Tradition::Error->throw(
914 'ident' => 'Analysis error',
921 This package is free software and is provided "as is" without express
922 or implied warranty. You can redistribute it and/or modify it under
923 the same terms as Perl itself.
927 Tara L Andrews E<lt>aurum@cpan.orgE<gt>