1 package Text::Tradition::Analysis;
5 use Algorithm::Diff; # for word similarity measure
6 use Encode qw/ decode_utf8 encode_utf8 /;
9 use JSON qw/ to_json decode_json /;
12 use Text::Tradition::Analysis::Result;
13 use Text::Tradition::Stemma;
16 use vars qw/ @EXPORT_OK $VERSION /;
17 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
21 my $DEFAULT_SOLVER_URL = 'http://perf.byzantini.st/cgi-bin/graphcalc.cgi';
22 my $unsolved_problems = {};
26 Text::Tradition::Analysis - functions for stemma analysis of a tradition
30 Text::Tradition is a library for representation and analysis of collated
31 texts, particularly medieval ones. Where the Collation is the central
32 feature of a Tradition, it may also have one or more stemmata associated
33 with it, and these stemmata may be analyzed. This package provides the
38 =item * L<Text::Tradition::HasStemma> - a role that will be composed into
39 Text::Tradition objects, providing the ability for Text::Tradition::Stemma
40 objects to be associated with them.
42 =item * L<Text::Tradition::Stemma> - an object class that represents stemma
43 hypotheses, both rooted (with a single archetype) and unrooted (e.g.
46 =item * Text::Tradition::Analysis (this package). Provides functions for
47 the analysis of a given stemma against the collation within a given
55 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
56 my $t = Text::Tradition->new(
57 'name' => 'this is a text',
59 'file' => '/path/to/tei_parallel_seg_file.xml' );
60 $t->add_stemma( 'dotfile' => $stemmafile );
62 my $variant_data = run_analysis( $tradition );
66 =head2 run_analysis( $tradition, %opts )
68 Runs the analysis described in analyze_variant_location on every location in the
69 collation of the given tradition, with the given options. These include:
73 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
74 is 0 (i.e. the first).
76 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
78 =item * merge_types - Specify a list of relationship types, where related readings
79 should be treated as identical for the purposes of analysis.
81 =item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
88 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
90 my $datafile = 't/data/florilegium_tei_ps.xml';
91 my $tradition = Text::Tradition->new( 'input' => 'TEI',
93 'file' => $datafile );
94 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
95 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
97 my %expected_genealogical = (
128 my $data = run_analysis( $tradition, calcdsn => 'dbi:SQLite:dbname=t/data/analysis.db' );
129 my $c = $tradition->collation;
130 foreach my $row ( @{$data->{'variants'}} ) {
131 # Account for rows that used to be "not useful"
132 unless( exists $expected_genealogical{$row->{'id'}} ) {
133 $expected_genealogical{$row->{'id'}} = 1;
135 my $gen_bool = $row->{'genealogical'} ? 1 : 0;
136 is( $gen_bool, $expected_genealogical{$row->{'id'}},
137 "Got correct genealogical flag for row " . $row->{'id'} );
138 # Check that we have the right row with the right groups
139 my $rank = $row->{'id'};
140 foreach my $rdghash ( @{$row->{'readings'}} ) {
141 # Skip 'readings' that aren't really
142 next unless $c->reading( $rdghash->{'readingid'} );
144 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
145 "Got correct reading rank" );
146 # Check the witnesses
147 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
148 my @sgrp = sort @{$rdghash->{'group'}};
149 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
152 is( $data->{'variant_count'}, 58, "Got right total variant number" );
153 # TODO Make something meaningful of conflict count, maybe test other bits
160 my( $tradition, %opts ) = @_;
161 my $c = $tradition->collation;
162 my $aclabel = $c->ac_label;
164 my $stemma_id = $opts{'stemma_id'} || 0;
165 my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
166 my $collapse = Set::Scalar->new();
167 if( $opts{'merge_types'} && ref( $opts{'merge_types'} ) eq 'ARRAY' ) {
168 $collapse->insert( @{$opts{'merge_types'}} );
169 } elsif( $opts{'merge_types'} ) {
170 $collapse->insert( $opts{'merge_types'} );
173 # If we have specified a local lookup DB for graph calculation results,
174 # make sure it exists and connect to it.
176 if ( exists $opts{'calcdsn'} ) {
177 eval { require Text::Tradition::Directory };
179 throw( "Could not instantiate a directory for " . $opts{'calcdsn'}
182 $opts{'dir'} = Text::Tradition::Directory->new( dsn => $opts{'calcdsn'} );
183 } elsif( !exists $opts{'solver_url'} ) {
184 $opts{'solver_url'} = $DEFAULT_SOLVER_URL;
188 my $stemma = $tradition->stemma( $stemma_id );
190 # Figure out which witnesses we are working with - that is, the ones that
191 # appear both in the stemma and in the tradition. All others are 'lacunose'
193 my $lacunose = Set::Scalar->new( $stemma->hypotheticals );
194 my $stemma_wits = Set::Scalar->new( $stemma->witnesses );
195 my $tradition_wits = Set::Scalar->new( map { $_->sigil } $tradition->witnesses );
196 $lacunose->insert( $stemma_wits->symmetric_difference( $tradition_wits )->members );
198 # Find and mark 'common' ranks for exclusion, unless they were
199 # explicitly specified.
202 foreach my $rdg ( $c->common_readings ) {
203 $common_rank{$rdg->rank} = 1;
205 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
208 # Group the variants to send to the solver
213 foreach my $rank ( @ranks ) {
214 my $missing = $lacunose->clone();
215 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, $collapse );
216 # Filter out any empty rankgroups
217 # (e.g. from the later rank for a transposition)
218 next unless keys %$rankgroup;
219 # Get the graph for this rankgroup
220 my $rankgraph = _graph_for_grouping( $stemma, $rankgroup, $missing, $aclabel );
221 if( $opts{'exclude_type1'} ) {
222 # Check to see whether this is a "useful" group.
223 next unless _useful_variant( $rankgroup, $rankgraph, $aclabel );
225 push( @use_ranks, $rank );
226 push( @groups, { grouping => $rankgroup, graph => $rankgraph } );
227 $lacunae{$rank} = $missing;
232 $answer = solve_variants( \%opts, @groups );
233 } catch ( Text::Tradition::Error $e ) {
234 if( $e->message =~ /IDP/ ) {
235 # Something is wrong with the solver; make the variants table anyway
236 $answer->{'variants'} = [];
237 map { push( @{$answer->{'variants'}}, _init_unsolved( $_, 'IDP error' ) ) }
240 # Something else is wrong; error out.
245 # Do further analysis on the answer
246 my $conflict_count = 0;
247 my $reversion_count = 0;
248 foreach my $idx ( 0 .. $#use_ranks ) {
249 my $location = $answer->{'variants'}->[$idx];
250 # Add the rank back in
251 my $rank = $use_ranks[$idx];
252 $location->{'id'} = $rank;
253 # Note what our lacunae are
255 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
256 $location->{'missing'} = [ keys %lmiss ];
258 # Run the extra analysis we need.
259 ## TODO We run through all the variants in this call, so
260 ## why not add the reading data there instead of here below?
261 my $graph = $groups[$idx]->{graph};
262 analyze_location( $tradition, $graph, $location, \%lmiss );
265 # Do the final post-analysis tidying up of the data.
266 foreach my $rdghash ( @{$location->{'readings'}} ) {
267 $conflict_count++ if $rdghash->{'is_conflict'};
268 $reversion_count++ if $rdghash->{'is_reverted'};
269 # Add the reading text back in, setting display value as needed
270 my $rdg = $c->reading( $rdghash->{'readingid'} );
272 $rdghash->{'text'} = $rdg->text .
273 ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
274 if( $rdg->does( 'Text::Tradition::Morphology' ) ) {
275 $rdghash->{'is_ungrammatical'} = $rdg->grammar_invalid;
276 $rdghash->{'is_nonsense'} = $rdg->is_nonsense;
279 # Remove lacunose witnesses from this reading's list now that the
282 map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
283 $rdghash->{'group'} = \@realgroup;
284 # Note any layered witnesses that appear in this group
285 foreach( @realgroup ) {
286 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
287 push( @layerwits, $1 );
291 $location->{'layerwits'} = \@layerwits if @layerwits;
293 $answer->{'conflict_count'} = $conflict_count;
294 $answer->{'reversion_count'} = $reversion_count;
299 =head2 group_variants( $tradition, $rank, $lacunose, $transposed, $merge_relationship_types )
301 Groups the variants at the given $rank of the collation, treating any
302 relationships in the set $merge_relationship_types as equivalent.
303 $lacunose should be a reference to an array, to which the sigla of lacunose
304 witnesses at this rank will be appended; $transposed should be a reference
305 to a hash, wherein the identities of transposed readings and their
306 relatives will be stored.
308 Returns a hash $group_readings where $rdg is attested by the witnesses listed
309 in $group_readings->{$rdg}.
313 # Return group_readings, groups, lacunose
315 my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
316 my $c = $tradition->collation;
317 my $aclabel = $c->ac_label;
318 my $table = $c->alignment_table;
319 # Get the alignment table readings
320 my %readings_at_rank;
321 my $check_for_gaps = Set::Scalar->new();
323 my $has_transposition;
325 foreach my $tablewit ( @{$table->{'alignment'}} ) {
326 my $rdg = $tablewit->{'tokens'}->[$rank-1];
327 my $wit = $tablewit->{'witness'};
328 # Exclude the witness if it is "lacunose" which if we got here
329 # means "not in the stemma".
330 next if _is_lacunose( $wit, $lacunose, $aclabel );
331 # Note if the witness is actually in a lacuna
332 if( $rdg && $rdg->{'t'}->is_lacuna ) {
333 _add_to_witlist( $wit, $lacunose, $aclabel );
334 # Otherwise the witness either has a positive reading...
336 # If the reading has been counted elsewhere as a transposition, ignore it.
337 if( $transposed->{$rdg->{'t'}->id} ) {
338 # TODO Does this cope with three-way transpositions?
339 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
342 # Otherwise, record it...
343 $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
344 # ...and grab any transpositions, and their relations.
345 my @transp = grep { $_->rank != $rank } _all_related( $rdg->{'t'} );
346 foreach my $trdg ( @transp ) {
347 next if exists $readings_at_rank{$trdg->id};
348 $has_transposition = 1;
349 my @affected_wits = _table_witnesses(
350 $table, $trdg->rank, $trdg, $lacunose, $aclabel );
351 next unless @affected_wits;
352 map { $moved_wits{$_} = 1 } @affected_wits;
353 my @thisloc_wits = _table_witnesses( $table, $rank, $rdg->{'t'},
354 $lacunose, $aclabel );
355 # Check to see if our affected wits have layers that do something
358 map { $transploc_gaps{$_} = 1 }
359 _table_witnesses( $table, $trdg->rank, undef, $lacunose, $aclabel );
360 foreach my $aw ( @affected_wits ) {
361 if( $transploc_gaps{$aw.$aclabel} ) {
362 push( @thisloc_wits, $aw.$aclabel );
363 push( @transp_acgap, $aw.$aclabel );
366 # Record which witnesses we should count as already analyzed when we
367 # get to the transposed reading's own rank.
368 $transposed->{$trdg->id} = \@thisloc_wits;
369 $readings_at_rank{$trdg->id} = $trdg;
371 # ...or it is empty, ergo a gap.
373 _add_to_witlist( $wit, $check_for_gaps, $aclabel );
376 # Push all the transposition layer gaps onto our list
377 $check_for_gaps->insert( @transp_acgap );
378 # Now remove from our 'gaps' any witnesses known to have been dealt with elsewhere.
379 my $gap_wits = Set::Scalar->new();
380 map { _add_to_witlist( $_, $gap_wits, $aclabel )
381 unless $moved_wits{$_} } $check_for_gaps->members;
383 # Group the readings, collapsing groups by relationship if needed.
384 my $grouped_readings = {};
385 foreach my $rdg ( values %readings_at_rank ) {
386 # Skip readings that have been collapsed into others.
387 next if exists $grouped_readings->{$rdg->id}
388 && $grouped_readings->{$rdg->id} eq 'COLLAPSE';
389 # Get the witness list, including from readings collapsed into this one.
390 my @wits = _table_witnesses( $table, $rdg->rank, $rdg, $lacunose, $aclabel );
391 if( $collapse && $collapse->size ) {
392 my $filter = sub { $collapse->has( $_[0]->type ) };
393 foreach my $other ( $rdg->related_readings( $filter ) ) {
394 my @otherwits = _table_witnesses( $table, $other->rank, $other, $lacunose, $aclabel );
395 push( @wits, @otherwits );
396 $grouped_readings->{$other->id} = 'COLLAPSE';
399 $grouped_readings->{$rdg->id} = Set::Scalar->new( @wits );
401 if( $gap_wits->members ) {
402 $grouped_readings->{'(omitted)'} = $gap_wits;
405 # Get rid of our collapsed readings
406 map { delete $grouped_readings->{$_} if(
407 $grouped_readings->{$_} eq 'COLLAPSE'
408 || $grouped_readings->{$_}->is_empty ) }
409 keys %$grouped_readings;
411 # If something was transposed, check the groups for doubled-up readings
412 if( $has_transposition ) {
413 # print STDERR "Group for rank $rank:\n";
414 # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" }
415 # keys %$grouped_readings;
416 _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings );
420 return $grouped_readings;
424 # Except by repetition
426 my $c = $rdg->collation;
427 my @check = ( $rdg );
431 foreach my $ck ( @check ) {
433 push( @next, grep { !$seen{"$_"} }
434 $ck->related_readings( sub { $_[0]->type ne 'repetition' } ) );
440 my @all = map { $c->reading( $_ ) } keys %seen;
445 # Helper function to query the alignment table for all witnesses (a.c. included)
446 # that have a given reading at its rank.
447 sub _table_witnesses {
448 my( $table, $rank, $trdg, $lacunose, $aclabel ) = @_;
449 my $tableidx = $rank - 1;
450 my $has_reading = Set::Scalar->new();
451 foreach my $row ( @{$table->{'alignment'}} ) {
452 my $wit = $row->{'witness'};
453 next if _is_lacunose( $wit, $lacunose, $aclabel );
454 my $rdg = $row->{'tokens'}->[$tableidx];
456 # We have some positive reading we want.
457 next unless exists $rdg->{'t'} && defined $rdg->{'t'};
458 if( $trdg->is_lacuna ) {
459 _add_to_witlist( $wit, $has_reading, $aclabel )
460 if $rdg->{'t'}->is_lacuna;
462 _add_to_witlist( $wit, $has_reading, $aclabel )
463 if $rdg->{'t'}->id eq $trdg->id;
466 # We want the omissions.
467 next if exists $rdg->{'t'} && defined $rdg->{'t'};
468 _add_to_witlist( $wit, $has_reading, $aclabel )
471 return $has_reading->members;
474 # Helper function to see if a witness is lacunose even if we are asking about
477 my ( $wit, $lac, $acstr ) = @_;
478 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
481 return $lac->has( $wit );
484 # Helper function to ensure that X and X a.c. never appear in the same list.
485 sub _add_to_witlist {
486 my( $wit, $list, $acstr ) = @_;
487 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
488 # Don't add X a.c. if we already have X
489 return if $list->has( $1 );
491 # Delete X a.c. if we are about to add X
492 $list->delete( $wit.$acstr );
494 $list->insert( $wit );
497 sub _check_transposed_consistency {
498 my( $c, $rank, $transposed, $groupings ) = @_;
501 # Note which readings are actually at this rank, and which witnesses
502 # belong to which reading.
503 foreach my $rdg ( keys %$groupings ) {
504 my $rdgobj = $c->reading( $rdg );
505 # Count '(omitted)' as a reading at this rank
506 $thisrank{$rdg} = 1 if !$rdgobj || $rdgobj->rank == $rank;
507 map { push( @{$seen_wits{$_}}, $rdg ) } @{$groupings->{$rdg}};
509 # Our work is done if we have no witness belonging to more than one
511 my @doubled = grep { scalar @{$seen_wits{$_}} > 1 } keys %seen_wits;
512 return unless @doubled;
513 # If we have a symmetric related transposition, drop the non-rank readings.
514 if( @doubled == scalar keys %seen_wits ) {
515 foreach my $rdg ( keys %$groupings ) {
516 if( !$thisrank{$rdg} ) {
517 # Groupings are Set::Scalar objects so we can compare them outright.
518 my ( $matched ) = grep { $groupings->{$rdg} == $groupings->{$_} }
520 delete $groupings->{$rdg};
521 # If we found a group match, assume there is a symmetry happening.
522 # TODO think more about this
523 # print STDERR "*** Deleting symmetric reading $rdg\n";
525 delete $transposed->{$rdg};
526 warn "Found problem in evident symmetry with reading $rdg";
530 # Otherwise 'unhook' the transposed reading(s) that have duplicates.
532 foreach my $dup ( @doubled ) {
533 foreach my $rdg ( @{$seen_wits{$dup}} ) {
534 next if $thisrank{$rdg};
535 next unless exists $groupings->{$rdg};
536 # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
537 delete $groupings->{$rdg};
538 delete $transposed->{$rdg};
541 # and put any now-orphaned readings into an 'omitted' reading.
542 foreach my $wit ( keys %seen_wits ) {
543 unless( grep { exists $groupings->{$_} } @{$seen_wits{$wit}} ) {
544 $groupings->{'(omitted)'} = Set::Scalar->new()
545 unless exists $groupings->{'(omitted)'};
546 _add_to_witlist( $wit, $groupings->{'(omitted)'}, $c->ac_label );
552 # For the given grouping, return its situation graph based on the stemma.
553 sub _graph_for_grouping {
554 my( $stemma, $grouping, $lacunose, $aclabel ) = @_;
557 foreach my $gs ( values %$grouping ) {
559 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
560 push( @$acwits, $1 ) unless $lacunose->has( $1 );
562 $extant->{$_} = 1 unless $lacunose->has( $_ );
568 # contig contains all extant wits and all hypothetical wits
569 # needed to make up the groups.
570 $graph = $stemma->situation_graph( $extant, $acwits, $aclabel );
571 } catch ( Text::Tradition::Error $e ) {
572 throw( "Could not extend graph with given extant and a.c. witnesses: "
575 throw( "Could not extend graph with a.c. witnesses @$acwits" );
580 =head2 solve_variants( $calcdir, @groups )
582 Looks up the set of groups in the answers provided by the external graph solver
583 service and returns a cleaned-up answer, adding the rank IDs back where they belong.
585 The answer has the form
586 { "variants" => [ array of variant location structures ],
587 "variant_count" => total,
588 "conflict_count" => number of conflicts detected,
589 "genealogical_count" => number of solutions found }
594 my( $opts, @groups ) = @_;
596 # Are we using a local result directory?
597 my $dir = $opts->{dir};
599 ## For each graph/group combo, make a Text::Tradition::Analysis::Result
600 ## object so that we can send it off for IDP lookup.
602 my $genealogical = 0; # counter
603 # TODO Optimize for unique graph problems
605 foreach my $graphproblem ( @groups ) {
606 # Construct the calc result key and look up its answer
607 my $problem = Text::Tradition::Analysis::Result->new(
608 graph => $graphproblem->{'graph'},
609 setlist => [ values %{$graphproblem->{'grouping'}} ] );
610 if( exists $problems{$problem->object_key} ) {
611 $problem = $problems{$problem->object_key};
613 $problems{$problem->object_key} = $problem;
615 $graphproblem->{'object'} = $problem;
620 my $scope = $dir->new_scope;
621 map { $results{$_} = $dir->lookup( $_ ) || $problems{$_} } keys %problems;
623 # print STDERR "Using solver at " . $opts->{solver_url} . "\n";
624 my $json = JSON->new->allow_blessed->convert_blessed->utf8->encode(
625 [ values %problems ] );
626 # Send it off and get the result
627 # print STDERR "Sending request: " . decode_utf8( $json ) . "\n";
628 my $ua = LWP::UserAgent->new();
629 my $resp = $ua->post( $opts->{solver_url}, 'Content-Type' => 'application/json',
630 'Content' => $json );
632 if( $resp->is_success ) {
633 $answer = decode_json( $resp->content );
634 throw( "Unexpected answer from IDP: $answer" ) unless ref( $answer ) eq 'ARRAY';
636 throw( "IDP solver returned " . $resp->status_line . " / " . $resp->content
637 . "; cannot run graph analysis" );
639 # One more sanity check
640 throw( "Something went wrong with answer symmetricity" )
641 unless keys( %problems ) == @$answer;
642 # Convert the results
643 foreach my $a ( @$answer ) {
644 my $r = Text::Tradition::Analysis::Result->new( $a );
645 $results{$r->object_key} = $r;
649 # We now have a single JSON-encoded Result object per problem sent. Fold its
650 # answers into our variant info structure.
651 foreach my $graphproblem ( @groups ) {
652 my $result = $results{$graphproblem->{'object'}->object_key}
653 || $graphproblem->{'object'};
655 # Initialize the result structure for this graph problem
657 if( $result->status eq 'OK' ) {
658 $vstruct = { readings => [] };
659 push( @$variants, $vstruct );
661 push( @$variants, _init_unsolved( $graphproblem, $result->status ) );
665 # 1. Did the group evaluate as genealogical?
666 $vstruct->{genealogical} = $result->is_genealogical;
667 $genealogical++ if $result->is_genealogical;
669 # 2. What are the calculated minimum groupings for each variant loc?
670 foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
671 my $inputset = $graphproblem->{grouping}->{$rid};
672 my $minset = $result->minimum_grouping_for( $inputset );
673 push( @{$vstruct->{readings}}, { readingid => $rid, group => $minset } );
676 # 3. What are the sources and classes calculated for each witness?
677 $vstruct->{witcopy_types} = { $result->classes };
678 $vstruct->{reading_roots} = {};
679 map { $vstruct->{reading_roots}->{$_} = 1 } $result->sources;
683 return { 'variants' => $variants,
684 'variant_count' => scalar @$variants,
685 'genealogical_count' => $genealogical };
689 my( $graphproblem, $status ) = @_;
690 my $vstruct = { 'readings' => [] };
691 $vstruct->{'unsolved'} = $status;
692 foreach my $rid ( keys %{$graphproblem->{grouping}} ) {
693 push( @{$vstruct->{readings}}, { readingid => $rid,
694 group => [ $graphproblem->{grouping}->{$rid}->members ] } );
699 =head2 analyze_location ( $tradition, $graph, $location_hash )
701 Given the tradition, its stemma graph, and the solution from the graph solver,
702 work out the rest of the information we want. For each reading we need missing,
703 conflict, reading_parents, independent_occurrence, followed, not_followed,
704 and follow_unknown. Alters the location_hash in place.
708 sub analyze_location {
709 my ( $tradition, $graph, $variant_row, $lacunose ) = @_;
710 my $c = $tradition->collation;
712 if( exists $variant_row->{'unsolved'} ) {
715 my $reading_roots = delete $variant_row->{'reading_roots'};
716 my $classinfo = delete $variant_row->{'witcopy_types'};
718 # Make a hash of all known node memberships, and make the subgraphs.
721 my $acstr = $c->ac_label;
724 # Note which witnesses positively belong to which group. This information
725 # comes ultimately from the IDP solver.
726 # Also make a note of the reading's roots.
727 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
728 my $rid = $rdghash->{'readingid'};
730 foreach my $wit ( @{$rdghash->{'group'}} ) {
731 $contig->{$wit} = $rid;
732 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
735 if( exists $reading_roots->{$wit} && $reading_roots->{$wit} ) {
736 push( @roots, $wit );
739 $rdghash->{'independent_occurrence'} = \@roots;
742 # Now that we have all the node group memberships, calculate followed/
743 # non-followed/unknown values for each reading. Also figure out the
744 # reading's evident parent(s).
745 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
746 my $rid = $rdghash->{'readingid'};
747 my $rdg = $c->reading( $rid );
748 my @roots = @{$rdghash->{'independent_occurrence'}};
751 @reversions = grep { $classinfo->{$_} eq 'revert' }
752 $rdghash->{'group'}->members;
753 $rdghash->{'reversions'} = \@reversions;
755 my @group = @{$rdghash->{'group'}};
757 # Start figuring things out.
758 $rdghash->{'followed'} = scalar( @group )
759 - ( scalar( @roots ) + scalar( @reversions ) );
760 # Find the parent readings, if any, of this reading.
761 my $sourceparents = _find_reading_parents( $rid, $graph, $contig, @roots );
762 # Work out relationships between readings and their non-followed parent.
763 _resolve_parent_relationships( $c, $rid, $rdg, $sourceparents );
764 $rdghash->{'source_parents'} = $sourceparents;
767 my $revparents = _find_reading_parents( $rid, $graph, $contig, @reversions );
768 _resolve_parent_relationships( $c, $rid, $rdg, $revparents );
769 $rdghash->{'reversion_parents'} = $revparents;
772 # Find the number of times this reading was altered, and the number of
773 # times we're not sure.
774 my( %nofollow, %unknownfollow );
775 foreach my $wit ( @{$rdghash->{'group'}} ) {
776 foreach my $wchild ( $graph->successors( $wit ) ) {
777 if( $reading_roots->{$wchild} && $contig->{$wchild}
778 && $contig->{$wchild} ne $rid ) {
779 # It definitely changed here.
780 $nofollow{$wchild} = 1;
781 } elsif( !($contig->{$wchild}) ) {
782 # The child is a hypothetical node not definitely in
783 # any group. Answer is unknown.
784 $unknownfollow{$wchild} = 1;
785 } # else it is either in our group, or it is a non-root node in a
786 # known group and therefore is presumed to have its reading from
787 # its group, not this link.
790 $rdghash->{'not_followed'} = keys %nofollow;
791 $rdghash->{'follow_unknown'} = keys %unknownfollow;
793 # Now say whether this reading represents a conflict.
794 unless( $variant_row->{'genealogical'} ) {
795 $rdghash->{'is_conflict'} = @roots != 1;
796 $rdghash->{'is_reverted'} = scalar @reversions;
801 sub _find_reading_parents {
802 my( $rid, $graph, $contig, @list ) = @_;
804 foreach my $wit ( @list ) {
805 # Look in the stemma graph to find this witness's extant or known-reading
806 # immediate ancestor(s), and look up the reading that each ancestor holds.
807 my @check = $graph->predecessors( $wit );
810 foreach my $wparent( @check ) {
811 my $preading = $contig->{$wparent};
812 if( $preading && $preading ne $rid ) {
813 $parenthash->{$preading} = 1;
815 push( @next, $graph->predecessors( $wparent ) );
824 sub _resolve_parent_relationships {
825 my( $c, $rid, $rdg, $rdgparents ) = @_;
826 foreach my $p ( keys %$rdgparents ) {
827 # Resolve the relationship of the parent to the reading, and
828 # save it in our hash.
829 my $pobj = $c->reading( $p );
830 my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
831 my $phash = { 'label' => $prep };
833 # Get the attributes of the parent object while we are here
834 $phash->{'text'} = $pobj->text if $pobj;
835 if( $pobj && $pobj->does('Text::Tradition::Morphology') ) {
836 $phash->{'is_nonsense'} = $pobj->is_nonsense;
837 $phash->{'is_ungrammatical'} = $pobj->grammar_invalid;
839 # Now look at the relationship
840 my $rel = $c->get_relationship( $p, $rid );
841 if( $rel && $rel->type eq 'collated' ) {
845 _add_to_hash( $rel, $phash );
847 # First check for a transposed relationship
848 if( $rdg->rank != $pobj->rank ) {
849 foreach my $ti ( $rdg->related_readings( 'transposition' ) ) {
850 next unless $ti->text eq $rdg->text;
851 $rel = $c->get_relationship( $ti, $pobj );
853 _add_to_hash( $rel, $phash, 1 );
858 foreach my $ti ( $pobj->related_readings( 'transposition' ) ) {
859 next unless $ti->text eq $pobj->text;
860 $rel = $c->get_relationship( $ti, $rdg );
862 _add_to_hash( $rel, $phash, 1 );
869 # and then check for sheer word similarity.
870 my $rtext = $rdg->text;
871 my $ptext = $pobj->text;
872 if( similar( $rtext, $ptext ) ) {
873 # say STDERR "Words $rtext and $ptext judged similar";
874 $phash->{relation} = { type => 'wordsimilar' };
878 $phash->{relation} = { type => 'deletion' };
880 } elsif( $p eq '(omitted)' ) {
881 # Check to see if the reading in question is a repetition.
882 my @reps = $rdg->related_readings( 'repetition' );
884 $phash->{relation} = { type => 'repetition',
885 annotation => "of reading @reps" };
887 $phash->{relation} = { type => 'addition' };
891 $rdgparents->{$p} = $phash;
896 my( $rel, $phash, $is_transposed ) = @_;
897 $phash->{relation} = { type => $rel->type };
898 $phash->{relation}->{transposed} = 1 if $is_transposed;
899 $phash->{relation}->{annotation} = $rel->annotation
900 if $rel->has_annotation;
901 # Get all the relevant relationship info.
902 foreach my $prop ( qw/ non_independent is_significant / ) {
903 $phash->{relation}->{$prop} = $rel->$prop;
905 # Figure out if the variant was judged revertible.
906 my $is_a = $rel->reading_a eq $phash->{text};
907 $phash->{revertible} = $is_a
908 ? $rel->a_derivable_from_b : $rel->b_derivable_from_a;
911 =head2 similar( $word1, $word2 )
913 Use Algorithm::Diff to get a sense of how close the words are to each other.
914 This will hopefully handle substitutions a bit more nicely than Levenshtein.
921 my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_;
922 my @let1 = split( '', lc( $word1 ) );
923 my @let2 = split( '', lc( $word2 ) );
924 my $diff = Algorithm::Diff->new( \@let1, \@let2 );
926 while( $diff->Next ) {
928 # Take off points for longer strings
929 my $cs = $diff->Range(1) - 2;
932 } elsif( !$diff->Items(1) ) {
933 $mag += $diff->Range(2);
934 } elsif( !$diff->Items(2) ) {
935 $mag += $diff->Range(1);
937 # Split the difference for substitutions
938 my $c1 = $diff->Range(1) || 1;
939 my $c2 = $diff->Range(2) || 1;
940 my $cd = ( $c1 + $c2 ) / 2;
944 return ( $mag <= length( $word1 ) / 2 );
947 sub _useful_variant {
948 my( $rankgroup, $rankgraph, $acstr ) = @_;
950 # Sort by group size and return
952 foreach my $rdg ( keys %$rankgroup ) {
953 my @wits = $rankgroup->{$rdg}->members;
957 $is_useful++ unless( $rankgraph->is_sink_vertex( $wits[0] )
958 || $wits[0] =~ /\Q$acstr\E$/ );
961 return $is_useful > 1;
964 =head2 wit_stringify( $groups )
966 Takes an array of witness groupings and produces a string like
967 ['A','B'] / ['C','D','E'] / ['F']
974 # If we were passed an array of witnesses instead of an array of
975 # groupings, then "group" the witnesses first.
976 unless( ref( $groups->[0] ) ) {
977 my $mkgrp = [ $groups ];
980 foreach my $g ( @$groups ) {
981 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
983 return join( ' / ', @gst );
989 Text::Tradition::Error->throw(
990 'ident' => 'Analysis error',
997 This package is free software and is provided "as is" without express
998 or implied warranty. You can redistribute it and/or modify it under
999 the same terms as Perl itself.
1003 Tara L Andrews E<lt>aurum@cpan.orgE<gt>