1 package Text::Tradition::Analysis;
5 use Algorithm::Diff; # for word similarity measure
7 use Encode qw/ encode_utf8 /;
10 use JSON qw/ encode_json decode_json /;
13 use Text::Tradition::Stemma;
16 use vars qw/ @EXPORT_OK /;
17 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
19 my $SOLVER_URL = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
24 Text::Tradition::Analysis - functions for stemma analysis of a tradition
29 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
30 my $t = Text::Tradition->new(
31 'name' => 'this is a text',
33 'file' => '/path/to/tei_parallel_seg_file.xml' );
34 $t->add_stemma( 'dotfile' => $stemmafile );
36 my $variant_data = run_analysis( $tradition );
37 # Recalculate rank $n treating all orthographic variants as equivalent
38 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
42 Text::Tradition is a library for representation and analysis of collated
43 texts, particularly medieval ones. The Collation is the central feature of
44 a Tradition, where the text, its sequence of readings, and its relationships
45 between readings are actually kept.
49 =head2 run_analysis( $tradition, %opts )
51 Runs the analysis described in analyze_variant_location on every location in the
52 collation of the given tradition, with the given options. These include:
56 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
57 is 0 (i.e. the first).
59 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
61 =item * merge_types - Specify a list of relationship types, where related readings
62 should be treated as identical for the purposes of analysis.
64 =item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
71 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
73 my $datafile = 't/data/florilegium_tei_ps.xml';
74 my $tradition = Text::Tradition->new( 'input' => 'TEI',
76 'file' => $datafile );
77 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
78 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
80 my %expected_genealogical = (
111 my $data = run_analysis( $tradition );
112 my $c = $tradition->collation;
113 foreach my $row ( @{$data->{'variants'}} ) {
114 # Account for rows that used to be "not useful"
115 unless( exists $expected_genealogical{$row->{'id'}} ) {
116 $expected_genealogical{$row->{'id'}} = 1;
118 my $gen_bool = $row->{'genealogical'} ? 1 : 0;
119 is( $gen_bool, $expected_genealogical{$row->{'id'}},
120 "Got correct genealogical flag for row " . $row->{'id'} );
121 # Check that we have the right row with the right groups
122 my $rank = $row->{'id'};
123 foreach my $rdghash ( @{$row->{'readings'}} ) {
124 # Skip 'readings' that aren't really
125 next unless $c->reading( $rdghash->{'readingid'} );
127 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
128 "Got correct reading rank" );
129 # Check the witnesses
130 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
131 my @sgrp = sort @{$rdghash->{'group'}};
132 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
135 is( $data->{'variant_count'}, 58, "Got right total variant number" );
136 # TODO Make something meaningful of conflict count, maybe test other bits
143 my( $tradition, %opts ) = @_;
144 my $c = $tradition->collation;
146 my $stemma_id = $opts{'stemma_id'} || 0;
147 my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
148 my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
151 my $stemma = $tradition->stemma( $stemma_id );
153 # Figure out which witnesses we are working with - that is, the ones that
154 # appear both in the stemma and in the tradition. All others are 'lacunose'
156 my @lacunose = $stemma->hypotheticals;
157 my @tradition_wits = map { $_->sigil } $tradition->witnesses;
158 push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
160 # Find and mark 'common' ranks for exclusion, unless they were
161 # explicitly specified.
164 foreach my $rdg ( $c->common_readings ) {
165 $common_rank{$rdg->rank} = 1;
167 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
170 # Group the variants to send to the solver
175 foreach my $rank ( @ranks ) {
176 my $missing = [ @lacunose ];
177 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, \@collapse );
178 # Filter out any empty rankgroups
179 # (e.g. from the later rank for a transposition)
180 next unless keys %$rankgroup;
181 if( $opts{'exclude_type1'} ) {
182 # Check to see whether this is a "useful" group.
183 my( $rdgs, $grps ) = _useful_variant( $rankgroup,
184 $stemma->graph, $c->ac_label );
187 push( @use_ranks, $rank );
188 push( @groups, $rankgroup );
189 $lacunae{$rank} = $missing;
192 my $answer = solve_variants( $stemma, @groups );
194 # Do further analysis on the answer
195 my $conflict_count = 0;
196 my $aclabel = $c->ac_label;
197 foreach my $idx ( 0 .. $#use_ranks ) {
198 my $location = $answer->{'variants'}->[$idx];
199 # Add the rank back in
200 my $rank = $use_ranks[$idx];
201 $location->{'id'} = $rank;
202 # Note what our lacunae are
204 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
205 $location->{'missing'} = [ keys %lmiss ];
207 # Run the extra analysis we need.
208 ## TODO We run through all the variants in this call, so
209 ## why not add the reading data there instead of here below?
210 analyze_location( $tradition, $stemma, $location, \%lmiss );
213 # Do the final post-analysis tidying up of the data.
214 foreach my $rdghash ( @{$location->{'readings'}} ) {
216 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
217 # Add the reading text back in, setting display value as needed
218 my $rdg = $c->reading( $rdghash->{'readingid'} );
220 $rdghash->{'text'} = $rdg->text .
221 ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
222 $rdghash->{'is_ungrammatical'} = $rdg->grammar_invalid;
223 $rdghash->{'is_nonsense'} = $rdg->is_nonsense;
225 # Remove lacunose witnesses from this reading's list now that the
228 map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
229 $rdghash->{'group'} = \@realgroup;
230 # Note any layered witnesses that appear in this group
231 foreach( @realgroup ) {
232 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
233 push( @layerwits, $1 );
237 $location->{'layerwits'} = \@layerwits if @layerwits;
239 $answer->{'conflict_count'} = $conflict_count;
244 =head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
246 Groups the variants at the given $rank of the collation, treating any
247 relationships in @merge_relationship_types as equivalent. $lacunose should
248 be a reference to an array, to which the sigla of lacunose witnesses at this
249 rank will be appended; $transposed should be a reference to a hash, wherein
250 the identities of transposed readings and their relatives will be stored.
252 Returns a hash $group_readings where $rdg is attested by the witnesses listed
253 in $group_readings->{$rdg}.
257 # Return group_readings, groups, lacunose
259 my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
260 my $c = $tradition->collation;
261 my $aclabel = $c->ac_label;
262 my $table = $c->alignment_table;
263 # Get the alignment table readings
264 my %readings_at_rank;
265 my %is_lacunose; # lookup table for witnesses not in stemma
266 map { $is_lacunose{$_} = 1; $is_lacunose{$_.$aclabel} = 1 } @$lacunose;
269 my $has_transposition;
270 foreach my $tablewit ( @{$table->{'alignment'}} ) {
271 my $rdg = $tablewit->{'tokens'}->[$rank-1];
272 my $wit = $tablewit->{'witness'};
273 # Exclude the witness if it is "lacunose" which if we got here
274 # means "not in the stemma".
275 next if $is_lacunose{$wit};
276 # Note if the witness is actually in a lacuna
277 if( $rdg && $rdg->{'t'}->is_lacuna ) {
278 _add_to_witlist( $wit, $lacunose, $aclabel );
279 # Otherwise the witness either has a positive reading...
281 # If the reading has been counted elsewhere as a transposition, ignore it.
282 if( $transposed->{$rdg->{'t'}->id} ) {
283 # TODO Does this cope with three-way transpositions?
284 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
287 # Otherwise, record it...
288 $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
289 # ...and grab any transpositions, and their relations.
290 my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings();
291 foreach my $trdg ( @transp ) {
292 next if exists $readings_at_rank{$trdg->id};
293 $has_transposition = 1;
294 my @affected_wits = _table_witnesses(
295 $table, $trdg, \%is_lacunose, $aclabel );
296 next unless @affected_wits;
297 map { $moved_wits{$_} = 1 } @affected_wits;
298 $transposed->{$trdg->id} =
299 [ _table_witnesses( $table, $rdg->{'t'}, \%is_lacunose, $aclabel ) ];
300 $readings_at_rank{$trdg->id} = $trdg;
302 # ...or it is empty, ergo a gap.
304 _add_to_witlist( $wit, \@check_for_gaps, $aclabel );
308 map { _add_to_witlist( $_, \@gap_wits, $aclabel )
309 unless $moved_wits{$_} } @check_for_gaps;
310 # Group the readings, collapsing groups by relationship if needed
311 my $grouped_readings = {};
312 foreach my $rdg ( values %readings_at_rank ) {
313 # Skip readings that have been collapsed into others.
314 next if exists $grouped_readings->{$rdg->id}
315 && $grouped_readings->{$rdg->id} eq 'COLLAPSE';
316 # Get the witness list, including from readings collapsed into this one.
317 my @wits = _table_witnesses( $table, $rdg, \%is_lacunose, $aclabel );
318 if( $collapse && @$collapse ) {
319 my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
320 foreach my $other ( $rdg->related_readings( $filter ) ) {
321 my @otherwits = _table_witnesses(
322 $table, $other, \%is_lacunose, $aclabel );
323 push( @wits, @otherwits );
324 $grouped_readings->{$other->id} = 'COLLAPSE';
327 $grouped_readings->{$rdg->id} = \@wits;
329 $grouped_readings->{'(omitted)'} = \@gap_wits if @gap_wits;
330 # Get rid of our collapsed readings
331 map { delete $grouped_readings->{$_} if $grouped_readings->{$_} eq 'COLLAPSE' }
332 keys %$grouped_readings
335 # If something was transposed, check the groups for doubled-up readings
336 if( $has_transposition ) {
337 # print STDERR "Group for rank $rank:\n";
338 # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" }
339 # keys %$grouped_readings;
340 _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings );
344 return $grouped_readings;
347 # Helper function to query the alignment table for all witnesses (a.c. included)
348 # that have a given reading at its rank.
349 sub _table_witnesses {
350 my( $table, $trdg, $lacunose, $aclabel ) = @_;
351 my $tableidx = $trdg->rank - 1;
353 foreach my $row ( @{$table->{'alignment'}} ) {
354 my $wit = $row->{'witness'};
355 next if $lacunose->{$wit};
356 my $rdg = $row->{'tokens'}->[$tableidx];
357 next unless exists $rdg->{'t'} && defined $rdg->{'t'};
358 _add_to_witlist( $wit, \@has_reading, $aclabel )
359 if $rdg->{'t'}->id eq $trdg->id;
364 # Helper function to ensure that X and X a.c. never appear in the same list.
365 sub _add_to_witlist {
366 my( $wit, $list, $acstr ) = @_;
369 map { $inlist{$_} = $idx++ } @$list;
370 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
372 unless( exists $inlist{$acwit} ) {
373 push( @$list, $acwit.$acstr );
376 if( exists( $inlist{$wit.$acstr} ) ) {
377 # Replace the a.c. version with the main witness
378 my $i = $inlist{$wit.$acstr};
381 push( @$list, $wit );
386 sub _check_transposed_consistency {
387 my( $c, $rank, $transposed, $groupings ) = @_;
390 # Note which readings are actually at this rank, and which witnesses
391 # belong to which reading.
392 foreach my $rdg ( keys %$groupings ) {
393 my $rdgobj = $c->reading( $rdg );
394 # Count '(omitted)' as a reading at this rank
395 $thisrank{$rdg} = 1 if !$rdgobj || $rdgobj->rank == $rank;
396 map { push( @{$seen_wits{$_}}, $rdg ) } @{$groupings->{$rdg}};
398 # Our work is done if we have no witness belonging to more than one
400 my @doubled = grep { scalar @{$seen_wits{$_}} > 1 } keys %seen_wits;
401 return unless @doubled;
402 # If we have a symmetric related transposition, drop the non-rank readings.
403 if( @doubled == scalar keys %seen_wits ) {
404 foreach my $rdg ( keys %$groupings ) {
405 if( !$thisrank{$rdg} ) {
406 my $groupstr = wit_stringify( $groupings->{$rdg} );
407 my ( $matched ) = grep { $groupstr eq wit_stringify( $groupings->{$_} ) }
409 delete $groupings->{$rdg};
410 # If we found a group match, assume there is a symmetry happening.
411 # TODO think more about this
412 # print STDERR "*** Deleting symmetric reading $rdg\n";
414 delete $transposed->{$rdg};
415 warn "Found problem in evident symmetry with reading $rdg";
419 # Otherwise 'unhook' the transposed reading(s) that have duplicates.
421 foreach my $dup ( @doubled ) {
422 foreach my $rdg ( @{$seen_wits{$dup}} ) {
423 next if $thisrank{$rdg};
424 next unless exists $groupings->{$rdg};
425 # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
426 delete $groupings->{$rdg};
427 delete $transposed->{$rdg};
430 # and put any now-orphaned readings into an 'omitted' reading.
431 foreach my $wit ( keys %seen_wits ) {
432 unless( grep { exists $groupings->{$_} } @{$seen_wits{$wit}} ) {
433 $groupings->{'(omitted)'} = [] unless exists $groupings->{'(omitted)'};
434 _add_to_witlist( $wit, $groupings->{'(omitted)'}, $c->ac_label );
440 =head2 solve_variants( $graph, @groups )
442 Sends the set of groups to the external graph solver service and returns
443 a cleaned-up answer, adding the rank IDs back where they belong.
445 The JSON has the form
446 { "graph": [ stemmagraph DOT string without newlines ],
447 "groupings": [ array of arrays of groups, one per rank ] }
449 The answer has the form
450 { "variants" => [ array of variant location structures ],
451 "variant_count" => total,
452 "conflict_count" => number of conflicts detected,
453 "genealogical_count" => number of solutions found }
458 my( $stemma, @groups ) = @_;
460 # Filter the groups down to distinct groups, and work out what graph
461 # should be used in the calculation of each group. We want to send each
462 # distinct problem to the solver only once.
463 # We need a whole bunch of lookup tables for this.
464 my( $index_groupkeys, $group_indices, $graph_problems ) = _prepare_groups( @_ );
466 ## For each distinct graph, send its groups to the solver.
467 my $ua = LWP::UserAgent->new();
468 ## Witness map is a HACK to get around limitations in node names from IDP
469 my $witness_map = {};
470 ## Variables to store answers as they come back
471 my $variants = [ ( undef ) x ( scalar keys %$index_groupkeys ) ];
472 my $genealogical = 0;
473 foreach my $graphkey ( keys %$graph_problems ) {
474 my $graph = $graph_problems->{$graphkey}->{'object'};
475 my $groupings = [ values %{$graph_problems->{$graphkey}->{'groups'}} ];
476 my $req = _safe_wit_strings( $graph, $stemma->collation,
477 $groupings, $witness_map );
478 $req->{'command'} = 'findGroupings';
479 my $json = encode_json( $req );
480 # Send it off and get the result
481 # print STDERR "Sending request: " . to_json( $req ) . "\n";
482 my $resp = $ua->post( $SOLVER_URL, 'Content-Type' => 'application/json',
483 'Content' => $json );
485 if( $resp->is_success ) {
486 $answer = _desanitize_names( decode_json( $resp->content ), $witness_map );
488 # Fall back to the old method.
489 die "IDP solver returned " . $resp->status_line . " / " . $resp->content
490 . "; cannot run graph analysis";
493 ## If IDP worked, asked it the other two questions for this dataset.
495 foreach my $test ( qw/ findSources findClasses / ) {
496 $req->{'command'} = $test;
497 $json = encode_json( $req );
498 $resp = $ua->post( $SOLVER_URL, 'Content-Type' => 'application/json',
499 'Content' => $json );
500 if( $resp->is_success ) {
501 $more_eval->{$test} = _desanitize_names(
502 decode_json( $resp->content ), $witness_map );
504 warn "IDP solver for $test returned " . $resp->status_line .
505 " / " . $resp->content;
506 # TODO arrange fallback
510 ## The answer is the evaluated groupings, plus a boolean for whether
511 ## they were genealogical. Reconstruct our original groups.
512 foreach my $gidx ( 0 .. $#{$groupings} ) {
513 my( $calc_groups, $result ) = @{$answer->[$gidx]};
514 # Keep track of the total # of genealogical readings
515 $genealogical++ if $result;
517 my( $sources, $classes );
518 # Use the expanded groups from findSources if that got calculated.
519 if( exists( $more_eval->{'findSources'} ) ) {
520 ( $calc_groups, $sources ) = @{$more_eval->{'findSources'}->[$gidx]};
522 # Use the (same) expanded groups from findClasses if that got calculated
524 if( exists( $more_eval->{'findClasses'} ) && !$result ) {
525 ( $calc_groups, $classes ) = @{$more_eval->{'findClasses'}->[$gidx]};
528 # Prune the calculated groups, in case the IDP solver failed to.
529 if( $sources || $result ) {
532 foreach my $cg ( @$calc_groups ) {
533 my( $pg, $pr ) = _prune_group( $cg, $graph );
534 push( @pruned_groups, $pg );
535 push( @pruned_roots, @$pr );
537 $calc_groups = \@pruned_groups;
538 say STDERR "Pruned roots from @$sources to @pruned_roots"
539 unless wit_stringify( [ sort @$sources ] )
540 eq wit_stringify( [ sort @pruned_roots ] );
541 $sources = \@pruned_roots;
544 # Convert the source list into a lookup hash
546 map { $roots->{$_} = 1 } @$sources;
547 # Convert the class list into a lookup hash
549 $classes = _invert_hash( $classes );
552 # Retrieve the key for the original group that went to the solver
553 my $input_group = wit_stringify( $groupings->[$gidx] );
555 # Make the variant hash for each location that had this particular
556 # grouping on this particular stemma situation
557 foreach my $oidx ( @{$group_indices->{$input_group}} ) {
558 my @readings = @{$index_groupkeys->{$oidx}};
560 'genealogical' => $result,
563 foreach my $ridx ( 0 .. $#readings ) {
564 push( @{$vstruct->{'readings'}},
565 { 'readingid' => $readings[$ridx],
566 'group' => $calc_groups->[$ridx] } );
568 $vstruct->{'reading_roots'} = $roots if $roots;
569 $vstruct->{'reading_types'} = $classes if $classes;
570 $variants->[$oidx] = $vstruct;
575 return { 'variants' => $variants,
576 'variant_count' => scalar @$variants,
577 'genealogical_count' => $genealogical };
580 sub _prepare_groups {
581 my( $stemma, @groups ) = @_;
582 my $aclabel = $stemma->collation->ac_label;
584 my $index_groupkeys = {}; # Save the order of readings
585 my $group_indices = {}; # Save the indices that have a given grouping
586 my $graph_problems = {}; # Save the groupings for the given graph
588 foreach my $idx ( 0..$#groups ) {
589 my $ghash = $groups[$idx];
591 # Sort the groupings from big to little, and scan for a.c. witnesses
592 # that would need an extended graph.
593 my @acwits; # note which AC witnesses crop up at this rank
594 my $extant; # note which witnesses crop up at this rank full stop
595 my @idxkeys = sort { scalar @{$ghash->{$b}} <=> scalar @{$ghash->{$a}} }
597 foreach my $rdg ( @idxkeys ) {
598 my @sg = sort @{$ghash->{$rdg}};
599 push( @acwits, grep { $_ =~ /\Q$aclabel\E$/ } @sg );
600 map { $extant->{$_} = 1 } @sg;
601 push( @grouping, \@sg );
603 # Save the reading order
604 $index_groupkeys->{$idx} = \@idxkeys;
606 # Now associate the distinct group with this index
607 my $gstr = wit_stringify( \@grouping );
608 push( @{$group_indices->{$gstr}}, $idx );
610 # Finally, add the group to the list to be calculated for this graph.
611 map { s/\Q$aclabel\E$// } @acwits;
613 ## TODO When we get rid of the safe_wit_strings HACK we should also
614 ## be able to save the graph here as a dotstring rather than as an
615 ## object, thus simplifying life enormously.
617 $graph = $stemma->situation_graph( $extant, \@acwits );
620 die "Unable to extend graph with @acwits";
622 my $graphkey = "$graph || " . wit_stringify( [ sort keys %$extant ] );
623 unless( exists $graph_problems->{$graphkey} ) {
624 $graph_problems->{$graphkey} = { 'object' => $graph, 'groups' => {} };
626 $graph_problems->{$graphkey}->{'groups'}->{wit_stringify( \@grouping )} = \@grouping;
628 say STDERR "Created " . scalar( keys %$graph_problems ). " distinct graph(s)";
629 return( $index_groupkeys, $group_indices, $graph_problems );
632 #### HACKERY to cope with IDP's limited idea of what a node name looks like ###
634 sub _safe_wit_strings {
635 my( $graph, $c, $groupings, $witness_map ) = @_;
636 # Convert the graph to a safe representation and store the conversion.
637 my $safegraph = Graph->new();
638 foreach my $n ( $graph->vertices ) {
639 my $sn = _safe_witstr( $n );
640 if( exists $witness_map->{$sn} ) {
641 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
642 if $witness_map->{$sn} ne $n;
644 $witness_map->{$sn} = $n;
646 $safegraph->add_vertex( $sn );
647 $safegraph->set_vertex_attributes( $sn,
648 $graph->get_vertex_attributes( $n ) );
650 foreach my $e ( $graph->edges ) {
651 my @safe_e = ( _safe_witstr( $e->[0] ), _safe_witstr( $e->[1] ) );
652 $safegraph->add_edge( @safe_e );
655 # Now convert the witness groupings to a safe representation.
656 my $safe_groupings = [];
657 foreach my $grouping ( @$groupings ) {
658 my $safe_grouping = [];
659 foreach my $group ( @$grouping ) {
661 foreach my $n ( @$group ) {
662 my $sn = _safe_witstr( $n );
663 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
664 if exists $witness_map->{$sn} && $witness_map->{$sn} ne $n;
665 $witness_map->{$sn} = $n;
666 push( @$safe_group, $sn );
668 push( @$safe_grouping, $safe_group );
670 push( @$safe_groupings, $safe_grouping );
673 # Return it all in the struct we expect. We have stored the reductions
674 # in the $witness_map that we were passed.
675 return { 'graph' => Text::Tradition::Stemma::editable_graph(
676 $safegraph, { 'linesep' => ' ' } ),
677 'groupings' => $safe_groupings };
682 $witstr =~ s/\s+/_/g;
683 $witstr =~ s/[^\w\d-]//g;
687 sub _desanitize_names {
688 my( $element, $witness_map ) = @_;
690 if( ref( $element ) eq 'ARRAY' ) {
691 foreach my $n ( @$element ) {
692 push( @$result, _desanitize_names( $n, $witness_map ) );
694 } elsif( ref( $element ) eq 'HASH' ) {
696 map { $real_hash->{$_} = _desanitize_names( $element->{$_}, $witness_map ) }
698 $result = $real_hash;
699 } elsif( exists $witness_map->{$element} ) {
700 $result = $witness_map->{$element}
710 foreach my $k ( keys %$hash ) {
711 if( ref( $hash->{$k} ) eq 'ARRAY' ) {
712 foreach my $v ( @{$hash->{$k}} ) {
716 $newhash->{$hash->{$k}} = $k;
724 =head2 analyze_location ( $tradition, $graph, $location_hash )
726 Given the tradition, its stemma graph, and the solution from the graph solver,
727 work out the rest of the information we want. For each reading we need missing,
728 conflict, reading_parents, independent_occurrence, followed, not_followed,
729 and follow_unknown. Alters the location_hash in place.
733 sub analyze_location {
734 my ( $tradition, $stemma, $variant_row, $lacunose ) = @_;
735 my $c = $tradition->collation;
737 # Make a hash of all known node memberships, and make the subgraphs.
739 my $reading_roots = {};
741 my $acstr = $c->ac_label;
745 if( exists $variant_row->{'reading_roots'} ) {
746 $reading_roots = delete $variant_row->{'reading_roots'};
748 warn "No reading source information from IDP - proceed at your own risk";
752 # Note which witnesses positively belong to which group. This information
753 # comes ultimately from the IDP solver.
754 # Also make a note of the reading's roots.
755 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
756 my $rid = $rdghash->{'readingid'};
758 foreach my $wit ( @{$rdghash->{'group'}} ) {
759 $contig->{$wit} = $rid;
760 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
763 if( exists $reading_roots->{$wit} && $reading_roots->{$wit} ) {
764 push( @roots, $wit );
767 $rdghash->{'independent_occurrence'} = \@roots;
770 # Get the actual graph we should work with
773 # contig contains all extant wits and all hypothetical wits
774 # needed to make up the groups.
775 $graph = $stemma->situation_graph( $contig, \@acwits );
776 } catch ( Text::Tradition::Error $e ) {
777 die "Could not extend graph with given extant and a.c. witnesses: "
780 die "Could not extend graph with a.c. witnesses @acwits";
784 # Now that we have all the node group memberships, calculate followed/
785 # non-followed/unknown values for each reading. Also figure out the
786 # reading's evident parent(s).
787 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
788 my $rid = $rdghash->{'readingid'};
789 my $rdg = $c->reading( $rid );
790 my @roots = @{$rdghash->{'independent_occurrence'}};
791 my @group = @{$rdghash->{'group'}};
793 # Start figuring things out.
794 $rdghash->{'followed'} = scalar( @group ) - scalar( @roots );
795 # Find the parent readings, if any, of this reading.
797 foreach my $wit ( @roots ) {
798 # Look in the stemma graph to find this witness's extant or known-reading
799 # immediate ancestor(s), and look up the reading that each ancestor olds.
800 my @check = $graph->predecessors( $wit );
803 foreach my $wparent( @check ) {
804 my $preading = $contig->{$wparent};
805 if( $preading && $preading ne $rid ) {
806 $rdgparents->{$preading} = 1;
808 push( @next, $graph->predecessors( $wparent ) );
814 foreach my $p ( keys %$rdgparents ) {
815 # Resolve the relationship of the parent to the reading, and
816 # save it in our hash.
817 my $pobj = $c->reading( $p );
818 my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
819 my $phash = { 'label' => $prep };
821 my $rel = $c->get_relationship( $p, $rdghash->{readingid} );
823 _add_to_hash( $rel, $phash );
825 # First check for a transposed relationship
826 if( $rdg->rank != $pobj->rank ) {
827 foreach my $ti ( $rdg->related_readings( 'transposition' ) ) {
828 next unless $ti->text eq $rdg->text;
829 $rel = $c->get_relationship( $ti, $pobj );
831 _add_to_hash( $rel, $phash, 1 );
836 foreach my $ti ( $pobj->related_readings( 'transposition' ) ) {
837 next unless $ti->text eq $pobj->text;
838 $rel = $c->get_relationship( $ti, $rdg );
840 _add_to_hash( $rel, $phash, 1 );
847 # and then check for sheer word similarity.
848 my $rtext = $rdg->text;
849 my $ptext = $pobj->text;
850 if( similar( $rtext, $ptext ) ) {
851 # say STDERR "Words $rtext and $ptext judged similar";
852 $phash->{relation} = { type => 'wordsimilar' };
856 $phash->{relation} = { type => 'deletion' };
858 # Get the attributes of the parent object while we are here
859 $phash->{'text'} = $pobj->text if $pobj;
860 $phash->{'is_nonsense'} = $pobj->is_nonsense;
861 $phash->{'is_ungrammatical'} = $pobj->grammar_invalid;
862 } elsif( $p eq '(omitted)' ) {
863 $phash->{relation} = { type => 'addition' };
866 $rdgparents->{$p} = $phash;
869 $rdghash->{'reading_parents'} = $rdgparents;
871 # Find the number of times this reading was altered, and the number of
872 # times we're not sure.
873 my( %nofollow, %unknownfollow );
874 foreach my $wit ( @{$rdghash->{'group'}} ) {
875 foreach my $wchild ( $graph->successors( $wit ) ) {
876 if( $reading_roots->{$wchild} && $contig->{$wchild}
877 && $contig->{$wchild} ne $rid ) {
878 # It definitely changed here.
879 $nofollow{$wchild} = 1;
880 } elsif( !($contig->{$wchild}) ) {
881 # The child is a hypothetical node not definitely in
882 # any group. Answer is unknown.
883 $unknownfollow{$wchild} = 1;
884 } # else it is either in our group, or it is a non-root node in a
885 # known group and therefore is presumed to have its reading from
886 # its group, not this link.
889 $rdghash->{'not_followed'} = keys %nofollow;
890 $rdghash->{'follow_unknown'} = keys %unknownfollow;
892 # Now say whether this reading represents a conflict.
893 unless( $variant_row->{'genealogical'} ) {
895 if( exists $variant_row->{'classes'} ) {
896 # We have tested for reversions. Use the information.
898 foreach my $rdgroot ( @roots ) {
899 ## TODO This needs IDP to prune itself in order to be
901 if( $variant_row->{'classes'}->{$rdgroot} eq 'revert' ) {
902 push( @reversions, $rdgroot );
904 push( @trueroots, $rdgroot );
907 $rdghash->{'independent_occurrence'} = \@trueroots;
908 $rdghash->{'reversion'} = \@reversions if @reversions;
912 $rdghash->{'conflict'} = @trueroots != 1;
918 my( $rel, $phash, $is_transposed ) = @_;
919 $phash->{relation} = { type => $rel->type };
920 $phash->{relation}->{transposed} = 1 if $is_transposed;
921 $phash->{relation}->{annotation} = $rel->annotation
922 if $rel->has_annotation;
925 =head2 similar( $word1, $word2 )
927 Use Algorithm::Diff to get a sense of how close the words are to each other.
928 This will hopefully handle substitutions a bit more nicely than Levenshtein.
935 my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_;
936 my @let1 = split( '', lc( $word1 ) );
937 my @let2 = split( '', lc( $word2 ) );
938 my $diff = Algorithm::Diff->new( \@let1, \@let2 );
940 while( $diff->Next ) {
942 # Take off points for longer strings
943 my $cs = $diff->Range(1) - 2;
946 } elsif( !$diff->Items(1) ) {
947 $mag += $diff->Range(2);
948 } elsif( !$diff->Items(2) ) {
949 $mag += $diff->Range(1);
951 # Split the difference for substitutions
952 my $c1 = $diff->Range(1) || 1;
953 my $c2 = $diff->Range(2) || 1;
954 my $cd = ( $c1 + $c2 ) / 2;
958 return ( $mag <= length( $word1 ) / 2 );
962 my( $group, $graph ) = @_;
964 # Record the existence of the vertices in the group
965 map { $relevant->{$_} = 1 } @$group;
967 my $subgraph = $graph->deep_copy;
968 map { $subgraph->delete_vertex( $_ ) unless $relevant->{$_} }
970 # Now prune and return the remaining vertices.
971 _prune_subtree( $subgraph );
972 # Return the list of vertices and the list of roots.
973 my $pruned_group = [ sort $subgraph->vertices ];
974 my $pruned_roots = [ $subgraph->predecessorless_vertices ];
975 return( $pruned_group, $pruned_roots );
981 # Delete lacunose witnesses that have no successors
982 my @orphan_hypotheticals;
985 die "Infinite loop on leaves" if $ctr > 100;
986 @orphan_hypotheticals =
987 grep { $tree->get_vertex_attribute( $_, 'class' ) eq 'hypothetical' }
988 $tree->successorless_vertices;
989 $tree->delete_vertices( @orphan_hypotheticals );
991 } while( @orphan_hypotheticals );
993 # Delete lacunose roots that have a single successor
997 die "Infinite loop on roots" if $ctr > 100;
999 grep { $tree->get_vertex_attribute( $_, 'class' ) eq 'hypothetical'
1000 && $tree->successors( $_ ) == 1 }
1001 $tree->predecessorless_vertices;
1002 $tree->delete_vertices( @redundant_root );
1004 } while( @redundant_root );
1007 sub _useful_variant {
1008 my( $group_readings, $graph, $acstr ) = @_;
1010 # TODO Decide what to do with AC witnesses
1012 # Sort by group size and return
1014 my( @readings, @groups ); # The sorted groups for our answer.
1015 foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
1016 keys %$group_readings ) {
1017 push( @readings, $rdg );
1018 push( @groups, $group_readings->{$rdg} );
1019 if( @{$group_readings->{$rdg}} > 1 ) {
1022 my( $wit ) = @{$group_readings->{$rdg}};
1023 $wit =~ s/^(.*)\Q$acstr\E$/$1/;
1024 $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
1027 if( $is_useful > 1 ) {
1028 return( \@readings, \@groups );
1034 =head2 wit_stringify( $groups )
1036 Takes an array of witness groupings and produces a string like
1037 ['A','B'] / ['C','D','E'] / ['F']
1044 # If we were passed an array of witnesses instead of an array of
1045 # groupings, then "group" the witnesses first.
1046 unless( ref( $groups->[0] ) ) {
1047 my $mkgrp = [ $groups ];
1050 foreach my $g ( @$groups ) {
1051 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
1053 return join( ' / ', @gst );
1057 my( $lista, $listb ) = @_;
1060 map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
1061 map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
1062 my @set = grep { $union{$_} == 1 } keys %union;
1063 return map { $scalars{$_} } @set;
1070 This package is free software and is provided "as is" without express
1071 or implied warranty. You can redistribute it and/or modify it under
1072 the same terms as Perl itself.
1076 Tara L Andrews E<lt>aurum@cpan.orgE<gt>