1 package Text::Tradition::Analysis;
5 use Algorithm::Diff; # for word similarity measure
7 use Encode qw/ encode_utf8 /;
10 use JSON qw/ encode_json decode_json /;
13 use Text::Tradition::Stemma;
16 use vars qw/ @EXPORT_OK /;
17 @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
21 Text::Tradition::Analysis - functions for stemma analysis of a tradition
26 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
27 my $t = Text::Tradition->new(
28 'name' => 'this is a text',
30 'file' => '/path/to/tei_parallel_seg_file.xml' );
31 $t->add_stemma( 'dotfile' => $stemmafile );
33 my $variant_data = run_analysis( $tradition );
34 # Recalculate rank $n treating all orthographic variants as equivalent
35 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
39 Text::Tradition is a library for representation and analysis of collated
40 texts, particularly medieval ones. The Collation is the central feature of
41 a Tradition, where the text, its sequence of readings, and its relationships
42 between readings are actually kept.
46 =head2 run_analysis( $tradition, %opts )
48 Runs the analysis described in analyze_variant_location on every location in the
49 collation of the given tradition, with the given options. These include:
53 =item * stemma_id - Specify which of the tradition's stemmata to use. Default
54 is 0 (i.e. the first).
56 =item * ranks - Specify a list of location ranks to analyze; exclude the rest.
58 =item * merge_types - Specify a list of relationship types, where related readings
59 should be treated as identical for the purposes of analysis.
61 =item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
68 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
70 my $datafile = 't/data/florilegium_tei_ps.xml';
71 my $tradition = Text::Tradition->new( 'input' => 'TEI',
73 'file' => $datafile );
74 my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
75 is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
77 my %expected_genealogical = (
108 my $data = run_analysis( $tradition );
109 my $c = $tradition->collation;
110 foreach my $row ( @{$data->{'variants'}} ) {
111 # Account for rows that used to be "not useful"
112 unless( exists $expected_genealogical{$row->{'id'}} ) {
113 $expected_genealogical{$row->{'id'}} = 1;
115 my $gen_bool = $row->{'genealogical'} ? 1 : 0;
116 is( $gen_bool, $expected_genealogical{$row->{'id'}},
117 "Got correct genealogical flag for row " . $row->{'id'} );
118 # Check that we have the right row with the right groups
119 my $rank = $row->{'id'};
120 foreach my $rdghash ( @{$row->{'readings'}} ) {
121 # Skip 'readings' that aren't really
122 next unless $c->reading( $rdghash->{'readingid'} );
124 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
125 "Got correct reading rank" );
126 # Check the witnesses
127 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
128 my @sgrp = sort @{$rdghash->{'group'}};
129 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
132 is( $data->{'variant_count'}, 58, "Got right total variant number" );
133 # TODO Make something meaningful of conflict count, maybe test other bits
140 my( $tradition, %opts ) = @_;
141 my $c = $tradition->collation;
143 my $stemma_id = $opts{'stemma_id'} || 0;
144 my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
145 my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
148 my $stemma = $tradition->stemma( $stemma_id );
150 # Figure out which witnesses we are working with - that is, the ones that
151 # appear both in the stemma and in the tradition. All others are 'lacunose'
153 my @lacunose = $stemma->hypotheticals;
154 my @tradition_wits = map { $_->sigil } $tradition->witnesses;
155 push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
157 # Find and mark 'common' ranks for exclusion, unless they were
158 # explicitly specified.
161 foreach my $rdg ( $c->common_readings ) {
162 $common_rank{$rdg->rank} = 1;
164 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
167 # Group the variants to send to the solver
172 foreach my $rank ( @ranks ) {
173 my $missing = [ @lacunose ];
174 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, \@collapse );
175 # Filter out any empty rankgroups
176 # (e.g. from the later rank for a transposition)
177 next unless keys %$rankgroup;
178 if( $opts{'exclude_type1'} ) {
179 # Check to see whether this is a "useful" group.
180 my( $rdgs, $grps ) = _useful_variant( $rankgroup,
181 $stemma->graph, $c->ac_label );
184 push( @use_ranks, $rank );
185 push( @groups, $rankgroup );
186 $lacunae{$rank} = $missing;
189 my $answer = solve_variants( $stemma, @groups );
191 # Do further analysis on the answer
192 my $conflict_count = 0;
193 my $aclabel = $c->ac_label;
194 foreach my $idx ( 0 .. $#use_ranks ) {
195 my $location = $answer->{'variants'}->[$idx];
196 # Add the rank back in
197 my $rank = $use_ranks[$idx];
198 $location->{'id'} = $rank;
199 # Note what our lacunae are
201 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
202 $location->{'missing'} = [ keys %lmiss ];
204 # Run the extra analysis we need.
205 ## TODO We run through all the variants in this call, so
206 ## why not add the reading data there instead of here below?
207 analyze_location( $tradition, $stemma, $location, \%lmiss );
210 # Do the final post-analysis tidying up of the data.
211 foreach my $rdghash ( @{$location->{'readings'}} ) {
213 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
214 # Add the reading text back in, setting display value as needed
215 my $rdg = $c->reading( $rdghash->{'readingid'} );
217 $rdghash->{'text'} = $rdg->text .
218 ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
219 $rdghash->{'is_ungrammatical'} = $rdg->grammar_invalid;
220 $rdghash->{'is_nonsense'} = $rdg->is_nonsense;
222 # Remove lacunose witnesses from this reading's list now that the
225 map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
226 $rdghash->{'group'} = \@realgroup;
227 # Note any layered witnesses that appear in this group
228 foreach( @realgroup ) {
229 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
230 push( @layerwits, $1 );
234 $location->{'layerwits'} = \@layerwits if @layerwits;
236 $answer->{'conflict_count'} = $conflict_count;
241 =head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
243 Groups the variants at the given $rank of the collation, treating any
244 relationships in @merge_relationship_types as equivalent. $lacunose should
245 be a reference to an array, to which the sigla of lacunose witnesses at this
246 rank will be appended; $transposed should be a reference to a hash, wherein
247 the identities of transposed readings and their relatives will be stored.
249 Returns a hash $group_readings where $rdg is attested by the witnesses listed
250 in $group_readings->{$rdg}.
254 # Return group_readings, groups, lacunose
256 my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
257 my $c = $tradition->collation;
258 my $aclabel = $c->ac_label;
259 my $table = $c->alignment_table;
260 # Get the alignment table readings
261 my %readings_at_rank;
262 my %is_lacunose; # lookup table for witnesses not in stemma
263 map { $is_lacunose{$_} = 1; $is_lacunose{$_.$aclabel} = 1 } @$lacunose;
266 my $has_transposition;
267 foreach my $tablewit ( @{$table->{'alignment'}} ) {
268 my $rdg = $tablewit->{'tokens'}->[$rank-1];
269 my $wit = $tablewit->{'witness'};
270 # Exclude the witness if it is "lacunose" which if we got here
271 # means "not in the stemma".
272 next if $is_lacunose{$wit};
273 # Note if the witness is actually in a lacuna
274 if( $rdg && $rdg->{'t'}->is_lacuna ) {
275 _add_to_witlist( $wit, $lacunose, $aclabel );
276 # Otherwise the witness either has a positive reading...
278 # If the reading has been counted elsewhere as a transposition, ignore it.
279 if( $transposed->{$rdg->{'t'}->id} ) {
280 # TODO Does this cope with three-way transpositions?
281 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
284 # Otherwise, record it...
285 $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
286 # ...and grab any transpositions, and their relations.
287 my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings();
288 foreach my $trdg ( @transp ) {
289 next if exists $readings_at_rank{$trdg->id};
290 $has_transposition = 1;
291 my @affected_wits = _table_witnesses(
292 $table, $trdg, \%is_lacunose, $aclabel );
293 next unless @affected_wits;
294 map { $moved_wits{$_} = 1 } @affected_wits;
295 $transposed->{$trdg->id} =
296 [ _table_witnesses( $table, $rdg->{'t'}, \%is_lacunose, $aclabel ) ];
297 $readings_at_rank{$trdg->id} = $trdg;
299 # ...or it is empty, ergo a gap.
301 _add_to_witlist( $wit, \@check_for_gaps, $aclabel );
305 map { _add_to_witlist( $_, \@gap_wits, $aclabel )
306 unless $moved_wits{$_} } @check_for_gaps;
307 # Group the readings, collapsing groups by relationship if needed
308 my $grouped_readings = {};
309 foreach my $rdg ( values %readings_at_rank ) {
310 # Skip readings that have been collapsed into others.
311 next if exists $grouped_readings->{$rdg->id}
312 && $grouped_readings->{$rdg->id} eq 'COLLAPSE';
313 # Get the witness list, including from readings collapsed into this one.
314 my @wits = _table_witnesses( $table, $rdg, \%is_lacunose, $aclabel );
315 if( $collapse && @$collapse ) {
316 my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
317 foreach my $other ( $rdg->related_readings( $filter ) ) {
318 my @otherwits = _table_witnesses(
319 $table, $other, \%is_lacunose, $aclabel );
320 push( @wits, @otherwits );
321 $grouped_readings->{$other->id} = 'COLLAPSE';
324 $grouped_readings->{$rdg->id} = \@wits;
326 $grouped_readings->{'(omitted)'} = \@gap_wits if @gap_wits;
327 # Get rid of our collapsed readings
328 map { delete $grouped_readings->{$_} if $grouped_readings->{$_} eq 'COLLAPSE' }
329 keys %$grouped_readings
332 # If something was transposed, check the groups for doubled-up readings
333 if( $has_transposition ) {
334 # print STDERR "Group for rank $rank:\n";
335 # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" }
336 # keys %$grouped_readings;
337 _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings );
341 return $grouped_readings;
344 # Helper function to query the alignment table for all witnesses (a.c. included)
345 # that have a given reading at its rank.
346 sub _table_witnesses {
347 my( $table, $trdg, $lacunose, $aclabel ) = @_;
348 my $tableidx = $trdg->rank - 1;
350 foreach my $row ( @{$table->{'alignment'}} ) {
351 my $wit = $row->{'witness'};
352 next if $lacunose->{$wit};
353 my $rdg = $row->{'tokens'}->[$tableidx];
354 next unless exists $rdg->{'t'} && defined $rdg->{'t'};
355 _add_to_witlist( $wit, \@has_reading, $aclabel )
356 if $rdg->{'t'}->id eq $trdg->id;
361 # Helper function to ensure that X and X a.c. never appear in the same list.
362 sub _add_to_witlist {
363 my( $wit, $list, $acstr ) = @_;
366 map { $inlist{$_} = $idx++ } @$list;
367 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
369 unless( exists $inlist{$acwit} ) {
370 push( @$list, $acwit.$acstr );
373 if( exists( $inlist{$wit.$acstr} ) ) {
374 # Replace the a.c. version with the main witness
375 my $i = $inlist{$wit.$acstr};
378 push( @$list, $wit );
383 sub _check_transposed_consistency {
384 my( $c, $rank, $transposed, $groupings ) = @_;
387 # Note which readings are actually at this rank, and which witnesses
388 # belong to which reading.
389 foreach my $rdg ( keys %$groupings ) {
390 my $rdgobj = $c->reading( $rdg );
391 # Count '(omitted)' as a reading at this rank
392 $thisrank{$rdg} = 1 if !$rdgobj || $rdgobj->rank == $rank;
393 map { push( @{$seen_wits{$_}}, $rdg ) } @{$groupings->{$rdg}};
395 # Our work is done if we have no witness belonging to more than one
397 my @doubled = grep { scalar @{$seen_wits{$_}} > 1 } keys %seen_wits;
398 return unless @doubled;
399 # If we have a symmetric related transposition, drop the non-rank readings.
400 if( @doubled == scalar keys %seen_wits ) {
401 foreach my $rdg ( keys %$groupings ) {
402 if( !$thisrank{$rdg} ) {
403 my $groupstr = wit_stringify( $groupings->{$rdg} );
404 my ( $matched ) = grep { $groupstr eq wit_stringify( $groupings->{$_} ) }
406 delete $groupings->{$rdg};
407 # If we found a group match, assume there is a symmetry happening.
408 # TODO think more about this
409 # print STDERR "*** Deleting symmetric reading $rdg\n";
411 delete $transposed->{$rdg};
412 warn "Found problem in evident symmetry with reading $rdg";
416 # Otherwise 'unhook' the transposed reading(s) that have duplicates.
418 foreach my $dup ( @doubled ) {
419 foreach my $rdg ( @{$seen_wits{$dup}} ) {
420 next if $thisrank{$rdg};
421 next unless exists $groupings->{$rdg};
422 # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
423 delete $groupings->{$rdg};
424 delete $transposed->{$rdg};
427 # and put any now-orphaned readings into an 'omitted' reading.
428 foreach my $wit ( keys %seen_wits ) {
429 unless( grep { exists $groupings->{$_} } @{$seen_wits{$wit}} ) {
430 $groupings->{'(omitted)'} = [] unless exists $groupings->{'(omitted)'};
431 _add_to_witlist( $wit, $groupings->{'(omitted)'}, $c->ac_label );
437 =head2 solve_variants( $graph, @groups )
439 Sends the set of groups to the external graph solver service and returns
440 a cleaned-up answer, adding the rank IDs back where they belong.
442 The JSON has the form
443 { "graph": [ stemmagraph DOT string without newlines ],
444 "groupings": [ array of arrays of groups, one per rank ] }
446 The answer has the form
447 { "variants" => [ array of variant location structures ],
448 "variant_count" => total,
449 "conflict_count" => number of conflicts detected,
450 "genealogical_count" => number of solutions found }
455 my( $stemma, @groups ) = @_;
456 my $aclabel = $stemma->collation->ac_label;
458 # Filter the groups down to distinct groups, and work out what graph
459 # should be used in the calculation of each group. We want to send each
460 # distinct problem to the solver only once.
461 # We need a whole bunch of lookup tables for this.
462 my $index_groupkeys = {}; # Save the order of readings
463 my $group_indices = {}; # Save the indices that have a given grouping
464 my $graph_problems = {}; # Save the groupings for the given graph
466 foreach my $idx ( 0..$#groups ) {
467 my $ghash = $groups[$idx];
469 # Sort the groupings from big to little, and scan for a.c. witnesses
470 # that would need an extended graph.
471 my @acwits; # note which AC witnesses crop up at this rank
472 my @idxkeys = sort { scalar @{$ghash->{$b}} <=> scalar @{$ghash->{$a}} }
474 foreach my $rdg ( @idxkeys ) {
475 my @sg = sort @{$ghash->{$rdg}};
476 push( @acwits, grep { $_ =~ /\Q$aclabel\E$/ } @sg );
477 push( @grouping, \@sg );
479 # Save the reading order
480 $index_groupkeys->{$idx} = \@idxkeys;
482 # Now associate the distinct group with this index
483 my $gstr = wit_stringify( \@grouping );
484 push( @{$group_indices->{$gstr}}, $idx );
486 # Finally, add the group to the list to be calculated for this graph.
487 map { s/\Q$aclabel\E$// } @acwits;
490 $graph = $stemma->extend_graph( \@acwits );
492 die "Unable to extend graph with @acwits";
494 unless( exists $graph_problems->{"$graph"} ) {
495 $graph_problems->{"$graph"} = { 'object' => $graph, 'groups' => [] };
497 push( @{$graph_problems->{"$graph"}->{'groups'}}, \@grouping );
500 ## For each distinct graph, send its groups to the solver.
501 my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
502 my $ua = LWP::UserAgent->new();
503 ## Witness map is a HACK to get around limitations in node names from IDP
504 my $witness_map = {};
505 ## Variables to store answers as they come back
506 my $variants = [ ( undef ) x ( scalar keys %$index_groupkeys ) ];
507 my $genealogical = 0;
508 foreach my $graphkey ( keys %$graph_problems ) {
509 my $graph = $graph_problems->{$graphkey}->{'object'};
510 my $groupings = $graph_problems->{$graphkey}->{'groups'};
511 my $req = _safe_wit_strings( $graph, $stemma->collation,
512 $groupings, $witness_map );
513 $req->{'command'} = 'findGroupings';
514 my $json = encode_json( $req );
515 # Send it off and get the result
516 #print STDERR "Sending request: $json\n";
517 my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json',
518 'Content' => $json );
521 if( $resp->is_success ) {
522 $answer = _desanitize_names( decode_json( $resp->content ), $witness_map );
525 # Fall back to the old method.
526 warn "IDP solver returned " . $resp->status_line . " / " . $resp->content
527 . "; falling back to perl method";
528 $answer = perl_solver( $graph, @$groupings );
530 ## The answer is the evaluated groupings, plus a boolean for whether
531 ## they were genealogical. Reconstruct our original groups.
532 foreach my $gidx ( 0 .. $#{$groupings} ) {
533 my( $calc_groups, $result ) = @{$answer->[$gidx]};
536 # Prune the calculated groups, in case the IDP solver failed to.
539 foreach my $cg ( @$calc_groups ) {
540 # This is a little wasteful but the path of least
541 # resistance. Send both the stemma, which knows what
542 # its hypotheticals are, and the actual graph used.
543 my @pg = _prune_group( $cg, $stemma, $graph );
544 push( @pruned_groups, \@pg );
546 $calc_groups = \@pruned_groups;
549 # Retrieve the key for the original group that went to the solver
550 my $input_group = wit_stringify( $groupings->[$gidx] );
551 foreach my $oidx ( @{$group_indices->{$input_group}} ) {
552 my @readings = @{$index_groupkeys->{$oidx}};
554 'genealogical' => $result,
557 foreach my $ridx ( 0 .. $#readings ) {
558 push( @{$vstruct->{'readings'}},
559 { 'readingid' => $readings[$ridx],
560 'group' => $calc_groups->[$ridx] } );
562 $variants->[$oidx] = $vstruct;
567 return { 'variants' => $variants,
568 'variant_count' => scalar @$variants,
569 'genealogical_count' => $genealogical };
572 #### HACKERY to cope with IDP's limited idea of what a node name looks like ###
574 sub _safe_wit_strings {
575 my( $graph, $c, $groupings, $witness_map ) = @_;
576 # Parse the graph we were given into a stemma.
577 my $safegraph = Graph->new();
578 # Convert the graph to a safe representation and store the conversion.
579 foreach my $n ( $graph->vertices ) {
580 my $sn = _safe_witstr( $n );
581 if( exists $witness_map->{$sn} ) {
582 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
583 if $witness_map->{$sn} ne $n;
585 $witness_map->{$sn} = $n;
587 $safegraph->add_vertex( $sn );
588 $safegraph->set_vertex_attributes( $sn,
589 $graph->get_vertex_attributes( $n ) );
591 foreach my $e ( $graph->edges ) {
592 my @safe_e = ( _safe_witstr( $e->[0] ), _safe_witstr( $e->[1] ) );
593 $safegraph->add_edge( @safe_e );
595 my $safe_stemma = Text::Tradition::Stemma->new(
596 'collation' => $c, 'graph' => $safegraph );
598 # Now convert the witness groupings to a safe representation.
599 my $safe_groupings = [];
600 foreach my $grouping ( @$groupings ) {
601 my $safe_grouping = [];
602 foreach my $group ( @$grouping ) {
604 foreach my $n ( @$group ) {
605 my $sn = _safe_witstr( $n );
606 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
607 if exists $witness_map->{$sn} && $witness_map->{$sn} ne $n;
608 $witness_map->{$sn} = $n;
609 push( @$safe_group, $sn );
611 push( @$safe_grouping, $safe_group );
613 push( @$safe_groupings, $safe_grouping );
616 # Return it all in the struct we expect. We have stored the reductions
617 # in the $witness_map that we were passed.
618 return { 'graph' => $safe_stemma->editable( { 'linesep' => ' ' } ),
619 'groupings' => $safe_groupings };
624 $witstr =~ s/\s+/_/g;
625 $witstr =~ s/[^\w\d-]//g;
629 sub _desanitize_names {
630 my( $jsonstruct, $witness_map ) = @_;
632 foreach my $grouping ( @$jsonstruct ) {
633 my $real_grouping = [];
634 foreach my $element ( @$grouping ) {
635 if( ref( $element ) eq 'ARRAY' ) {
637 my $real_groupset = [];
638 foreach my $group ( @$element ) {
640 foreach my $n ( @$group ) {
641 my $rn = $witness_map->{$n};
642 push( @$real_group, $rn );
644 push( @$real_groupset, $real_group );
646 push( @$real_grouping, $real_groupset );
648 # It is the boolean, not actually a group.
649 push( @$real_grouping, $element );
652 push( @$result, $real_grouping );
659 =head2 analyze_location ( $tradition, $graph, $location_hash )
661 Given the tradition, its stemma graph, and the solution from the graph solver,
662 work out the rest of the information we want. For each reading we need missing,
663 conflict, reading_parents, independent_occurrence, followed, not_followed, and follow_unknown. Alters the location_hash in place.
667 sub analyze_location {
668 my ( $tradition, $stemma, $variant_row, $lacunose ) = @_;
669 my $c = $tradition->collation;
671 # Make a hash of all known node memberships, and make the subgraphs.
673 my $reading_roots = {};
675 my $acstr = $c->ac_label;
677 # Note which witnesses positively belong to which group
678 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
679 my $rid = $rdghash->{'readingid'};
680 foreach my $wit ( @{$rdghash->{'group'}} ) {
681 $contig->{$wit} = $rid;
682 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
687 # Get the actual graph we should work with
690 $graph = @acwits ? $stemma->extend_graph( \@acwits ) : $stemma->graph;
692 die "Could not extend graph with a.c. witnesses @acwits";
695 # Now, armed with that knowledge, make a subgraph for each reading
696 # and note the root(s) of each subgraph.
697 foreach my $rdghash( @{$variant_row->{'readings'}} ) {
698 my $rid = $rdghash->{'readingid'};
701 my $part = $graph->copy;
702 my @todelete = grep { exists $contig->{$_} && $contig->{$_} ne $rid }
704 $part->delete_vertices( @todelete );
705 _prune_subtree( $part, $lacunose );
706 $subgraph->{$rid} = $part;
707 # Record the remaining lacunose nodes as part of this group, if
708 # we are dealing with a non-genealogical reading.
709 unless( $variant_row->{'genealogical'} ) {
710 map { $contig->{$_} = $rid } $part->vertices;
712 # Get the reading roots.
713 map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices;
716 # Now that we have all the node group memberships, calculate followed/
717 # non-followed/unknown values for each reading. Also figure out the
718 # reading's evident parent(s).
719 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
720 my $rid = $rdghash->{'readingid'};
721 my $rdg = $c->reading( $rid );
723 my $part = $subgraph->{$rid};
725 # Start figuring things out.
726 my @roots = grep { $reading_roots->{$_} eq $rid } keys %$reading_roots;
727 $rdghash->{'independent_occurrence'} = \@roots;
728 $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots );
729 # Find the parent readings, if any, of this reading.
731 foreach my $wit ( @roots ) {
732 # Look in the main stemma to find this witness's extant or known-reading
733 # immediate ancestor(s), and look up the reading that each ancestor olds.
734 my @check = $graph->predecessors( $wit );
737 foreach my $wparent( @check ) {
738 my $preading = $contig->{$wparent};
739 # IDP assigns all nodes, hypothetical included, to a reading
740 # in the case of genealogical sets. We prune non-necessary
741 # hypothetical readings, but they are still in $contig, so
742 # we account for that here.
743 if( $preading && $preading ne $rid ) {
744 $rdgparents->{$preading} = 1;
746 push( @next, $graph->predecessors( $wparent ) );
752 foreach my $p ( keys %$rdgparents ) {
753 # Resolve the relationship of the parent to the reading, and
754 # save it in our hash.
755 my $pobj = $c->reading( $p );
756 my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
757 my $phash = { 'label' => $prep };
759 my $rel = $c->get_relationship( $p, $rdghash->{readingid} );
761 _add_to_hash( $rel, $phash );
763 # First check for a transposed relationship
764 if( $rdg->rank != $pobj->rank ) {
765 foreach my $ti ( $rdg->related_readings( 'transposition' ) ) {
766 next unless $ti->text eq $rdg->text;
767 $rel = $c->get_relationship( $ti, $pobj );
769 _add_to_hash( $rel, $phash, 1 );
774 foreach my $ti ( $pobj->related_readings( 'transposition' ) ) {
775 next unless $ti->text eq $pobj->text;
776 $rel = $c->get_relationship( $ti, $rdg );
778 _add_to_hash( $rel, $phash, 1 );
785 # and then check for sheer word similarity.
786 my $rtext = $rdg->text;
787 my $ptext = $pobj->text;
788 if( similar( $rtext, $ptext ) ) {
789 # say STDERR "Words $rtext and $ptext judged similar";
790 $phash->{relation} = { type => 'wordsimilar' };
794 $phash->{relation} = { type => 'deletion' };
796 # Get the attributes of the parent object while we are here
797 $phash->{'text'} = $pobj->text if $pobj;
798 $phash->{'is_nonsense'} = $pobj->is_nonsense;
799 $phash->{'is_ungrammatical'} = $pobj->grammar_invalid;
800 } elsif( $p eq '(omitted)' ) {
801 $phash->{relation} = { type => 'addition' };
804 $rdgparents->{$p} = $phash;
807 $rdghash->{'reading_parents'} = $rdgparents;
809 # Find the number of times this reading was altered, and the number of
810 # times we're not sure.
811 my( %nofollow, %unknownfollow );
812 foreach my $wit ( $part->vertices ) {
813 foreach my $wchild ( $graph->successors( $wit ) ) {
814 next if $part->has_vertex( $wchild );
815 if( $reading_roots->{$wchild} && $contig->{$wchild} ) {
816 # It definitely changed here.
817 $nofollow{$wchild} = 1;
818 } elsif( !($contig->{$wchild}) ) {
819 # The child is a hypothetical node not definitely in
820 # any group. Answer is unknown.
821 $unknownfollow{$wchild} = 1;
822 } # else it's a non-root node in a known group, and therefore
823 # is presumed to have its reading from its group, not this link.
826 $rdghash->{'not_followed'} = keys %nofollow;
827 $rdghash->{'follow_unknown'} = keys %unknownfollow;
829 # Now say whether this reading represents a conflict.
830 unless( $variant_row->{'genealogical'} ) {
831 $rdghash->{'conflict'} = @roots != 1;
837 my( $rel, $phash, $is_transposed ) = @_;
838 $phash->{relation} = { type => $rel->type };
839 $phash->{relation}->{transposed} = 1 if $is_transposed;
840 $phash->{relation}->{annotation} = $rel->annotation
841 if $rel->has_annotation;
844 =head2 similar( $word1, $word2 )
846 Use Algorithm::Diff to get a sense of how close the words are to each other.
847 This will hopefully handle substitutions a bit more nicely than Levenshtein.
854 my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_;
855 my @let1 = split( '', lc( $word1 ) );
856 my @let2 = split( '', lc( $word2 ) );
857 my $diff = Algorithm::Diff->new( \@let1, \@let2 );
859 while( $diff->Next ) {
861 # Take off points for longer strings
862 my $cs = $diff->Range(1) - 2;
865 } elsif( !$diff->Items(1) ) {
866 $mag += $diff->Range(2);
867 } elsif( !$diff->Items(2) ) {
868 $mag += $diff->Range(1);
870 # Split the difference for substitutions
871 my $c1 = $diff->Range(1) || 1;
872 my $c2 = $diff->Range(2) || 1;
873 my $cd = ( $c1 + $c2 ) / 2;
877 return ( $mag <= length( $word1 ) / 2 );
882 =head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
884 ** NOTE ** This method should hopefully not be called - it is not guaranteed
885 to be correct. Serves as a backup for the real solver.
887 Runs an analysis of the given tradition, at the location given in $rank,
888 against the graph of the stemma specified in $stemma_id. The argument
889 @merge_relationship_types is an optional list of relationship types for
890 which readings so related should be treated as equivalent.
892 Returns a nested array data structure as follows:
894 [ [ group_list, is_genealogical ], [ group_list, is_genealogical ] ... ]
896 where the group list is the array of arrays passed in for each element of @groups,
897 possibly with the addition of hypothetical readings.
903 my( $graph, @groups ) = @_;
905 foreach my $g ( @groups ) {
906 push( @answer, _solve_variant_location( $graph, $g ) );
911 sub _solve_variant_location {
912 my( $graph, $groups ) = @_;
919 # Mark each ms as in its own group, first.
920 foreach my $g ( @$groups ) {
921 my $gst = wit_stringify( $g );
922 map { $contig->{$_} = $gst } @$g;
925 # Now for each unmarked node in the graph, initialize an array
926 # for possible group memberships. We will use this later to
927 # resolve potential conflicts.
928 map { $contig->{$_} = [] unless $contig->{$_} } $graph->vertices;
929 foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
930 my $gst = wit_stringify( $g ); # This is the group name
931 # Copy the graph, and delete all non-members from the new graph.
932 my $part = $graph->copy;
934 $part->delete_vertices(
935 grep { !ref( $contig->{$_} ) && $contig->{$_} ne $gst } $graph->vertices );
937 # Now look to see if our group is connected.
939 # We have to take directionality into account.
940 # How many root nodes do we have?
941 my @roots = grep { ref( $contig->{$_} ) || $contig->{$_} eq $gst }
942 $part->predecessorless_vertices;
943 # Assuming that @$g > 1, find the first root node that has at
944 # least one successor belonging to our group. If this reading
945 # is genealogical, there should be only one, but we will check
946 # that implicitly later.
947 foreach my $root ( @roots ) {
948 # Prune the tree to get rid of extraneous hypotheticals.
949 $root = _prune_subtree_old( $part, $root, $contig );
951 # Save this root for our group.
952 push( @group_roots, $root );
953 # Get all the successor nodes of our root.
956 # Dispense with the trivial case of one reading.
958 @group_roots = ( $wit );
959 foreach my $v ( $part->vertices ) {
960 $part->delete_vertex( $v ) unless $v eq $wit;
964 if( @group_roots > 1 ) {
965 $conflict->{$gst} = 1;
968 # Paint the 'hypotheticals' with our group.
969 foreach my $wit ( $part->vertices ) {
970 if( ref( $contig->{$wit} ) ) {
971 push( @{$contig->{$wit}}, $gst );
972 } elsif( $contig->{$wit} ne $gst ) {
973 warn "How did we get here?";
978 # Save the relevant subgraph.
979 $subgraph->{$gst} = $part;
982 # For each of our hypothetical readings, flatten its 'contig' array if
983 # the array contains zero or one group. If we have any unflattened arrays,
984 # we may need to run the resolution process. If the reading is already known
985 # to have a conflict, flatten the 'contig' array to nothing; we won't resolve
988 foreach my $wit ( keys %$contig ) {
989 next unless ref( $contig->{$wit} );
990 if( @{$contig->{$wit}} > 1 ) {
991 if( $is_conflicted ) {
992 $contig->{$wit} = ''; # We aren't going to decide.
994 push( @resolve, $wit );
997 my $gst = pop @{$contig->{$wit}};
998 $contig->{$wit} = $gst || '';
1003 my $still_contig = {};
1004 foreach my $h ( @resolve ) {
1005 # For each of the hypothetical readings with more than one possibility,
1006 # try deleting it from each of its member subgraphs in turn, and see
1007 # if that breaks the contiguous grouping.
1008 # TODO This can still break in a corner case where group A can use
1009 # either vertex 1 or 2, and group B can use either vertex 2 or 1.
1010 # Revisit this if necessary; it could get brute-force nasty.
1011 foreach my $gst ( @{$contig->{$h}} ) {
1012 my $gpart = $subgraph->{$gst}->copy();
1013 # If we have come this far, there is only one root and everything
1014 # is reachable from it.
1015 my( $root ) = $gpart->predecessorless_vertices;
1017 map { $reachable->{$_} = 1 } $gpart->vertices;
1019 # Try deleting the hypothetical node.
1020 $gpart->delete_vertex( $h );
1022 # See if we still have a single root.
1023 my @roots = $gpart->predecessorless_vertices;
1024 warn "This shouldn't have happened" unless @roots;
1026 # $h is needed by this group.
1027 if( exists( $still_contig->{$h} ) ) {
1029 $conflict->{$gst} = 1;
1030 $still_contig->{$h} = '';
1032 $still_contig->{$h} = $gst;
1036 # $h is somewhere in the middle. See if everything
1037 # else can still be reached from the root.
1038 my %still_reachable = ( $root => 1 );
1039 map { $still_reachable{$_} = 1 }
1040 $gpart->all_successors( $root );
1041 foreach my $v ( keys %$reachable ) {
1043 if( !$still_reachable{$v}
1044 && ( $contig->{$v} eq $gst
1045 || ( exists $still_contig->{$v}
1046 && $still_contig->{$v} eq $gst ) ) ) {
1048 if( exists $still_contig->{$h} ) {
1050 $conflict->{$gst} = 1;
1051 $still_contig->{$h} = '';
1053 $still_contig->{$h} = $gst;
1056 } # else we don't need $h in this group.
1058 } # endif $h eq $root
1059 } # end foreach $gst
1062 # Now we have some hypothetical vertices in $still_contig that are the
1063 # "real" group memberships. Replace these in $contig.
1064 foreach my $v ( keys %$contig ) {
1065 next unless ref $contig->{$v};
1066 $contig->{$v} = $still_contig->{$v};
1070 my $is_genealogical = keys %$conflict ? JSON::false : JSON::true;
1071 my $variant_row = [ [], $is_genealogical ];
1072 # Fill in the groupings from $contig.
1073 foreach my $g ( @$groups ) {
1074 my $gst = wit_stringify( $g );
1075 my @realgroup = grep { $contig->{$_} eq $gst } keys %$contig;
1076 push( @{$variant_row->[0]}, \@realgroup );
1078 return $variant_row;
1082 my( $group, $stemma, $graph ) = @_;
1084 map { $lacunose->{$_} = 1 } $stemma->hypotheticals;
1085 map { $lacunose->{$_} = 0 } @$group;
1087 my $subgraph = $graph->copy;
1088 map { $subgraph->delete_vertex( $_ ) unless exists $lacunose->{$_} }
1089 $subgraph->vertices;
1090 # ...and find the root.
1091 # Now prune and return the remaining vertices.
1092 _prune_subtree( $subgraph, $lacunose );
1093 return $subgraph->vertices;
1096 sub _prune_subtree {
1097 my( $tree, $lacunose ) = @_;
1099 # Delete lacunose witnesses that have no successors
1100 my @orphan_hypotheticals;
1103 die "Infinite loop on leaves" if $ctr > 100;
1104 @orphan_hypotheticals = grep { $lacunose->{$_} }
1105 $tree->successorless_vertices;
1106 $tree->delete_vertices( @orphan_hypotheticals );
1108 } while( @orphan_hypotheticals );
1110 # Delete lacunose roots that have a single successor
1114 die "Infinite loop on roots" if $ctr > 100;
1115 @redundant_root = grep { $lacunose->{$_} && $tree->successors( $_ ) == 1 }
1116 $tree->predecessorless_vertices;
1117 $tree->delete_vertices( @redundant_root );
1119 } while( @redundant_root );
1122 sub _prune_subtree_old {
1123 my( $tree, $root, $contighash ) = @_;
1124 # First, delete hypothetical leaves / orphans until there are none left.
1125 my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
1126 $tree->successorless_vertices;
1127 while( @orphan_hypotheticals ) {
1128 $tree->delete_vertices( @orphan_hypotheticals );
1129 @orphan_hypotheticals = grep { ref( $contighash->{$_} ) }
1130 $tree->successorless_vertices;
1132 # Then delete a hypothetical root with only one successor, moving the
1133 # root to the first child that has no other predecessors.
1134 while( $tree->successors( $root ) == 1 && ref $contighash->{$root} ) {
1135 my @nextroot = $tree->successors( $root );
1136 $tree->delete_vertex( $root );
1137 ( $root ) = grep { $tree->is_predecessorless_vertex( $_ ) } @nextroot;
1139 # The tree has been modified in place, but we need to know the new root.
1140 $root = undef unless $root && $tree->has_vertex( $root );
1143 # Add the variant, subject to a.c. representation logic.
1144 # This assumes that we will see the 'main' version before the a.c. version.
1145 sub add_variant_wit {
1146 my( $arr, $wit, $acstr ) = @_;
1148 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
1150 $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
1152 push( @$arr, $wit ) unless $skip;
1155 sub _useful_variant {
1156 my( $group_readings, $graph, $acstr ) = @_;
1158 # TODO Decide what to do with AC witnesses
1160 # Sort by group size and return
1162 my( @readings, @groups ); # The sorted groups for our answer.
1163 foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
1164 keys %$group_readings ) {
1165 push( @readings, $rdg );
1166 push( @groups, $group_readings->{$rdg} );
1167 if( @{$group_readings->{$rdg}} > 1 ) {
1170 my( $wit ) = @{$group_readings->{$rdg}};
1171 $wit =~ s/^(.*)\Q$acstr\E$/$1/;
1172 $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
1175 if( $is_useful > 1 ) {
1176 return( \@readings, \@groups );
1182 =head2 wit_stringify( $groups )
1184 Takes an array of witness groupings and produces a string like
1185 ['A','B'] / ['C','D','E'] / ['F']
1192 # If we were passed an array of witnesses instead of an array of
1193 # groupings, then "group" the witnesses first.
1194 unless( ref( $groups->[0] ) ) {
1195 my $mkgrp = [ $groups ];
1198 foreach my $g ( @$groups ) {
1199 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
1201 return join( ' / ', @gst );
1205 my( $lista, $listb ) = @_;
1208 map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
1209 map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
1210 my @set = grep { $union{$_} == 1 } keys %union;
1211 return map { $scalars{$_} } @set;
1218 This package is free software and is provided "as is" without express
1219 or implied warranty. You can redistribute it and/or modify it under
1220 the same terms as Perl itself.
1224 Tara L Andrews E<lt>aurum@cpan.orgE<gt>