overhaul of analysis with corresponding updates to stemma graph generation
[scpubgit/stemmatology.git] / lib / Text / Tradition / Analysis.pm
CommitLineData
d71100ed 1package Text::Tradition::Analysis;
2
3use strict;
4use warnings;
428bcf0b 5use Algorithm::Diff; # for word similarity measure
e4386ba9 6use Benchmark;
88a6bac5 7use Encode qw/ encode_utf8 /;
d1348d38 8use Exporter 'import';
b4cb2d60 9use Graph;
88a6bac5 10use JSON qw/ encode_json decode_json /;
11use LWP::UserAgent;
d71100ed 12use Text::Tradition;
13use Text::Tradition::Stemma;
d120c995 14use TryCatch;
d71100ed 15
d1348d38 16use vars qw/ @EXPORT_OK /;
a2cf85dd 17@EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
d1348d38 18
5c44c598 19my $SOLVER_URL = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
20
21
7f52eac8 22=head1 NAME
23
24Text::Tradition::Analysis - functions for stemma analysis of a tradition
25
26=head1 SYNOPSIS
27
28 use Text::Tradition;
29 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
30 my $t = Text::Tradition->new(
31 'name' => 'this is a text',
32 'input' => 'TEI',
33 'file' => '/path/to/tei_parallel_seg_file.xml' );
34 $t->add_stemma( 'dotfile' => $stemmafile );
35
36 my $variant_data = run_analysis( $tradition );
37 # Recalculate rank $n treating all orthographic variants as equivalent
38 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
39
40=head1 DESCRIPTION
41
42Text::Tradition is a library for representation and analysis of collated
43texts, particularly medieval ones. The Collation is the central feature of
44a Tradition, where the text, its sequence of readings, and its relationships
45between readings are actually kept.
46
47=head1 SUBROUTINES
48
88a6bac5 49=head2 run_analysis( $tradition, %opts )
7f52eac8 50
88a6bac5 51Runs the analysis described in analyze_variant_location on every location in the
52collation of the given tradition, with the given options. These include:
7f52eac8 53
88a6bac5 54=over 4
55
56=item * stemma_id - Specify which of the tradition's stemmata to use. Default
57is 0 (i.e. the first).
58
59=item * ranks - Specify a list of location ranks to analyze; exclude the rest.
60
61=item * merge_types - Specify a list of relationship types, where related readings
62should be treated as identical for the purposes of analysis.
63
ffa22d6f 64=item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
65
88a6bac5 66=back
7f52eac8 67
68=begin testing
69
70use Text::Tradition;
71use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
72
73my $datafile = 't/data/florilegium_tei_ps.xml';
74my $tradition = Text::Tradition->new( 'input' => 'TEI',
75 'name' => 'test0',
76 'file' => $datafile );
77my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
78is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
79
f00cefe8 80my %expected_genealogical = (
a44aaf2a 81 1 => 0,
f00cefe8 82 2 => 1,
a44aaf2a 83 3 => 0,
84 5 => 0,
85 7 => 0,
86 8 => 0,
87 10 => 0,
f00cefe8 88 13 => 1,
a44aaf2a 89 33 => 0,
90 34 => 0,
91 37 => 0,
92 60 => 0,
f00cefe8 93 81 => 1,
a44aaf2a 94 84 => 0,
95 87 => 0,
96 101 => 0,
97 102 => 0,
f00cefe8 98 122 => 1,
a44aaf2a 99 157 => 0,
f00cefe8 100 166 => 1,
101 169 => 1,
a44aaf2a 102 200 => 0,
f00cefe8 103 216 => 1,
104 217 => 1,
105 219 => 1,
106 241 => 1,
107 242 => 1,
108 243 => 1,
109);
110
7f52eac8 111my $data = run_analysis( $tradition );
7234b01d 112my $c = $tradition->collation;
f00cefe8 113foreach my $row ( @{$data->{'variants'}} ) {
a44aaf2a 114 # Account for rows that used to be "not useful"
115 unless( exists $expected_genealogical{$row->{'id'}} ) {
116 $expected_genealogical{$row->{'id'}} = 1;
117 }
18f48b82 118 my $gen_bool = $row->{'genealogical'} ? 1 : 0;
119 is( $gen_bool, $expected_genealogical{$row->{'id'}},
f00cefe8 120 "Got correct genealogical flag for row " . $row->{'id'} );
7234b01d 121 # Check that we have the right row with the right groups
122 my $rank = $row->{'id'};
123 foreach my $rdghash ( @{$row->{'readings'}} ) {
124 # Skip 'readings' that aren't really
125 next unless $c->reading( $rdghash->{'readingid'} );
126 # Check the rank
127 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
128 "Got correct reading rank" );
129 # Check the witnesses
130 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
131 my @sgrp = sort @{$rdghash->{'group'}};
132 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
133 }
f00cefe8 134}
a44aaf2a 135is( $data->{'variant_count'}, 58, "Got right total variant number" );
b4cb2d60 136# TODO Make something meaningful of conflict count, maybe test other bits
7f52eac8 137
138=end testing
139
140=cut
141
d71100ed 142sub run_analysis {
88a6bac5 143 my( $tradition, %opts ) = @_;
f00cefe8 144 my $c = $tradition->collation;
88a6bac5 145
146 my $stemma_id = $opts{'stemma_id'} || 0;
1d73ecad 147 my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
148 my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
88a6bac5 149
150 # Get the stemma
151 my $stemma = $tradition->stemma( $stemma_id );
b4cb2d60 152
4ce27d42 153 # Figure out which witnesses we are working with - that is, the ones that
154 # appear both in the stemma and in the tradition. All others are 'lacunose'
155 # for our purposes.
88a6bac5 156 my @lacunose = $stemma->hypotheticals;
fae07016 157 my @tradition_wits = map { $_->sigil } $tradition->witnesses;
fae07016 158 push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
88a6bac5 159
160 # Find and mark 'common' ranks for exclusion, unless they were
161 # explicitly specified.
162 unless( @ranks ) {
163 my %common_rank;
a44aaf2a 164 foreach my $rdg ( $c->common_readings ) {
88a6bac5 165 $common_rank{$rdg->rank} = 1;
166 }
167 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
d71100ed 168 }
7f52eac8 169
88a6bac5 170 # Group the variants to send to the solver
171 my @groups;
f629cb3b 172 my @use_ranks;
a44aaf2a 173 my %lacunae;
94654e27 174 my $moved = {};
88a6bac5 175 foreach my $rank ( @ranks ) {
a44aaf2a 176 my $missing = [ @lacunose ];
94654e27 177 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, \@collapse );
178 # Filter out any empty rankgroups
179 # (e.g. from the later rank for a transposition)
180 next unless keys %$rankgroup;
ffa22d6f 181 if( $opts{'exclude_type1'} ) {
182 # Check to see whether this is a "useful" group.
183 my( $rdgs, $grps ) = _useful_variant( $rankgroup,
184 $stemma->graph, $c->ac_label );
185 next unless @$rdgs;
186 }
f629cb3b 187 push( @use_ranks, $rank );
ffa22d6f 188 push( @groups, $rankgroup );
a44aaf2a 189 $lacunae{$rank} = $missing;
d71100ed 190 }
4ce27d42 191 # Run the solver
e59b8faa 192 my $answer = solve_variants( $stemma, @groups );
fae07016 193
88a6bac5 194 # Do further analysis on the answer
a44aaf2a 195 my $conflict_count = 0;
7234b01d 196 my $aclabel = $c->ac_label;
f629cb3b 197 foreach my $idx ( 0 .. $#use_ranks ) {
88a6bac5 198 my $location = $answer->{'variants'}->[$idx];
199 # Add the rank back in
94654e27 200 my $rank = $use_ranks[$idx];
201 $location->{'id'} = $rank;
7234b01d 202 # Note what our lacunae are
f629cb3b 203 my %lmiss;
7234b01d 204 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
7234b01d 205 $location->{'missing'} = [ keys %lmiss ];
206
88a6bac5 207 # Run the extra analysis we need.
428bcf0b 208 ## TODO We run through all the variants in this call, so
209 ## why not add the reading data there instead of here below?
638e2a95 210 analyze_location( $tradition, $stemma, $location, \%lmiss );
7234b01d 211
638e2a95 212 my @layerwits;
7234b01d 213 # Do the final post-analysis tidying up of the data.
a44aaf2a 214 foreach my $rdghash ( @{$location->{'readings'}} ) {
215 $conflict_count++
216 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
94654e27 217 # Add the reading text back in, setting display value as needed
a44aaf2a 218 my $rdg = $c->reading( $rdghash->{'readingid'} );
94654e27 219 if( $rdg ) {
220 $rdghash->{'text'} = $rdg->text .
221 ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
7b7abf10 222 $rdghash->{'is_ungrammatical'} = $rdg->grammar_invalid;
223 $rdghash->{'is_nonsense'} = $rdg->is_nonsense;
94654e27 224 }
f629cb3b 225 # Remove lacunose witnesses from this reading's list now that the
7234b01d 226 # analysis is done
f629cb3b 227 my @realgroup;
7234b01d 228 map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
f629cb3b 229 $rdghash->{'group'} = \@realgroup;
638e2a95 230 # Note any layered witnesses that appear in this group
231 foreach( @realgroup ) {
232 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
233 push( @layerwits, $1 );
234 }
235 }
a44aaf2a 236 }
638e2a95 237 $location->{'layerwits'} = \@layerwits if @layerwits;
88a6bac5 238 }
a44aaf2a 239 $answer->{'conflict_count'} = $conflict_count;
f00cefe8 240
88a6bac5 241 return $answer;
d71100ed 242}
243
7f52eac8 244=head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
245
246Groups the variants at the given $rank of the collation, treating any
247relationships in @merge_relationship_types as equivalent. $lacunose should
248be a reference to an array, to which the sigla of lacunose witnesses at this
94654e27 249rank will be appended; $transposed should be a reference to a hash, wherein
250the identities of transposed readings and their relatives will be stored.
7f52eac8 251
ffa22d6f 252Returns a hash $group_readings where $rdg is attested by the witnesses listed
253in $group_readings->{$rdg}.
7f52eac8 254
255=cut
256
257# Return group_readings, groups, lacunose
d1348d38 258sub group_variants {
94654e27 259 my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
7f52eac8 260 my $c = $tradition->collation;
335a62ef 261 my $aclabel = $c->ac_label;
d120c995 262 my $table = $c->alignment_table;
7f52eac8 263 # Get the alignment table readings
264 my %readings_at_rank;
d120c995 265 my %is_lacunose; # lookup table for witnesses not in stemma
266 map { $is_lacunose{$_} = 1; $is_lacunose{$_.$aclabel} = 1 } @$lacunose;
94654e27 267 my @check_for_gaps;
268 my %moved_wits;
d120c995 269 my $has_transposition;
270 foreach my $tablewit ( @{$table->{'alignment'}} ) {
7f52eac8 271 my $rdg = $tablewit->{'tokens'}->[$rank-1];
fae07016 272 my $wit = $tablewit->{'witness'};
ffa22d6f 273 # Exclude the witness if it is "lacunose" which if we got here
274 # means "not in the stemma".
275 next if $is_lacunose{$wit};
94654e27 276 # Note if the witness is actually in a lacuna
7f52eac8 277 if( $rdg && $rdg->{'t'}->is_lacuna ) {
335a62ef 278 _add_to_witlist( $wit, $lacunose, $aclabel );
94654e27 279 # Otherwise the witness either has a positive reading...
7f52eac8 280 } elsif( $rdg ) {
94654e27 281 # If the reading has been counted elsewhere as a transposition, ignore it.
282 if( $transposed->{$rdg->{'t'}->id} ) {
d120c995 283 # TODO Does this cope with three-way transpositions?
94654e27 284 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
285 next;
286 }
287 # Otherwise, record it...
288 $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
289 # ...and grab any transpositions, and their relations.
290 my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings();
291 foreach my $trdg ( @transp ) {
d120c995 292 next if exists $readings_at_rank{$trdg->id};
293 $has_transposition = 1;
294 my @affected_wits = _table_witnesses(
295 $table, $trdg, \%is_lacunose, $aclabel );
296 next unless @affected_wits;
297 map { $moved_wits{$_} = 1 } @affected_wits;
298 $transposed->{$trdg->id} =
299 [ _table_witnesses( $table, $rdg->{'t'}, \%is_lacunose, $aclabel ) ];
94654e27 300 $readings_at_rank{$trdg->id} = $trdg;
301 }
302 # ...or it is empty, ergo a gap.
7f52eac8 303 } else {
d120c995 304 _add_to_witlist( $wit, \@check_for_gaps, $aclabel );
7f52eac8 305 }
306 }
94654e27 307 my @gap_wits;
308 map { _add_to_witlist( $_, \@gap_wits, $aclabel )
309 unless $moved_wits{$_} } @check_for_gaps;
7f52eac8 310 # Group the readings, collapsing groups by relationship if needed
d120c995 311 my $grouped_readings = {};
4ce27d42 312 foreach my $rdg ( values %readings_at_rank ) {
7f52eac8 313 # Skip readings that have been collapsed into others.
d120c995 314 next if exists $grouped_readings->{$rdg->id}
315 && $grouped_readings->{$rdg->id} eq 'COLLAPSE';
4ce27d42 316 # Get the witness list, including from readings collapsed into this one.
d120c995 317 my @wits = _table_witnesses( $table, $rdg, \%is_lacunose, $aclabel );
318 if( $collapse && @$collapse ) {
7f52eac8 319 my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
320 foreach my $other ( $rdg->related_readings( $filter ) ) {
d120c995 321 my @otherwits = _table_witnesses(
322 $table, $other, \%is_lacunose, $aclabel );
fae07016 323 push( @wits, @otherwits );
d120c995 324 $grouped_readings->{$other->id} = 'COLLAPSE';
d1348d38 325 }
326 }
d120c995 327 $grouped_readings->{$rdg->id} = \@wits;
7f52eac8 328 }
d120c995 329 $grouped_readings->{'(omitted)'} = \@gap_wits if @gap_wits;
7f52eac8 330 # Get rid of our collapsed readings
d120c995 331 map { delete $grouped_readings->{$_} if $grouped_readings->{$_} eq 'COLLAPSE' }
332 keys %$grouped_readings
7f52eac8 333 if $collapse;
d120c995 334
335 # If something was transposed, check the groups for doubled-up readings
336 if( $has_transposition ) {
62a39b8f 337 # print STDERR "Group for rank $rank:\n";
338 # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" }
339 # keys %$grouped_readings;
d120c995 340 _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings );
341 }
7f52eac8 342
4ce27d42 343 # Return the result
d120c995 344 return $grouped_readings;
345}
346
347# Helper function to query the alignment table for all witnesses (a.c. included)
348# that have a given reading at its rank.
349sub _table_witnesses {
350 my( $table, $trdg, $lacunose, $aclabel ) = @_;
351 my $tableidx = $trdg->rank - 1;
352 my @has_reading;
353 foreach my $row ( @{$table->{'alignment'}} ) {
354 my $wit = $row->{'witness'};
355 next if $lacunose->{$wit};
356 my $rdg = $row->{'tokens'}->[$tableidx];
357 next unless exists $rdg->{'t'} && defined $rdg->{'t'};
358 _add_to_witlist( $wit, \@has_reading, $aclabel )
359 if $rdg->{'t'}->id eq $trdg->id;
360 }
361 return @has_reading;
d1348d38 362}
363
335a62ef 364# Helper function to ensure that X and X a.c. never appear in the same list.
365sub _add_to_witlist {
366 my( $wit, $list, $acstr ) = @_;
367 my %inlist;
368 my $idx = 0;
369 map { $inlist{$_} = $idx++ } @$list;
370 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
371 my $acwit = $1;
372 unless( exists $inlist{$acwit} ) {
373 push( @$list, $acwit.$acstr );
374 }
375 } else {
376 if( exists( $inlist{$wit.$acstr} ) ) {
377 # Replace the a.c. version with the main witness
378 my $i = $inlist{$wit.$acstr};
379 $list->[$i] = $wit;
380 } else {
381 push( @$list, $wit );
382 }
383 }
384}
385
d120c995 386sub _check_transposed_consistency {
387 my( $c, $rank, $transposed, $groupings ) = @_;
388 my %seen_wits;
389 my %thisrank;
390 # Note which readings are actually at this rank, and which witnesses
391 # belong to which reading.
392 foreach my $rdg ( keys %$groupings ) {
393 my $rdgobj = $c->reading( $rdg );
394 # Count '(omitted)' as a reading at this rank
395 $thisrank{$rdg} = 1 if !$rdgobj || $rdgobj->rank == $rank;
396 map { push( @{$seen_wits{$_}}, $rdg ) } @{$groupings->{$rdg}};
397 }
398 # Our work is done if we have no witness belonging to more than one
399 # reading.
400 my @doubled = grep { scalar @{$seen_wits{$_}} > 1 } keys %seen_wits;
401 return unless @doubled;
402 # If we have a symmetric related transposition, drop the non-rank readings.
403 if( @doubled == scalar keys %seen_wits ) {
404 foreach my $rdg ( keys %$groupings ) {
405 if( !$thisrank{$rdg} ) {
406 my $groupstr = wit_stringify( $groupings->{$rdg} );
407 my ( $matched ) = grep { $groupstr eq wit_stringify( $groupings->{$_} ) }
408 keys %thisrank;
409 delete $groupings->{$rdg};
410 # If we found a group match, assume there is a symmetry happening.
411 # TODO think more about this
62a39b8f 412 # print STDERR "*** Deleting symmetric reading $rdg\n";
d120c995 413 unless( $matched ) {
414 delete $transposed->{$rdg};
415 warn "Found problem in evident symmetry with reading $rdg";
416 }
417 }
418 }
419 # Otherwise 'unhook' the transposed reading(s) that have duplicates.
420 } else {
421 foreach my $dup ( @doubled ) {
422 foreach my $rdg ( @{$seen_wits{$dup}} ) {
423 next if $thisrank{$rdg};
424 next unless exists $groupings->{$rdg};
62a39b8f 425 # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
d120c995 426 delete $groupings->{$rdg};
427 delete $transposed->{$rdg};
428 }
429 }
430 # and put any now-orphaned readings into an 'omitted' reading.
431 foreach my $wit ( keys %seen_wits ) {
432 unless( grep { exists $groupings->{$_} } @{$seen_wits{$wit}} ) {
433 $groupings->{'(omitted)'} = [] unless exists $groupings->{'(omitted)'};
434 _add_to_witlist( $wit, $groupings->{'(omitted)'}, $c->ac_label );
435 }
436 }
437 }
438}
439
88a6bac5 440=head2 solve_variants( $graph, @groups )
441
442Sends the set of groups to the external graph solver service and returns
443a cleaned-up answer, adding the rank IDs back where they belong.
444
445The JSON has the form
446 { "graph": [ stemmagraph DOT string without newlines ],
447 "groupings": [ array of arrays of groups, one per rank ] }
448
449The answer has the form
450 { "variants" => [ array of variant location structures ],
451 "variant_count" => total,
452 "conflict_count" => number of conflicts detected,
453 "genealogical_count" => number of solutions found }
454
455=cut
456
457sub solve_variants {
e59b8faa 458 my( $stemma, @groups ) = @_;
335a62ef 459
460 # Filter the groups down to distinct groups, and work out what graph
461 # should be used in the calculation of each group. We want to send each
462 # distinct problem to the solver only once.
463 # We need a whole bunch of lookup tables for this.
5c44c598 464 my( $index_groupkeys, $group_indices, $graph_problems ) = _prepare_groups( @_ );
335a62ef 465
335a62ef 466 ## For each distinct graph, send its groups to the solver.
88a6bac5 467 my $ua = LWP::UserAgent->new();
335a62ef 468 ## Witness map is a HACK to get around limitations in node names from IDP
469 my $witness_map = {};
470 ## Variables to store answers as they come back
471 my $variants = [ ( undef ) x ( scalar keys %$index_groupkeys ) ];
88a6bac5 472 my $genealogical = 0;
335a62ef 473 foreach my $graphkey ( keys %$graph_problems ) {
474 my $graph = $graph_problems->{$graphkey}->{'object'};
5c44c598 475 my $groupings = [ values %{$graph_problems->{$graphkey}->{'groups'}} ];
c30a4fdb 476 my $req = _safe_wit_strings( $graph, $stemma->collation,
477 $groupings, $witness_map );
478 $req->{'command'} = 'findGroupings';
479 my $json = encode_json( $req );
335a62ef 480 # Send it off and get the result
5c44c598 481 # print STDERR "Sending request: " . to_json( $req ) . "\n";
482 my $resp = $ua->post( $SOLVER_URL, 'Content-Type' => 'application/json',
335a62ef 483 'Content' => $json );
484 my $answer;
335a62ef 485 if( $resp->is_success ) {
486 $answer = _desanitize_names( decode_json( $resp->content ), $witness_map );
335a62ef 487 } else {
488 # Fall back to the old method.
5c44c598 489 die "IDP solver returned " . $resp->status_line . " / " . $resp->content
490 . "; cannot run graph analysis";
335a62ef 491 }
5c44c598 492
493 ## If IDP worked, asked it the other two questions for this dataset.
494 my $more_eval = {};
495 foreach my $test ( qw/ findSources findClasses / ) {
496 $req->{'command'} = $test;
497 $json = encode_json( $req );
498 $resp = $ua->post( $SOLVER_URL, 'Content-Type' => 'application/json',
499 'Content' => $json );
500 if( $resp->is_success ) {
501 $more_eval->{$test} = _desanitize_names(
502 decode_json( $resp->content ), $witness_map );
503 } else {
504 warn "IDP solver for $test returned " . $resp->status_line .
505 " / " . $resp->content;
506 # TODO arrange fallback
507 }
508 }
509
335a62ef 510 ## The answer is the evaluated groupings, plus a boolean for whether
511 ## they were genealogical. Reconstruct our original groups.
512 foreach my $gidx ( 0 .. $#{$groupings} ) {
513 my( $calc_groups, $result ) = @{$answer->[$gidx]};
5c44c598 514 # Keep track of the total # of genealogical readings
515 $genealogical++ if $result;
516
517 my( $sources, $classes );
518 # Use the expanded groups from findSources if that got calculated.
519 if( exists( $more_eval->{'findSources'} ) ) {
520 ( $calc_groups, $sources ) = @{$more_eval->{'findSources'}->[$gidx]};
521 }
522 # Use the (same) expanded groups from findClasses if that got calculated
523 # and is relevant.
524 if( exists( $more_eval->{'findClasses'} ) && !$result ) {
525 ( $calc_groups, $classes ) = @{$more_eval->{'findClasses'}->[$gidx]};
526 }
527
528 # Prune the calculated groups, in case the IDP solver failed to.
529 if( $sources || $result ) {
530 my @pruned_groups;
531 my @pruned_roots;
532 foreach my $cg ( @$calc_groups ) {
533 my( $pg, $pr ) = _prune_group( $cg, $graph );
534 push( @pruned_groups, $pg );
535 push( @pruned_roots, @$pr );
6d25a3a0 536 }
5c44c598 537 $calc_groups = \@pruned_groups;
538 say STDERR "Pruned roots from @$sources to @pruned_roots"
539 unless wit_stringify( [ sort @$sources ] )
540 eq wit_stringify( [ sort @pruned_roots ] );
541 $sources = \@pruned_roots;
335a62ef 542 }
5c44c598 543
544 # Convert the source list into a lookup hash
545 my $roots = {};
546 map { $roots->{$_} = 1 } @$sources;
547 # Convert the class list into a lookup hash
548 if( $classes ) {
549 $classes = _invert_hash( $classes );
550 }
551
335a62ef 552 # Retrieve the key for the original group that went to the solver
553 my $input_group = wit_stringify( $groupings->[$gidx] );
5c44c598 554
555 # Make the variant hash for each location that had this particular
556 # grouping on this particular stemma situation
335a62ef 557 foreach my $oidx ( @{$group_indices->{$input_group}} ) {
558 my @readings = @{$index_groupkeys->{$oidx}};
559 my $vstruct = {
560 'genealogical' => $result,
561 'readings' => [],
562 };
563 foreach my $ridx ( 0 .. $#readings ) {
564 push( @{$vstruct->{'readings'}},
565 { 'readingid' => $readings[$ridx],
566 'group' => $calc_groups->[$ridx] } );
567 }
5c44c598 568 $vstruct->{'reading_roots'} = $roots if $roots;
569 $vstruct->{'reading_types'} = $classes if $classes;
335a62ef 570 $variants->[$oidx] = $vstruct;
6d25a3a0 571 }
572 }
88a6bac5 573 }
574
575 return { 'variants' => $variants,
576 'variant_count' => scalar @$variants,
577 'genealogical_count' => $genealogical };
578}
579
5c44c598 580sub _prepare_groups {
581 my( $stemma, @groups ) = @_;
582 my $aclabel = $stemma->collation->ac_label;
583
584 my $index_groupkeys = {}; # Save the order of readings
585 my $group_indices = {}; # Save the indices that have a given grouping
586 my $graph_problems = {}; # Save the groupings for the given graph
587
588 foreach my $idx ( 0..$#groups ) {
589 my $ghash = $groups[$idx];
590 my @grouping;
591 # Sort the groupings from big to little, and scan for a.c. witnesses
592 # that would need an extended graph.
593 my @acwits; # note which AC witnesses crop up at this rank
594 my $extant; # note which witnesses crop up at this rank full stop
595 my @idxkeys = sort { scalar @{$ghash->{$b}} <=> scalar @{$ghash->{$a}} }
596 keys %$ghash;
597 foreach my $rdg ( @idxkeys ) {
598 my @sg = sort @{$ghash->{$rdg}};
599 push( @acwits, grep { $_ =~ /\Q$aclabel\E$/ } @sg );
600 map { $extant->{$_} = 1 } @sg;
601 push( @grouping, \@sg );
602 }
603 # Save the reading order
604 $index_groupkeys->{$idx} = \@idxkeys;
605
606 # Now associate the distinct group with this index
607 my $gstr = wit_stringify( \@grouping );
608 push( @{$group_indices->{$gstr}}, $idx );
609
610 # Finally, add the group to the list to be calculated for this graph.
611 map { s/\Q$aclabel\E$// } @acwits;
612 my $graph;
613 ## TODO When we get rid of the safe_wit_strings HACK we should also
614 ## be able to save the graph here as a dotstring rather than as an
615 ## object, thus simplifying life enormously.
616 try {
617 $graph = $stemma->situation_graph( $extant, \@acwits );
618 } catch {
619 $DB::single = 1;
620 die "Unable to extend graph with @acwits";
621 }
622 my $graphkey = "$graph || " . wit_stringify( [ sort keys %$extant ] );
623 unless( exists $graph_problems->{$graphkey} ) {
624 $graph_problems->{$graphkey} = { 'object' => $graph, 'groups' => {} };
625 }
626 $graph_problems->{$graphkey}->{'groups'}->{wit_stringify( \@grouping )} = \@grouping;
627 }
628 say STDERR "Created " . scalar( keys %$graph_problems ). " distinct graph(s)";
629 return( $index_groupkeys, $group_indices, $graph_problems );
630}
631
b4cb2d60 632#### HACKERY to cope with IDP's limited idea of what a node name looks like ###
633
634sub _safe_wit_strings {
335a62ef 635 my( $graph, $c, $groupings, $witness_map ) = @_;
b4cb2d60 636 # Convert the graph to a safe representation and store the conversion.
5c44c598 637 my $safegraph = Graph->new();
335a62ef 638 foreach my $n ( $graph->vertices ) {
b4cb2d60 639 my $sn = _safe_witstr( $n );
335a62ef 640 if( exists $witness_map->{$sn} ) {
641 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
642 if $witness_map->{$sn} ne $n;
643 } else {
644 $witness_map->{$sn} = $n;
645 }
b4cb2d60 646 $safegraph->add_vertex( $sn );
647 $safegraph->set_vertex_attributes( $sn,
335a62ef 648 $graph->get_vertex_attributes( $n ) );
b4cb2d60 649 }
335a62ef 650 foreach my $e ( $graph->edges ) {
b4cb2d60 651 my @safe_e = ( _safe_witstr( $e->[0] ), _safe_witstr( $e->[1] ) );
652 $safegraph->add_edge( @safe_e );
653 }
b4cb2d60 654
655 # Now convert the witness groupings to a safe representation.
656 my $safe_groupings = [];
657 foreach my $grouping ( @$groupings ) {
658 my $safe_grouping = [];
659 foreach my $group ( @$grouping ) {
660 my $safe_group = [];
661 foreach my $n ( @$group ) {
662 my $sn = _safe_witstr( $n );
663 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
664 if exists $witness_map->{$sn} && $witness_map->{$sn} ne $n;
665 $witness_map->{$sn} = $n;
666 push( @$safe_group, $sn );
667 }
668 push( @$safe_grouping, $safe_group );
669 }
670 push( @$safe_groupings, $safe_grouping );
671 }
672
673 # Return it all in the struct we expect. We have stored the reductions
674 # in the $witness_map that we were passed.
5c44c598 675 return { 'graph' => Text::Tradition::Stemma::editable_graph(
676 $safegraph, { 'linesep' => ' ' } ),
335a62ef 677 'groupings' => $safe_groupings };
b4cb2d60 678}
679
680sub _safe_witstr {
681 my $witstr = shift;
682 $witstr =~ s/\s+/_/g;
683 $witstr =~ s/[^\w\d-]//g;
684 return $witstr;
685}
686
687sub _desanitize_names {
5c44c598 688 my( $element, $witness_map ) = @_;
b4cb2d60 689 my $result = [];
5c44c598 690 if( ref( $element ) eq 'ARRAY' ) {
691 foreach my $n ( @$element ) {
692 push( @$result, _desanitize_names( $n, $witness_map ) );
b4cb2d60 693 }
5c44c598 694 } elsif( ref( $element ) eq 'HASH' ) {
695 my $real_hash = {};
696 map { $real_hash->{$_} = _desanitize_names( $element->{$_}, $witness_map ) }
697 keys %$element;
698 $result = $real_hash;
699 } elsif( exists $witness_map->{$element} ) {
700 $result = $witness_map->{$element}
701 } else {
702 $result = $element;
b4cb2d60 703 }
704 return $result;
705}
706
5c44c598 707sub _invert_hash {
708 my( $hash ) = @_;
709 my $newhash;
710 foreach my $k ( keys %$hash ) {
711 if( ref( $hash->{$k} ) eq 'ARRAY' ) {
712 foreach my $v ( @{$hash->{$k}} ) {
713 $newhash->{$v} = $k;
714 }
715 } else {
716 $newhash->{$hash->{$k}} = $k;
717 }
718 }
719 return $newhash;
720}
721
b4cb2d60 722### END HACKERY ###
723
fae07016 724=head2 analyze_location ( $tradition, $graph, $location_hash )
7f52eac8 725
fae07016 726Given the tradition, its stemma graph, and the solution from the graph solver,
727work out the rest of the information we want. For each reading we need missing,
5c44c598 728conflict, reading_parents, independent_occurrence, followed, not_followed,
729and follow_unknown. Alters the location_hash in place.
7f52eac8 730
731=cut
732152b1 732
fae07016 733sub analyze_location {
638e2a95 734 my ( $tradition, $stemma, $variant_row, $lacunose ) = @_;
94654e27 735 my $c = $tradition->collation;
fae07016 736
737 # Make a hash of all known node memberships, and make the subgraphs.
738 my $contig = {};
739 my $reading_roots = {};
740 my $subgraph = {};
638e2a95 741 my $acstr = $c->ac_label;
742 my @acwits;
5c44c598 743
744 my $NO_IDP;
745 if( exists $variant_row->{'reading_roots'} ) {
746 $reading_roots = delete $variant_row->{'reading_roots'};
747 } else {
748 warn "No reading source information from IDP - proceed at your own risk";
749 $NO_IDP = 1;
750 }
751
752 # Note which witnesses positively belong to which group. This information
753 # comes ultimately from the IDP solver.
754 # Also make a note of the reading's roots.
fae07016 755 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
756 my $rid = $rdghash->{'readingid'};
5c44c598 757 my @roots;
638e2a95 758 foreach my $wit ( @{$rdghash->{'group'}} ) {
759 $contig->{$wit} = $rid;
760 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
761 push( @acwits, $1 );
762 }
5c44c598 763 if( exists $reading_roots->{$wit} && $reading_roots->{$wit} ) {
764 push( @roots, $wit );
765 }
638e2a95 766 }
5c44c598 767 $rdghash->{'independent_occurrence'} = \@roots;
94654e27 768 }
5c44c598 769
638e2a95 770 # Get the actual graph we should work with
d120c995 771 my $graph;
772 try {
5c44c598 773 # contig contains all extant wits and all hypothetical wits
774 # needed to make up the groups.
775 $graph = $stemma->situation_graph( $contig, \@acwits );
776 } catch ( Text::Tradition::Error $e ) {
777 die "Could not extend graph with given extant and a.c. witnesses: "
778 . $e->message;
d120c995 779 } catch {
780 die "Could not extend graph with a.c. witnesses @acwits";
781 }
638e2a95 782
5c44c598 783
fae07016 784 # Now that we have all the node group memberships, calculate followed/
bebec0e9 785 # non-followed/unknown values for each reading. Also figure out the
786 # reading's evident parent(s).
787 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
fae07016 788 my $rid = $rdghash->{'readingid'};
428bcf0b 789 my $rdg = $c->reading( $rid );
5c44c598 790 my @roots = @{$rdghash->{'independent_occurrence'}};
791 my @group = @{$rdghash->{'group'}};
fae07016 792
793 # Start figuring things out.
5c44c598 794 $rdghash->{'followed'} = scalar( @group ) - scalar( @roots );
bebec0e9 795 # Find the parent readings, if any, of this reading.
94654e27 796 my $rdgparents = {};
bebec0e9 797 foreach my $wit ( @roots ) {
5c44c598 798 # Look in the stemma graph to find this witness's extant or known-reading
f00cefe8 799 # immediate ancestor(s), and look up the reading that each ancestor olds.
800 my @check = $graph->predecessors( $wit );
801 while( @check ) {
802 my @next;
803 foreach my $wparent( @check ) {
fae07016 804 my $preading = $contig->{$wparent};
428bcf0b 805 if( $preading && $preading ne $rid ) {
94654e27 806 $rdgparents->{$preading} = 1;
f00cefe8 807 } else {
808 push( @next, $graph->predecessors( $wparent ) );
809 }
810 }
811 @check = @next;
812 }
bebec0e9 813 }
94654e27 814 foreach my $p ( keys %$rdgparents ) {
815 # Resolve the relationship of the parent to the reading, and
816 # save it in our hash.
817 my $pobj = $c->reading( $p );
94654e27 818 my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
354cc918 819 my $phash = { 'label' => $prep };
94654e27 820 if( $pobj ) {
821 my $rel = $c->get_relationship( $p, $rdghash->{readingid} );
822 if( $rel ) {
428bcf0b 823 _add_to_hash( $rel, $phash );
824 } elsif( $rdg ) {
825 # First check for a transposed relationship
826 if( $rdg->rank != $pobj->rank ) {
827 foreach my $ti ( $rdg->related_readings( 'transposition' ) ) {
828 next unless $ti->text eq $rdg->text;
829 $rel = $c->get_relationship( $ti, $pobj );
830 if( $rel ) {
831 _add_to_hash( $rel, $phash, 1 );
832 last;
833 }
834 }
835 unless( $rel ) {
836 foreach my $ti ( $pobj->related_readings( 'transposition' ) ) {
837 next unless $ti->text eq $pobj->text;
838 $rel = $c->get_relationship( $ti, $rdg );
839 if( $rel ) {
840 _add_to_hash( $rel, $phash, 1 );
841 last;
842 }
843 }
844 }
354cc918 845 }
428bcf0b 846 unless( $rel ) {
847 # and then check for sheer word similarity.
848 my $rtext = $rdg->text;
849 my $ptext = $pobj->text;
850 if( similar( $rtext, $ptext ) ) {
851 # say STDERR "Words $rtext and $ptext judged similar";
852 $phash->{relation} = { type => 'wordsimilar' };
853 }
94654e27 854 }
428bcf0b 855 } else {
856 $phash->{relation} = { type => 'deletion' };
94654e27 857 }
354cc918 858 # Get the attributes of the parent object while we are here
859 $phash->{'text'} = $pobj->text if $pobj;
860 $phash->{'is_nonsense'} = $pobj->is_nonsense;
861 $phash->{'is_ungrammatical'} = $pobj->grammar_invalid;
862 } elsif( $p eq '(omitted)' ) {
863 $phash->{relation} = { type => 'addition' };
7b7abf10 864 }
354cc918 865 # Save it
7b7abf10 866 $rdgparents->{$p} = $phash;
94654e27 867 }
868
869 $rdghash->{'reading_parents'} = $rdgparents;
bebec0e9 870
871 # Find the number of times this reading was altered, and the number of
872 # times we're not sure.
873 my( %nofollow, %unknownfollow );
5c44c598 874 foreach my $wit ( @{$rdghash->{'group'}} ) {
bebec0e9 875 foreach my $wchild ( $graph->successors( $wit ) ) {
5c44c598 876 if( $reading_roots->{$wchild} && $contig->{$wchild}
877 && $contig->{$wchild} ne $rid ) {
bebec0e9 878 # It definitely changed here.
879 $nofollow{$wchild} = 1;
880 } elsif( !($contig->{$wchild}) ) {
881 # The child is a hypothetical node not definitely in
882 # any group. Answer is unknown.
883 $unknownfollow{$wchild} = 1;
5c44c598 884 } # else it is either in our group, or it is a non-root node in a
885 # known group and therefore is presumed to have its reading from
886 # its group, not this link.
bebec0e9 887 }
888 }
889 $rdghash->{'not_followed'} = keys %nofollow;
890 $rdghash->{'follow_unknown'} = keys %unknownfollow;
fae07016 891
892 # Now say whether this reading represents a conflict.
893 unless( $variant_row->{'genealogical'} ) {
5c44c598 894 my @trueroots;
895 if( exists $variant_row->{'classes'} ) {
896 # We have tested for reversions. Use the information.
897 my @reversions;
898 foreach my $rdgroot ( @roots ) {
899 ## TODO This needs IDP to prune itself in order to be
900 ## correct.
901 if( $variant_row->{'classes'}->{$rdgroot} eq 'revert' ) {
902 push( @reversions, $rdgroot );
903 } else {
904 push( @trueroots, $rdgroot );
905 }
906 }
907 $rdghash->{'independent_occurrence'} = \@trueroots;
908 $rdghash->{'reversion'} = \@reversions if @reversions;
909 } else {
910 @trueroots = @roots;
911 }
912 $rdghash->{'conflict'} = @trueroots != 1;
fae07016 913 }
c4a4fb1b 914 }
d71100ed 915}
916
428bcf0b 917sub _add_to_hash {
918 my( $rel, $phash, $is_transposed ) = @_;
919 $phash->{relation} = { type => $rel->type };
920 $phash->{relation}->{transposed} = 1 if $is_transposed;
921 $phash->{relation}->{annotation} = $rel->annotation
922 if $rel->has_annotation;
923}
924
925=head2 similar( $word1, $word2 )
926
927Use Algorithm::Diff to get a sense of how close the words are to each other.
928This will hopefully handle substitutions a bit more nicely than Levenshtein.
929
930=cut
931
932#!/usr/bin/env perl
933
934sub similar {
935 my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_;
936 my @let1 = split( '', lc( $word1 ) );
937 my @let2 = split( '', lc( $word2 ) );
938 my $diff = Algorithm::Diff->new( \@let1, \@let2 );
939 my $mag = 0;
940 while( $diff->Next ) {
941 if( $diff->Same ) {
942 # Take off points for longer strings
943 my $cs = $diff->Range(1) - 2;
944 $cs = 0 if $cs < 0;
945 $mag -= $cs;
946 } elsif( !$diff->Items(1) ) {
947 $mag += $diff->Range(2);
948 } elsif( !$diff->Items(2) ) {
949 $mag += $diff->Range(1);
950 } else {
951 # Split the difference for substitutions
952 my $c1 = $diff->Range(1) || 1;
953 my $c2 = $diff->Range(2) || 1;
954 my $cd = ( $c1 + $c2 ) / 2;
955 $mag += $cd;
956 }
957 }
958 return ( $mag <= length( $word1 ) / 2 );
959}
960
6d25a3a0 961sub _prune_group {
5c44c598 962 my( $group, $graph ) = @_;
963 my $relevant = {};
964 # Record the existence of the vertices in the group
965 map { $relevant->{$_} = 1 } @$group;
6d25a3a0 966 # Make our subgraph
5c44c598 967 my $subgraph = $graph->deep_copy;
968 map { $subgraph->delete_vertex( $_ ) unless $relevant->{$_} }
6d25a3a0 969 $subgraph->vertices;
6d25a3a0 970 # Now prune and return the remaining vertices.
5c44c598 971 _prune_subtree( $subgraph );
972 # Return the list of vertices and the list of roots.
973 my $pruned_group = [ sort $subgraph->vertices ];
974 my $pruned_roots = [ $subgraph->predecessorless_vertices ];
975 return( $pruned_group, $pruned_roots );
6d25a3a0 976}
977
7f52eac8 978sub _prune_subtree {
5c44c598 979 my( $tree ) = @_;
94654e27 980
981 # Delete lacunose witnesses that have no successors
5c44c598 982 my @orphan_hypotheticals;
983 my $ctr = 0;
984 do {
985 die "Infinite loop on leaves" if $ctr > 100;
986 @orphan_hypotheticals =
987 grep { $tree->get_vertex_attribute( $_, 'class' ) eq 'hypothetical' }
988 $tree->successorless_vertices;
989 $tree->delete_vertices( @orphan_hypotheticals );
990 $ctr++;
991 } while( @orphan_hypotheticals );
94654e27 992
993 # Delete lacunose roots that have a single successor
994 my @redundant_root;
995 $ctr = 0;
996 do {
5c44c598 997 die "Infinite loop on roots" if $ctr > 100;
998 @redundant_root =
999 grep { $tree->get_vertex_attribute( $_, 'class' ) eq 'hypothetical'
1000 && $tree->successors( $_ ) == 1 }
1001 $tree->predecessorless_vertices;
94654e27 1002 $tree->delete_vertices( @redundant_root );
1003 $ctr++;
1004 } while( @redundant_root );
1005}
1006
5be0cdeb 1007sub _useful_variant {
1008 my( $group_readings, $graph, $acstr ) = @_;
1009
1010 # TODO Decide what to do with AC witnesses
1011
1012 # Sort by group size and return
1013 my $is_useful = 0;
1014 my( @readings, @groups ); # The sorted groups for our answer.
1015 foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
1016 keys %$group_readings ) {
1017 push( @readings, $rdg );
1018 push( @groups, $group_readings->{$rdg} );
1019 if( @{$group_readings->{$rdg}} > 1 ) {
1020 $is_useful++;
1021 } else {
1022 my( $wit ) = @{$group_readings->{$rdg}};
1023 $wit =~ s/^(.*)\Q$acstr\E$/$1/;
1024 $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
1025 }
1026 }
1027 if( $is_useful > 1 ) {
1028 return( \@readings, \@groups );
1029 } else {
1030 return( [], [] );
1031 }
1032}
1033
7f52eac8 1034=head2 wit_stringify( $groups )
1035
1036Takes an array of witness groupings and produces a string like
1037['A','B'] / ['C','D','E'] / ['F']
d71100ed 1038
7f52eac8 1039=cut
d71100ed 1040
1041sub wit_stringify {
1042 my $groups = shift;
1043 my @gst;
1044 # If we were passed an array of witnesses instead of an array of
1045 # groupings, then "group" the witnesses first.
1046 unless( ref( $groups->[0] ) ) {
1047 my $mkgrp = [ $groups ];
1048 $groups = $mkgrp;
1049 }
1050 foreach my $g ( @$groups ) {
1051 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
1052 }
1053 return join( ' / ', @gst );
1054}
7f52eac8 1055
bebec0e9 1056sub _symmdiff {
1057 my( $lista, $listb ) = @_;
7f52eac8 1058 my %union;
1059 my %scalars;
1060 map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
1061 map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
bebec0e9 1062 my @set = grep { $union{$_} == 1 } keys %union;
7f52eac8 1063 return map { $scalars{$_} } @set;
1064}
1065
10661;
1067
1068=head1 LICENSE
1069
1070This package is free software and is provided "as is" without express
1071or implied warranty. You can redistribute it and/or modify it under
1072the same terms as Perl itself.
1073
1074=head1 AUTHOR
1075
1076Tara L Andrews E<lt>aurum@cpan.orgE<gt>