work in reversion info; trust IDP pruning
[scpubgit/stemmatology.git] / lib / Text / Tradition / Analysis.pm
CommitLineData
d71100ed 1package Text::Tradition::Analysis;
2
3use strict;
4use warnings;
428bcf0b 5use Algorithm::Diff; # for word similarity measure
e4386ba9 6use Benchmark;
88a6bac5 7use Encode qw/ encode_utf8 /;
d1348d38 8use Exporter 'import';
b4cb2d60 9use Graph;
88a6bac5 10use JSON qw/ encode_json decode_json /;
11use LWP::UserAgent;
d71100ed 12use Text::Tradition;
13use Text::Tradition::Stemma;
d120c995 14use TryCatch;
d71100ed 15
d1348d38 16use vars qw/ @EXPORT_OK /;
a2cf85dd 17@EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /;
d1348d38 18
5c44c598 19my $SOLVER_URL = 'http://byzantini.st/cgi-bin/graphcalc.cgi';
20
21
7f52eac8 22=head1 NAME
23
24Text::Tradition::Analysis - functions for stemma analysis of a tradition
25
26=head1 SYNOPSIS
27
28 use Text::Tradition;
29 use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
30 my $t = Text::Tradition->new(
31 'name' => 'this is a text',
32 'input' => 'TEI',
33 'file' => '/path/to/tei_parallel_seg_file.xml' );
34 $t->add_stemma( 'dotfile' => $stemmafile );
35
36 my $variant_data = run_analysis( $tradition );
37 # Recalculate rank $n treating all orthographic variants as equivalent
38 my $reanalyze = analyze_variant_location( $tradition, $n, 0, 'orthographic' );
39
40=head1 DESCRIPTION
41
42Text::Tradition is a library for representation and analysis of collated
43texts, particularly medieval ones. The Collation is the central feature of
44a Tradition, where the text, its sequence of readings, and its relationships
45between readings are actually kept.
46
47=head1 SUBROUTINES
48
88a6bac5 49=head2 run_analysis( $tradition, %opts )
7f52eac8 50
88a6bac5 51Runs the analysis described in analyze_variant_location on every location in the
52collation of the given tradition, with the given options. These include:
7f52eac8 53
88a6bac5 54=over 4
55
56=item * stemma_id - Specify which of the tradition's stemmata to use. Default
57is 0 (i.e. the first).
58
59=item * ranks - Specify a list of location ranks to analyze; exclude the rest.
60
61=item * merge_types - Specify a list of relationship types, where related readings
62should be treated as identical for the purposes of analysis.
63
ffa22d6f 64=item * exclude_type1 - Exclude those ranks whose groupings have only type-1 variants.
65
88a6bac5 66=back
7f52eac8 67
68=begin testing
69
70use Text::Tradition;
71use Text::Tradition::Analysis qw/ run_analysis analyze_variant_location /;
72
73my $datafile = 't/data/florilegium_tei_ps.xml';
74my $tradition = Text::Tradition->new( 'input' => 'TEI',
75 'name' => 'test0',
76 'file' => $datafile );
77my $s = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
78is( ref( $s ), 'Text::Tradition::Stemma', "Added stemma to tradition" );
79
f00cefe8 80my %expected_genealogical = (
a44aaf2a 81 1 => 0,
f00cefe8 82 2 => 1,
a44aaf2a 83 3 => 0,
84 5 => 0,
85 7 => 0,
86 8 => 0,
87 10 => 0,
f00cefe8 88 13 => 1,
a44aaf2a 89 33 => 0,
90 34 => 0,
91 37 => 0,
92 60 => 0,
f00cefe8 93 81 => 1,
a44aaf2a 94 84 => 0,
95 87 => 0,
96 101 => 0,
97 102 => 0,
f00cefe8 98 122 => 1,
a44aaf2a 99 157 => 0,
f00cefe8 100 166 => 1,
101 169 => 1,
a44aaf2a 102 200 => 0,
f00cefe8 103 216 => 1,
104 217 => 1,
105 219 => 1,
106 241 => 1,
107 242 => 1,
108 243 => 1,
109);
110
7f52eac8 111my $data = run_analysis( $tradition );
7234b01d 112my $c = $tradition->collation;
f00cefe8 113foreach my $row ( @{$data->{'variants'}} ) {
a44aaf2a 114 # Account for rows that used to be "not useful"
115 unless( exists $expected_genealogical{$row->{'id'}} ) {
116 $expected_genealogical{$row->{'id'}} = 1;
117 }
18f48b82 118 my $gen_bool = $row->{'genealogical'} ? 1 : 0;
119 is( $gen_bool, $expected_genealogical{$row->{'id'}},
f00cefe8 120 "Got correct genealogical flag for row " . $row->{'id'} );
7234b01d 121 # Check that we have the right row with the right groups
122 my $rank = $row->{'id'};
123 foreach my $rdghash ( @{$row->{'readings'}} ) {
124 # Skip 'readings' that aren't really
125 next unless $c->reading( $rdghash->{'readingid'} );
126 # Check the rank
127 is( $c->reading( $rdghash->{'readingid'} )->rank, $rank,
128 "Got correct reading rank" );
129 # Check the witnesses
130 my @realwits = sort $c->reading_witnesses( $rdghash->{'readingid'} );
131 my @sgrp = sort @{$rdghash->{'group'}};
132 is_deeply( \@sgrp, \@realwits, "Reading analyzed with correct groups" );
133 }
f00cefe8 134}
a44aaf2a 135is( $data->{'variant_count'}, 58, "Got right total variant number" );
b4cb2d60 136# TODO Make something meaningful of conflict count, maybe test other bits
7f52eac8 137
138=end testing
139
140=cut
141
d71100ed 142sub run_analysis {
88a6bac5 143 my( $tradition, %opts ) = @_;
f00cefe8 144 my $c = $tradition->collation;
88a6bac5 145
146 my $stemma_id = $opts{'stemma_id'} || 0;
1d73ecad 147 my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
148 my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
88a6bac5 149
150 # Get the stemma
151 my $stemma = $tradition->stemma( $stemma_id );
b4cb2d60 152
4ce27d42 153 # Figure out which witnesses we are working with - that is, the ones that
154 # appear both in the stemma and in the tradition. All others are 'lacunose'
155 # for our purposes.
88a6bac5 156 my @lacunose = $stemma->hypotheticals;
fae07016 157 my @tradition_wits = map { $_->sigil } $tradition->witnesses;
fae07016 158 push( @lacunose, _symmdiff( [ $stemma->witnesses ], \@tradition_wits ) );
88a6bac5 159
160 # Find and mark 'common' ranks for exclusion, unless they were
161 # explicitly specified.
162 unless( @ranks ) {
163 my %common_rank;
a44aaf2a 164 foreach my $rdg ( $c->common_readings ) {
88a6bac5 165 $common_rank{$rdg->rank} = 1;
166 }
167 @ranks = grep { !$common_rank{$_} } ( 1 .. $c->end->rank-1 );
d71100ed 168 }
7f52eac8 169
88a6bac5 170 # Group the variants to send to the solver
171 my @groups;
f629cb3b 172 my @use_ranks;
a44aaf2a 173 my %lacunae;
94654e27 174 my $moved = {};
88a6bac5 175 foreach my $rank ( @ranks ) {
a44aaf2a 176 my $missing = [ @lacunose ];
94654e27 177 my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, \@collapse );
178 # Filter out any empty rankgroups
179 # (e.g. from the later rank for a transposition)
180 next unless keys %$rankgroup;
ffa22d6f 181 if( $opts{'exclude_type1'} ) {
182 # Check to see whether this is a "useful" group.
183 my( $rdgs, $grps ) = _useful_variant( $rankgroup,
184 $stemma->graph, $c->ac_label );
185 next unless @$rdgs;
186 }
f629cb3b 187 push( @use_ranks, $rank );
ffa22d6f 188 push( @groups, $rankgroup );
a44aaf2a 189 $lacunae{$rank} = $missing;
d71100ed 190 }
4ce27d42 191 # Run the solver
e59b8faa 192 my $answer = solve_variants( $stemma, @groups );
fae07016 193
88a6bac5 194 # Do further analysis on the answer
a44aaf2a 195 my $conflict_count = 0;
7234b01d 196 my $aclabel = $c->ac_label;
f629cb3b 197 foreach my $idx ( 0 .. $#use_ranks ) {
88a6bac5 198 my $location = $answer->{'variants'}->[$idx];
199 # Add the rank back in
94654e27 200 my $rank = $use_ranks[$idx];
201 $location->{'id'} = $rank;
7234b01d 202 # Note what our lacunae are
f629cb3b 203 my %lmiss;
7234b01d 204 map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}};
7234b01d 205 $location->{'missing'} = [ keys %lmiss ];
206
88a6bac5 207 # Run the extra analysis we need.
428bcf0b 208 ## TODO We run through all the variants in this call, so
209 ## why not add the reading data there instead of here below?
638e2a95 210 analyze_location( $tradition, $stemma, $location, \%lmiss );
7234b01d 211
638e2a95 212 my @layerwits;
7234b01d 213 # Do the final post-analysis tidying up of the data.
a44aaf2a 214 foreach my $rdghash ( @{$location->{'readings'}} ) {
215 $conflict_count++
216 if exists $rdghash->{'conflict'} && $rdghash->{'conflict'};
94654e27 217 # Add the reading text back in, setting display value as needed
a44aaf2a 218 my $rdg = $c->reading( $rdghash->{'readingid'} );
94654e27 219 if( $rdg ) {
220 $rdghash->{'text'} = $rdg->text .
221 ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' );
7b7abf10 222 $rdghash->{'is_ungrammatical'} = $rdg->grammar_invalid;
223 $rdghash->{'is_nonsense'} = $rdg->is_nonsense;
94654e27 224 }
f629cb3b 225 # Remove lacunose witnesses from this reading's list now that the
7234b01d 226 # analysis is done
f629cb3b 227 my @realgroup;
7234b01d 228 map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}};
f629cb3b 229 $rdghash->{'group'} = \@realgroup;
638e2a95 230 # Note any layered witnesses that appear in this group
231 foreach( @realgroup ) {
232 if( $_ =~ /^(.*)\Q$aclabel\E$/ ) {
233 push( @layerwits, $1 );
234 }
235 }
a44aaf2a 236 }
638e2a95 237 $location->{'layerwits'} = \@layerwits if @layerwits;
88a6bac5 238 }
a44aaf2a 239 $answer->{'conflict_count'} = $conflict_count;
f00cefe8 240
88a6bac5 241 return $answer;
d71100ed 242}
243
7f52eac8 244=head2 group_variants( $tradition, $rank, $lacunose, @merge_relationship_types )
245
246Groups the variants at the given $rank of the collation, treating any
247relationships in @merge_relationship_types as equivalent. $lacunose should
248be a reference to an array, to which the sigla of lacunose witnesses at this
94654e27 249rank will be appended; $transposed should be a reference to a hash, wherein
250the identities of transposed readings and their relatives will be stored.
7f52eac8 251
ffa22d6f 252Returns a hash $group_readings where $rdg is attested by the witnesses listed
253in $group_readings->{$rdg}.
7f52eac8 254
255=cut
256
257# Return group_readings, groups, lacunose
d1348d38 258sub group_variants {
94654e27 259 my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_;
7f52eac8 260 my $c = $tradition->collation;
335a62ef 261 my $aclabel = $c->ac_label;
d120c995 262 my $table = $c->alignment_table;
7f52eac8 263 # Get the alignment table readings
264 my %readings_at_rank;
d120c995 265 my %is_lacunose; # lookup table for witnesses not in stemma
266 map { $is_lacunose{$_} = 1; $is_lacunose{$_.$aclabel} = 1 } @$lacunose;
94654e27 267 my @check_for_gaps;
268 my %moved_wits;
d120c995 269 my $has_transposition;
270 foreach my $tablewit ( @{$table->{'alignment'}} ) {
7f52eac8 271 my $rdg = $tablewit->{'tokens'}->[$rank-1];
fae07016 272 my $wit = $tablewit->{'witness'};
ffa22d6f 273 # Exclude the witness if it is "lacunose" which if we got here
274 # means "not in the stemma".
275 next if $is_lacunose{$wit};
94654e27 276 # Note if the witness is actually in a lacuna
7f52eac8 277 if( $rdg && $rdg->{'t'}->is_lacuna ) {
335a62ef 278 _add_to_witlist( $wit, $lacunose, $aclabel );
94654e27 279 # Otherwise the witness either has a positive reading...
7f52eac8 280 } elsif( $rdg ) {
94654e27 281 # If the reading has been counted elsewhere as a transposition, ignore it.
282 if( $transposed->{$rdg->{'t'}->id} ) {
d120c995 283 # TODO Does this cope with three-way transpositions?
94654e27 284 map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}};
285 next;
286 }
287 # Otherwise, record it...
288 $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'};
289 # ...and grab any transpositions, and their relations.
290 my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings();
291 foreach my $trdg ( @transp ) {
d120c995 292 next if exists $readings_at_rank{$trdg->id};
293 $has_transposition = 1;
294 my @affected_wits = _table_witnesses(
295 $table, $trdg, \%is_lacunose, $aclabel );
296 next unless @affected_wits;
297 map { $moved_wits{$_} = 1 } @affected_wits;
298 $transposed->{$trdg->id} =
299 [ _table_witnesses( $table, $rdg->{'t'}, \%is_lacunose, $aclabel ) ];
94654e27 300 $readings_at_rank{$trdg->id} = $trdg;
301 }
302 # ...or it is empty, ergo a gap.
7f52eac8 303 } else {
d120c995 304 _add_to_witlist( $wit, \@check_for_gaps, $aclabel );
7f52eac8 305 }
306 }
94654e27 307 my @gap_wits;
308 map { _add_to_witlist( $_, \@gap_wits, $aclabel )
309 unless $moved_wits{$_} } @check_for_gaps;
7f52eac8 310 # Group the readings, collapsing groups by relationship if needed
d120c995 311 my $grouped_readings = {};
4ce27d42 312 foreach my $rdg ( values %readings_at_rank ) {
7f52eac8 313 # Skip readings that have been collapsed into others.
d120c995 314 next if exists $grouped_readings->{$rdg->id}
315 && $grouped_readings->{$rdg->id} eq 'COLLAPSE';
4ce27d42 316 # Get the witness list, including from readings collapsed into this one.
d120c995 317 my @wits = _table_witnesses( $table, $rdg, \%is_lacunose, $aclabel );
318 if( $collapse && @$collapse ) {
7f52eac8 319 my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
320 foreach my $other ( $rdg->related_readings( $filter ) ) {
d120c995 321 my @otherwits = _table_witnesses(
322 $table, $other, \%is_lacunose, $aclabel );
fae07016 323 push( @wits, @otherwits );
d120c995 324 $grouped_readings->{$other->id} = 'COLLAPSE';
d1348d38 325 }
326 }
d120c995 327 $grouped_readings->{$rdg->id} = \@wits;
7f52eac8 328 }
d120c995 329 $grouped_readings->{'(omitted)'} = \@gap_wits if @gap_wits;
7f52eac8 330 # Get rid of our collapsed readings
d120c995 331 map { delete $grouped_readings->{$_} if $grouped_readings->{$_} eq 'COLLAPSE' }
332 keys %$grouped_readings
7f52eac8 333 if $collapse;
d120c995 334
335 # If something was transposed, check the groups for doubled-up readings
336 if( $has_transposition ) {
62a39b8f 337 # print STDERR "Group for rank $rank:\n";
338 # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" }
339 # keys %$grouped_readings;
d120c995 340 _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings );
341 }
7f52eac8 342
4ce27d42 343 # Return the result
d120c995 344 return $grouped_readings;
345}
346
347# Helper function to query the alignment table for all witnesses (a.c. included)
348# that have a given reading at its rank.
349sub _table_witnesses {
350 my( $table, $trdg, $lacunose, $aclabel ) = @_;
351 my $tableidx = $trdg->rank - 1;
352 my @has_reading;
353 foreach my $row ( @{$table->{'alignment'}} ) {
354 my $wit = $row->{'witness'};
355 next if $lacunose->{$wit};
356 my $rdg = $row->{'tokens'}->[$tableidx];
357 next unless exists $rdg->{'t'} && defined $rdg->{'t'};
358 _add_to_witlist( $wit, \@has_reading, $aclabel )
359 if $rdg->{'t'}->id eq $trdg->id;
360 }
361 return @has_reading;
d1348d38 362}
363
335a62ef 364# Helper function to ensure that X and X a.c. never appear in the same list.
365sub _add_to_witlist {
366 my( $wit, $list, $acstr ) = @_;
367 my %inlist;
368 my $idx = 0;
369 map { $inlist{$_} = $idx++ } @$list;
370 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
371 my $acwit = $1;
372 unless( exists $inlist{$acwit} ) {
373 push( @$list, $acwit.$acstr );
374 }
375 } else {
376 if( exists( $inlist{$wit.$acstr} ) ) {
377 # Replace the a.c. version with the main witness
378 my $i = $inlist{$wit.$acstr};
379 $list->[$i] = $wit;
380 } else {
381 push( @$list, $wit );
382 }
383 }
384}
385
d120c995 386sub _check_transposed_consistency {
387 my( $c, $rank, $transposed, $groupings ) = @_;
388 my %seen_wits;
389 my %thisrank;
390 # Note which readings are actually at this rank, and which witnesses
391 # belong to which reading.
392 foreach my $rdg ( keys %$groupings ) {
393 my $rdgobj = $c->reading( $rdg );
394 # Count '(omitted)' as a reading at this rank
395 $thisrank{$rdg} = 1 if !$rdgobj || $rdgobj->rank == $rank;
396 map { push( @{$seen_wits{$_}}, $rdg ) } @{$groupings->{$rdg}};
397 }
398 # Our work is done if we have no witness belonging to more than one
399 # reading.
400 my @doubled = grep { scalar @{$seen_wits{$_}} > 1 } keys %seen_wits;
401 return unless @doubled;
402 # If we have a symmetric related transposition, drop the non-rank readings.
403 if( @doubled == scalar keys %seen_wits ) {
404 foreach my $rdg ( keys %$groupings ) {
405 if( !$thisrank{$rdg} ) {
406 my $groupstr = wit_stringify( $groupings->{$rdg} );
407 my ( $matched ) = grep { $groupstr eq wit_stringify( $groupings->{$_} ) }
408 keys %thisrank;
409 delete $groupings->{$rdg};
410 # If we found a group match, assume there is a symmetry happening.
411 # TODO think more about this
62a39b8f 412 # print STDERR "*** Deleting symmetric reading $rdg\n";
d120c995 413 unless( $matched ) {
414 delete $transposed->{$rdg};
415 warn "Found problem in evident symmetry with reading $rdg";
416 }
417 }
418 }
419 # Otherwise 'unhook' the transposed reading(s) that have duplicates.
420 } else {
421 foreach my $dup ( @doubled ) {
422 foreach my $rdg ( @{$seen_wits{$dup}} ) {
423 next if $thisrank{$rdg};
424 next unless exists $groupings->{$rdg};
62a39b8f 425 # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n";
d120c995 426 delete $groupings->{$rdg};
427 delete $transposed->{$rdg};
428 }
429 }
430 # and put any now-orphaned readings into an 'omitted' reading.
431 foreach my $wit ( keys %seen_wits ) {
432 unless( grep { exists $groupings->{$_} } @{$seen_wits{$wit}} ) {
433 $groupings->{'(omitted)'} = [] unless exists $groupings->{'(omitted)'};
434 _add_to_witlist( $wit, $groupings->{'(omitted)'}, $c->ac_label );
435 }
436 }
437 }
438}
439
88a6bac5 440=head2 solve_variants( $graph, @groups )
441
442Sends the set of groups to the external graph solver service and returns
443a cleaned-up answer, adding the rank IDs back where they belong.
444
445The JSON has the form
446 { "graph": [ stemmagraph DOT string without newlines ],
447 "groupings": [ array of arrays of groups, one per rank ] }
448
449The answer has the form
450 { "variants" => [ array of variant location structures ],
451 "variant_count" => total,
452 "conflict_count" => number of conflicts detected,
453 "genealogical_count" => number of solutions found }
454
455=cut
456
457sub solve_variants {
e59b8faa 458 my( $stemma, @groups ) = @_;
335a62ef 459
460 # Filter the groups down to distinct groups, and work out what graph
461 # should be used in the calculation of each group. We want to send each
462 # distinct problem to the solver only once.
463 # We need a whole bunch of lookup tables for this.
5c44c598 464 my( $index_groupkeys, $group_indices, $graph_problems ) = _prepare_groups( @_ );
335a62ef 465
335a62ef 466 ## For each distinct graph, send its groups to the solver.
88a6bac5 467 my $ua = LWP::UserAgent->new();
335a62ef 468 ## Witness map is a HACK to get around limitations in node names from IDP
469 my $witness_map = {};
470 ## Variables to store answers as they come back
471 my $variants = [ ( undef ) x ( scalar keys %$index_groupkeys ) ];
88a6bac5 472 my $genealogical = 0;
335a62ef 473 foreach my $graphkey ( keys %$graph_problems ) {
474 my $graph = $graph_problems->{$graphkey}->{'object'};
5c44c598 475 my $groupings = [ values %{$graph_problems->{$graphkey}->{'groups'}} ];
c30a4fdb 476 my $req = _safe_wit_strings( $graph, $stemma->collation,
477 $groupings, $witness_map );
478 $req->{'command'} = 'findGroupings';
479 my $json = encode_json( $req );
335a62ef 480 # Send it off and get the result
5c44c598 481 # print STDERR "Sending request: " . to_json( $req ) . "\n";
482 my $resp = $ua->post( $SOLVER_URL, 'Content-Type' => 'application/json',
335a62ef 483 'Content' => $json );
484 my $answer;
335a62ef 485 if( $resp->is_success ) {
486 $answer = _desanitize_names( decode_json( $resp->content ), $witness_map );
335a62ef 487 } else {
488 # Fall back to the old method.
5c44c598 489 die "IDP solver returned " . $resp->status_line . " / " . $resp->content
490 . "; cannot run graph analysis";
335a62ef 491 }
5c44c598 492
493 ## If IDP worked, asked it the other two questions for this dataset.
494 my $more_eval = {};
495 foreach my $test ( qw/ findSources findClasses / ) {
496 $req->{'command'} = $test;
497 $json = encode_json( $req );
498 $resp = $ua->post( $SOLVER_URL, 'Content-Type' => 'application/json',
499 'Content' => $json );
500 if( $resp->is_success ) {
501 $more_eval->{$test} = _desanitize_names(
502 decode_json( $resp->content ), $witness_map );
503 } else {
504 warn "IDP solver for $test returned " . $resp->status_line .
505 " / " . $resp->content;
506 # TODO arrange fallback
507 }
508 }
509
335a62ef 510 ## The answer is the evaluated groupings, plus a boolean for whether
511 ## they were genealogical. Reconstruct our original groups.
512 foreach my $gidx ( 0 .. $#{$groupings} ) {
513 my( $calc_groups, $result ) = @{$answer->[$gidx]};
5c44c598 514 # Keep track of the total # of genealogical readings
515 $genealogical++ if $result;
516
517 my( $sources, $classes );
518 # Use the expanded groups from findSources if that got calculated.
519 if( exists( $more_eval->{'findSources'} ) ) {
520 ( $calc_groups, $sources ) = @{$more_eval->{'findSources'}->[$gidx]};
521 }
522 # Use the (same) expanded groups from findClasses if that got calculated
523 # and is relevant.
524 if( exists( $more_eval->{'findClasses'} ) && !$result ) {
c6b011b3 525 my $throwaway_groups;
526 ( $throwaway_groups, $classes ) = @{$more_eval->{'findClasses'}->[$gidx]};
335a62ef 527 }
5c44c598 528
529 # Convert the source list into a lookup hash
530 my $roots = {};
531 map { $roots->{$_} = 1 } @$sources;
532 # Convert the class list into a lookup hash
533 if( $classes ) {
534 $classes = _invert_hash( $classes );
535 }
536
335a62ef 537 # Retrieve the key for the original group that went to the solver
538 my $input_group = wit_stringify( $groupings->[$gidx] );
5c44c598 539
540 # Make the variant hash for each location that had this particular
541 # grouping on this particular stemma situation
335a62ef 542 foreach my $oidx ( @{$group_indices->{$input_group}} ) {
543 my @readings = @{$index_groupkeys->{$oidx}};
544 my $vstruct = {
545 'genealogical' => $result,
546 'readings' => [],
547 };
548 foreach my $ridx ( 0 .. $#readings ) {
549 push( @{$vstruct->{'readings'}},
550 { 'readingid' => $readings[$ridx],
551 'group' => $calc_groups->[$ridx] } );
552 }
5c44c598 553 $vstruct->{'reading_roots'} = $roots if $roots;
554 $vstruct->{'reading_types'} = $classes if $classes;
335a62ef 555 $variants->[$oidx] = $vstruct;
6d25a3a0 556 }
557 }
88a6bac5 558 }
559
560 return { 'variants' => $variants,
561 'variant_count' => scalar @$variants,
562 'genealogical_count' => $genealogical };
563}
564
5c44c598 565sub _prepare_groups {
566 my( $stemma, @groups ) = @_;
567 my $aclabel = $stemma->collation->ac_label;
568
569 my $index_groupkeys = {}; # Save the order of readings
570 my $group_indices = {}; # Save the indices that have a given grouping
571 my $graph_problems = {}; # Save the groupings for the given graph
572
573 foreach my $idx ( 0..$#groups ) {
574 my $ghash = $groups[$idx];
575 my @grouping;
576 # Sort the groupings from big to little, and scan for a.c. witnesses
577 # that would need an extended graph.
578 my @acwits; # note which AC witnesses crop up at this rank
579 my $extant; # note which witnesses crop up at this rank full stop
580 my @idxkeys = sort { scalar @{$ghash->{$b}} <=> scalar @{$ghash->{$a}} }
581 keys %$ghash;
582 foreach my $rdg ( @idxkeys ) {
583 my @sg = sort @{$ghash->{$rdg}};
584 push( @acwits, grep { $_ =~ /\Q$aclabel\E$/ } @sg );
585 map { $extant->{$_} = 1 } @sg;
586 push( @grouping, \@sg );
587 }
588 # Save the reading order
589 $index_groupkeys->{$idx} = \@idxkeys;
590
591 # Now associate the distinct group with this index
592 my $gstr = wit_stringify( \@grouping );
593 push( @{$group_indices->{$gstr}}, $idx );
594
595 # Finally, add the group to the list to be calculated for this graph.
596 map { s/\Q$aclabel\E$// } @acwits;
597 my $graph;
598 ## TODO When we get rid of the safe_wit_strings HACK we should also
599 ## be able to save the graph here as a dotstring rather than as an
600 ## object, thus simplifying life enormously.
601 try {
602 $graph = $stemma->situation_graph( $extant, \@acwits );
603 } catch {
5c44c598 604 die "Unable to extend graph with @acwits";
605 }
606 my $graphkey = "$graph || " . wit_stringify( [ sort keys %$extant ] );
607 unless( exists $graph_problems->{$graphkey} ) {
608 $graph_problems->{$graphkey} = { 'object' => $graph, 'groups' => {} };
609 }
610 $graph_problems->{$graphkey}->{'groups'}->{wit_stringify( \@grouping )} = \@grouping;
611 }
612 say STDERR "Created " . scalar( keys %$graph_problems ). " distinct graph(s)";
613 return( $index_groupkeys, $group_indices, $graph_problems );
614}
615
b4cb2d60 616#### HACKERY to cope with IDP's limited idea of what a node name looks like ###
617
618sub _safe_wit_strings {
335a62ef 619 my( $graph, $c, $groupings, $witness_map ) = @_;
b4cb2d60 620 # Convert the graph to a safe representation and store the conversion.
5c44c598 621 my $safegraph = Graph->new();
335a62ef 622 foreach my $n ( $graph->vertices ) {
b4cb2d60 623 my $sn = _safe_witstr( $n );
335a62ef 624 if( exists $witness_map->{$sn} ) {
625 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
626 if $witness_map->{$sn} ne $n;
627 } else {
628 $witness_map->{$sn} = $n;
629 }
b4cb2d60 630 $safegraph->add_vertex( $sn );
631 $safegraph->set_vertex_attributes( $sn,
335a62ef 632 $graph->get_vertex_attributes( $n ) );
b4cb2d60 633 }
335a62ef 634 foreach my $e ( $graph->edges ) {
b4cb2d60 635 my @safe_e = ( _safe_witstr( $e->[0] ), _safe_witstr( $e->[1] ) );
636 $safegraph->add_edge( @safe_e );
637 }
b4cb2d60 638
639 # Now convert the witness groupings to a safe representation.
640 my $safe_groupings = [];
641 foreach my $grouping ( @$groupings ) {
642 my $safe_grouping = [];
643 foreach my $group ( @$grouping ) {
644 my $safe_group = [];
645 foreach my $n ( @$group ) {
646 my $sn = _safe_witstr( $n );
647 warn "Ambiguous stringification $sn for $n and " . $witness_map->{$sn}
648 if exists $witness_map->{$sn} && $witness_map->{$sn} ne $n;
649 $witness_map->{$sn} = $n;
650 push( @$safe_group, $sn );
651 }
652 push( @$safe_grouping, $safe_group );
653 }
654 push( @$safe_groupings, $safe_grouping );
655 }
656
657 # Return it all in the struct we expect. We have stored the reductions
658 # in the $witness_map that we were passed.
5c44c598 659 return { 'graph' => Text::Tradition::Stemma::editable_graph(
660 $safegraph, { 'linesep' => ' ' } ),
335a62ef 661 'groupings' => $safe_groupings };
b4cb2d60 662}
663
664sub _safe_witstr {
665 my $witstr = shift;
666 $witstr =~ s/\s+/_/g;
667 $witstr =~ s/[^\w\d-]//g;
668 return $witstr;
669}
670
671sub _desanitize_names {
5c44c598 672 my( $element, $witness_map ) = @_;
b4cb2d60 673 my $result = [];
5c44c598 674 if( ref( $element ) eq 'ARRAY' ) {
675 foreach my $n ( @$element ) {
676 push( @$result, _desanitize_names( $n, $witness_map ) );
b4cb2d60 677 }
5c44c598 678 } elsif( ref( $element ) eq 'HASH' ) {
679 my $real_hash = {};
680 map { $real_hash->{$_} = _desanitize_names( $element->{$_}, $witness_map ) }
681 keys %$element;
682 $result = $real_hash;
683 } elsif( exists $witness_map->{$element} ) {
684 $result = $witness_map->{$element}
685 } else {
686 $result = $element;
b4cb2d60 687 }
688 return $result;
689}
690
5c44c598 691sub _invert_hash {
692 my( $hash ) = @_;
693 my $newhash;
694 foreach my $k ( keys %$hash ) {
695 if( ref( $hash->{$k} ) eq 'ARRAY' ) {
696 foreach my $v ( @{$hash->{$k}} ) {
697 $newhash->{$v} = $k;
698 }
699 } else {
700 $newhash->{$hash->{$k}} = $k;
701 }
702 }
703 return $newhash;
704}
705
b4cb2d60 706### END HACKERY ###
707
fae07016 708=head2 analyze_location ( $tradition, $graph, $location_hash )
7f52eac8 709
fae07016 710Given the tradition, its stemma graph, and the solution from the graph solver,
711work out the rest of the information we want. For each reading we need missing,
5c44c598 712conflict, reading_parents, independent_occurrence, followed, not_followed,
713and follow_unknown. Alters the location_hash in place.
7f52eac8 714
715=cut
732152b1 716
fae07016 717sub analyze_location {
638e2a95 718 my ( $tradition, $stemma, $variant_row, $lacunose ) = @_;
94654e27 719 my $c = $tradition->collation;
fae07016 720
721 # Make a hash of all known node memberships, and make the subgraphs.
722 my $contig = {};
723 my $reading_roots = {};
724 my $subgraph = {};
638e2a95 725 my $acstr = $c->ac_label;
726 my @acwits;
5c44c598 727
728 my $NO_IDP;
729 if( exists $variant_row->{'reading_roots'} ) {
730 $reading_roots = delete $variant_row->{'reading_roots'};
731 } else {
732 warn "No reading source information from IDP - proceed at your own risk";
733 $NO_IDP = 1;
734 }
735
736 # Note which witnesses positively belong to which group. This information
737 # comes ultimately from the IDP solver.
738 # Also make a note of the reading's roots.
fae07016 739 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
740 my $rid = $rdghash->{'readingid'};
5c44c598 741 my @roots;
638e2a95 742 foreach my $wit ( @{$rdghash->{'group'}} ) {
743 $contig->{$wit} = $rid;
744 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
745 push( @acwits, $1 );
746 }
5c44c598 747 if( exists $reading_roots->{$wit} && $reading_roots->{$wit} ) {
748 push( @roots, $wit );
749 }
638e2a95 750 }
5c44c598 751 $rdghash->{'independent_occurrence'} = \@roots;
94654e27 752 }
5c44c598 753
638e2a95 754 # Get the actual graph we should work with
d120c995 755 my $graph;
756 try {
5c44c598 757 # contig contains all extant wits and all hypothetical wits
758 # needed to make up the groups.
759 $graph = $stemma->situation_graph( $contig, \@acwits );
760 } catch ( Text::Tradition::Error $e ) {
761 die "Could not extend graph with given extant and a.c. witnesses: "
762 . $e->message;
d120c995 763 } catch {
764 die "Could not extend graph with a.c. witnesses @acwits";
765 }
638e2a95 766
5c44c598 767
fae07016 768 # Now that we have all the node group memberships, calculate followed/
bebec0e9 769 # non-followed/unknown values for each reading. Also figure out the
770 # reading's evident parent(s).
771 foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
fae07016 772 my $rid = $rdghash->{'readingid'};
428bcf0b 773 my $rdg = $c->reading( $rid );
5c44c598 774 my @roots = @{$rdghash->{'independent_occurrence'}};
775 my @group = @{$rdghash->{'group'}};
fae07016 776
777 # Start figuring things out.
5c44c598 778 $rdghash->{'followed'} = scalar( @group ) - scalar( @roots );
bebec0e9 779 # Find the parent readings, if any, of this reading.
94654e27 780 my $rdgparents = {};
bebec0e9 781 foreach my $wit ( @roots ) {
5c44c598 782 # Look in the stemma graph to find this witness's extant or known-reading
f00cefe8 783 # immediate ancestor(s), and look up the reading that each ancestor olds.
784 my @check = $graph->predecessors( $wit );
785 while( @check ) {
786 my @next;
787 foreach my $wparent( @check ) {
fae07016 788 my $preading = $contig->{$wparent};
428bcf0b 789 if( $preading && $preading ne $rid ) {
94654e27 790 $rdgparents->{$preading} = 1;
f00cefe8 791 } else {
792 push( @next, $graph->predecessors( $wparent ) );
793 }
794 }
795 @check = @next;
796 }
bebec0e9 797 }
94654e27 798 foreach my $p ( keys %$rdgparents ) {
799 # Resolve the relationship of the parent to the reading, and
800 # save it in our hash.
801 my $pobj = $c->reading( $p );
94654e27 802 my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p;
354cc918 803 my $phash = { 'label' => $prep };
94654e27 804 if( $pobj ) {
805 my $rel = $c->get_relationship( $p, $rdghash->{readingid} );
806 if( $rel ) {
428bcf0b 807 _add_to_hash( $rel, $phash );
808 } elsif( $rdg ) {
809 # First check for a transposed relationship
810 if( $rdg->rank != $pobj->rank ) {
811 foreach my $ti ( $rdg->related_readings( 'transposition' ) ) {
812 next unless $ti->text eq $rdg->text;
813 $rel = $c->get_relationship( $ti, $pobj );
814 if( $rel ) {
815 _add_to_hash( $rel, $phash, 1 );
816 last;
817 }
818 }
819 unless( $rel ) {
820 foreach my $ti ( $pobj->related_readings( 'transposition' ) ) {
821 next unless $ti->text eq $pobj->text;
822 $rel = $c->get_relationship( $ti, $rdg );
823 if( $rel ) {
824 _add_to_hash( $rel, $phash, 1 );
825 last;
826 }
827 }
828 }
354cc918 829 }
428bcf0b 830 unless( $rel ) {
831 # and then check for sheer word similarity.
832 my $rtext = $rdg->text;
833 my $ptext = $pobj->text;
834 if( similar( $rtext, $ptext ) ) {
835 # say STDERR "Words $rtext and $ptext judged similar";
836 $phash->{relation} = { type => 'wordsimilar' };
837 }
94654e27 838 }
428bcf0b 839 } else {
840 $phash->{relation} = { type => 'deletion' };
94654e27 841 }
354cc918 842 # Get the attributes of the parent object while we are here
843 $phash->{'text'} = $pobj->text if $pobj;
844 $phash->{'is_nonsense'} = $pobj->is_nonsense;
845 $phash->{'is_ungrammatical'} = $pobj->grammar_invalid;
846 } elsif( $p eq '(omitted)' ) {
847 $phash->{relation} = { type => 'addition' };
7b7abf10 848 }
354cc918 849 # Save it
7b7abf10 850 $rdgparents->{$p} = $phash;
94654e27 851 }
852
853 $rdghash->{'reading_parents'} = $rdgparents;
bebec0e9 854
855 # Find the number of times this reading was altered, and the number of
856 # times we're not sure.
857 my( %nofollow, %unknownfollow );
5c44c598 858 foreach my $wit ( @{$rdghash->{'group'}} ) {
bebec0e9 859 foreach my $wchild ( $graph->successors( $wit ) ) {
5c44c598 860 if( $reading_roots->{$wchild} && $contig->{$wchild}
861 && $contig->{$wchild} ne $rid ) {
bebec0e9 862 # It definitely changed here.
863 $nofollow{$wchild} = 1;
864 } elsif( !($contig->{$wchild}) ) {
865 # The child is a hypothetical node not definitely in
866 # any group. Answer is unknown.
867 $unknownfollow{$wchild} = 1;
5c44c598 868 } # else it is either in our group, or it is a non-root node in a
869 # known group and therefore is presumed to have its reading from
870 # its group, not this link.
bebec0e9 871 }
872 }
873 $rdghash->{'not_followed'} = keys %nofollow;
874 $rdghash->{'follow_unknown'} = keys %unknownfollow;
fae07016 875
876 # Now say whether this reading represents a conflict.
877 unless( $variant_row->{'genealogical'} ) {
5c44c598 878 my @trueroots;
c6b011b3 879 if( exists $variant_row->{'reading_types'} ) {
880 my $classinfo = delete $variant_row->{'reading_types'};
5c44c598 881 # We have tested for reversions. Use the information.
882 my @reversions;
883 foreach my $rdgroot ( @roots ) {
c6b011b3 884 if( $classinfo->{$rdgroot} eq 'revert' ) {
5c44c598 885 push( @reversions, $rdgroot );
886 } else {
887 push( @trueroots, $rdgroot );
888 }
889 }
890 $rdghash->{'independent_occurrence'} = \@trueroots;
891 $rdghash->{'reversion'} = \@reversions if @reversions;
892 } else {
893 @trueroots = @roots;
894 }
895 $rdghash->{'conflict'} = @trueroots != 1;
fae07016 896 }
c4a4fb1b 897 }
d71100ed 898}
899
428bcf0b 900sub _add_to_hash {
901 my( $rel, $phash, $is_transposed ) = @_;
902 $phash->{relation} = { type => $rel->type };
903 $phash->{relation}->{transposed} = 1 if $is_transposed;
904 $phash->{relation}->{annotation} = $rel->annotation
905 if $rel->has_annotation;
906}
907
908=head2 similar( $word1, $word2 )
909
910Use Algorithm::Diff to get a sense of how close the words are to each other.
911This will hopefully handle substitutions a bit more nicely than Levenshtein.
912
913=cut
914
915#!/usr/bin/env perl
916
917sub similar {
918 my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_;
919 my @let1 = split( '', lc( $word1 ) );
920 my @let2 = split( '', lc( $word2 ) );
921 my $diff = Algorithm::Diff->new( \@let1, \@let2 );
922 my $mag = 0;
923 while( $diff->Next ) {
924 if( $diff->Same ) {
925 # Take off points for longer strings
926 my $cs = $diff->Range(1) - 2;
927 $cs = 0 if $cs < 0;
928 $mag -= $cs;
929 } elsif( !$diff->Items(1) ) {
930 $mag += $diff->Range(2);
931 } elsif( !$diff->Items(2) ) {
932 $mag += $diff->Range(1);
933 } else {
934 # Split the difference for substitutions
935 my $c1 = $diff->Range(1) || 1;
936 my $c2 = $diff->Range(2) || 1;
937 my $cd = ( $c1 + $c2 ) / 2;
938 $mag += $cd;
939 }
940 }
941 return ( $mag <= length( $word1 ) / 2 );
942}
943
6d25a3a0 944sub _prune_group {
5c44c598 945 my( $group, $graph ) = @_;
946 my $relevant = {};
947 # Record the existence of the vertices in the group
948 map { $relevant->{$_} = 1 } @$group;
6d25a3a0 949 # Make our subgraph
5c44c598 950 my $subgraph = $graph->deep_copy;
951 map { $subgraph->delete_vertex( $_ ) unless $relevant->{$_} }
6d25a3a0 952 $subgraph->vertices;
6d25a3a0 953 # Now prune and return the remaining vertices.
5c44c598 954 _prune_subtree( $subgraph );
955 # Return the list of vertices and the list of roots.
956 my $pruned_group = [ sort $subgraph->vertices ];
957 my $pruned_roots = [ $subgraph->predecessorless_vertices ];
958 return( $pruned_group, $pruned_roots );
6d25a3a0 959}
960
7f52eac8 961sub _prune_subtree {
5c44c598 962 my( $tree ) = @_;
94654e27 963
964 # Delete lacunose witnesses that have no successors
5c44c598 965 my @orphan_hypotheticals;
966 my $ctr = 0;
967 do {
968 die "Infinite loop on leaves" if $ctr > 100;
969 @orphan_hypotheticals =
970 grep { $tree->get_vertex_attribute( $_, 'class' ) eq 'hypothetical' }
971 $tree->successorless_vertices;
972 $tree->delete_vertices( @orphan_hypotheticals );
973 $ctr++;
974 } while( @orphan_hypotheticals );
94654e27 975
976 # Delete lacunose roots that have a single successor
977 my @redundant_root;
978 $ctr = 0;
979 do {
5c44c598 980 die "Infinite loop on roots" if $ctr > 100;
981 @redundant_root =
982 grep { $tree->get_vertex_attribute( $_, 'class' ) eq 'hypothetical'
983 && $tree->successors( $_ ) == 1 }
984 $tree->predecessorless_vertices;
94654e27 985 $tree->delete_vertices( @redundant_root );
986 $ctr++;
987 } while( @redundant_root );
988}
989
5be0cdeb 990sub _useful_variant {
991 my( $group_readings, $graph, $acstr ) = @_;
992
993 # TODO Decide what to do with AC witnesses
994
995 # Sort by group size and return
996 my $is_useful = 0;
997 my( @readings, @groups ); # The sorted groups for our answer.
998 foreach my $rdg ( sort { @{$group_readings->{$b}} <=> @{$group_readings->{$a}} }
999 keys %$group_readings ) {
1000 push( @readings, $rdg );
1001 push( @groups, $group_readings->{$rdg} );
1002 if( @{$group_readings->{$rdg}} > 1 ) {
1003 $is_useful++;
1004 } else {
1005 my( $wit ) = @{$group_readings->{$rdg}};
1006 $wit =~ s/^(.*)\Q$acstr\E$/$1/;
1007 $is_useful++ unless( $graph->is_sink_vertex( $wit ) );
1008 }
1009 }
1010 if( $is_useful > 1 ) {
1011 return( \@readings, \@groups );
1012 } else {
1013 return( [], [] );
1014 }
1015}
1016
7f52eac8 1017=head2 wit_stringify( $groups )
1018
1019Takes an array of witness groupings and produces a string like
1020['A','B'] / ['C','D','E'] / ['F']
d71100ed 1021
7f52eac8 1022=cut
d71100ed 1023
1024sub wit_stringify {
1025 my $groups = shift;
1026 my @gst;
1027 # If we were passed an array of witnesses instead of an array of
1028 # groupings, then "group" the witnesses first.
1029 unless( ref( $groups->[0] ) ) {
1030 my $mkgrp = [ $groups ];
1031 $groups = $mkgrp;
1032 }
1033 foreach my $g ( @$groups ) {
1034 push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
1035 }
1036 return join( ' / ', @gst );
1037}
7f52eac8 1038
bebec0e9 1039sub _symmdiff {
1040 my( $lista, $listb ) = @_;
7f52eac8 1041 my %union;
1042 my %scalars;
1043 map { $union{$_} = 1; $scalars{$_} = $_ } @$lista;
1044 map { $union{$_} += 1; $scalars{$_} = $_ } @$listb;
bebec0e9 1045 my @set = grep { $union{$_} == 1 } keys %union;
7f52eac8 1046 return map { $scalars{$_} } @set;
1047}
1048
10491;
1050
1051=head1 LICENSE
1052
1053This package is free software and is provided "as is" without express
1054or implied warranty. You can redistribute it and/or modify it under
1055the same terms as Perl itself.
1056
1057=head1 AUTHOR
1058
1059Tara L Andrews E<lt>aurum@cpan.orgE<gt>