do our indirect exclusions too
[scpubgit/stemmatology.git] / analysis / script / exclude.pl
CommitLineData
31573367 1#!/usr/bin/env perl
2
3use feature 'say';
4use lib 'lib';
5use strict;
6use warnings;
7use Getopt::Long;
8use Text::CSV_XS;
9use Text::Tradition::Analysis qw/ run_analysis /;
10use Text::Tradition::Directory;
11use TryCatch;
12
13binmode STDOUT, ':utf8';
14binmode STDERR, ':utf8';
15
16my( $dsn, $dbuser, $dbpass );
17my $filename = 'analysis.csv';
18GetOptions(
19 'dsn=s' => \$dsn,
20 'u|user=s' => \$dbuser,
21 'p|pass=s' => \$dbpass,
22 'f|file=s' => \$filename
23);
24
25my %dbopts = ( dsn => $dsn );
26if( $dbuser || $dbpass ) {
27 $dbopts{extra_args} = { user => $dbuser, password => $dbpass }
28}
29
30my $dir = Text::Tradition::Directory->new( %dbopts );
31my $scope = $dir->new_scope();
32my $lookfor = shift @ARGV || '';
33my %collapse;
34if( @ARGV ) {
35 say "Merging relationship types @ARGV";
36 map { $collapse{$_} = 1 } @ARGV;
37}
38
39## Set up the relationship types we will exclude in turn. False means "run
40## analysis with basic set of exclusions", i.e. orth/spelling/punct, and exclude
41## the variants in question later. True means "explicitly exclude this type too
42## at analysis time."
43my %relation_types = (
44 sameword => undef,
45 grammatical => 1,
46 lexical => 1,
47 uncertain => 1,
48 other => 1,
49 addition => undef,
50 deletion => undef
51);
52
53# Set up the things we want to calculate for each text
54my @calcs = qw/ total genealogical excoincidental reverted exgenealogical coincidental /;
55my @resultfields = ( 'text_name' );
56foreach my $rt ( keys %relation_types ) {
57 foreach my $cc ( @calcs ) {
58 push( @resultfields, sprintf( "%s_ex_%s", $cc, $rt ) );
59 }
60}
61
62my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } );
63open my $fh, ">:encoding(UTF-8)", $filename or die "$filename: $!";
64if( $csv->combine( @resultfields ) ) {
65 say $fh $csv->string;
66} else {
67 say "combine() failed on argument: " . $csv->error_input;
68}
69
70foreach my $tinfo( $dir->traditionlist ) {
71 next if $tinfo->{'name'} eq 'xxxxx';
72 next if $tinfo->{'name'} =~ /158/;
73 next if $tinfo->{'name'} =~ /Heinrichi part/;
74 if( $lookfor ) {
75 next unless $tinfo->{'id'} eq $lookfor
76 || $tinfo->{'name'} =~ /$lookfor/;
77 }
78 my $tradition = $dir->lookup( $tinfo->{'id'} );
79 next unless $tradition->stemma_count;
80 say "Analyzing tradition " . $tradition->name;
81 ## HACK
82 my $MAXRANK;
83 if( $tradition->name =~ /Chronicle/ ) {
84 $MAXRANK = $tradition->collation->reading('L1545')->rank;
85 }
86 my %datahash;
87 # Initialize everything with zeroes
88 map { $datahash{$_} = 0 } @resultfields;
89 # Put in the real text ID and name
90 $datahash{text_id} = $tinfo->{'id'};
91 $datahash{text_name} = $tradition->name;
92
93 # Run the analysis for each row in @rows
94 my $vanilla; # Store the run with no extra exclusions
95 my $result;
aee39255 96 foreach my $rtype ( keys %relation_types ) {
97 say "...calculating on exclusion of $rtype";
98 if( $relation_types{$rtype} ) {
99 $result = run_exclude( $tradition, $rtype );
31573367 100 } elsif( !$vanilla ) {
101 $result = run_exclude( $tradition );
102 $vanilla = $result;
103 } else {
104 $result = $vanilla;
105 }
106
107 # Get the totals by location and by variant as we go.
108 my $totalvariant = 0;
109 my $singleton = 0;
110 my $genvariant = 0;
111 my $conflictvariant = 0;
112 my $revertvariant = 0;
113 my $msgd; # for the HACK
114 my @unknown;
115 foreach my $loc ( @{$result->{variants}} ) {
116 # A transition is the relationship type between parent and child.
117 # Find each genealogical transition
118 # Find each non-genealogical transition
119 if( exists $loc->{unsolved} ) {
120 # Not solved; remove it from the total.
121 say "Skipping unsolved location at " . $loc->{id};
122 next;
123 } elsif( $MAXRANK && $loc->{id} > $MAXRANK ) {
124 # HACK until Chronicle tagging is done
125 say "Skipping ranks above $MAXRANK"
126 unless $msgd;
127 $msgd = 1;
128 next;
129 }
130 foreach my $rdghash( @{$loc->{readings}} ) {
131 # Weed out singletons
132 $totalvariant++;
133 my @roots = @{$rdghash->{independent_occurrence}};
134 if( @roots == 1 && !$rdghash->{'followed'} && !$rdghash->{'not_followed'}
135 && !$rdghash->{'follow_unknown'} ) {
136 $singleton++;
137 next;
138 }
139 my $type;
140 if( $rdghash->{'is_conflict'} ) {
141 $type = 'conflict';
31573367 142 } elsif( $rdghash->{'is_reverted'} ) {
143 $type = 'reverted';
31573367 144 } elsif( @roots == 1 ) {
145 $type = 'genealogical';
31573367 146 } else {
147 warn 'Reading ' . $rdghash->{readingid} . ' neither conflict, genealogical, nor reverted. What?';
148 $type = 'ERROR';
149 }
aee39255 150 # Get the relationship type stats for reading parents.
151 my $rdg = $tradition->collation->reading( $rdghash->{readingid} );
152
153 my $phash = $type eq 'reverted'
154 ? $rdghash->{'reversion_parents'} : $rdghash->{'source_parents'};
155 my $rel = _get_reading_relations( $rdghash->{'readingid'}, $loc->{'id'},
156 $rdg, $type, $phash, \%datahash, \@unknown );
157 # If this is one of our exclusions, take it out of the total.
158 if( $rel eq $rtype ) {
159 $totalvariant--;
160 next;
161 }
162 # Otherwise add the variant type to our count.
163 $conflictvariant++ if $type eq 'conflict';
164 $revertvariant++ if $type eq 'reverted';
165 $genvariant++ if $type eq 'genealogical';
31573367 166 }
167 }
168
169 # Add in the sums for the whole location
aee39255 170 $datahash{"total_$rtype"} = $totalvariant - $singleton;
171 $datahash{"genealogical_ex_$rtype"} = $genvariant;
172 $datahash{"reverted_ex_$rtype"} = $revertvariant;
173 $datahash{"coincidental_ex_$rtype"} = $conflictvariant;
174 $datahash{"excoincidental_ex_$rtype"} = $genvariant + $revertvariant;
31573367 175 $datahash{"exgenealogical_ex_type"} = $conflictvariant + $revertvariant;
176 }
177
178 # Write them out to CSV.
179 my @csvalues = map { $datahash{$_} } @resultfields;
180 if( $csv->combine( @csvalues ) ) {
181 say $fh $csv->string;
182 } else {
183 say "combine() failed on argument: " . $csv->error_input;
184 }
185}
186
187close $fh;
188
aee39255 189sub _get_reading_relations {
190 my( $rid, $rank, $robj, $type, $parenthash, $datahash, $unknown ) = @_;
191 my @kp = keys ( %$parenthash );
192 unless( @kp ) {
193 _increment_typekey( $datahash, $type, 'source' );
194 return;
195 }
196 if( @kp > 1 ) {
197 $datahash->{multiparent} = @kp - 1;
198 }
199 foreach my $p ( @kp ) {
200 my $pdata = $parenthash->{$p};
201 my $relation;
202 if( $pdata->{relation} ) {
203 $relation = $pdata->{relation}->{transposed}
204 ? 'transposition' : $pdata->{relation}->{type};
205 } else {
206 $relation = 'unknown';
207 if( !$robj ) {
208 say "Unknown relation on missing reading object $rid at rank $rank";
209 } elsif( !$pdata ) {
210 say "Unknown relation on missing parent object for $rid at rank $rank";
211 } else {
212 push( @$unknown, [ $pdata->{label}, $robj->id, $robj->text, $type ] );
213 }
214 }
215 return $relation;
216 }
217}
218
31573367 219sub run_exclude {
220 my( $tradition, $type ) = @_;
221 my $merge = [ qw/ orthographic spelling punctuation / ];
222 if( $type && $relation_types{$type} ) {
223 push( @$merge, $type );
224 }
225
226 my $result;
227 try {
228 $result = run_analysis( $tradition, exclude_type1 => 1,
229 merge_types => $merge );
230 } catch {
231 say "Analysis run failed on tradition " . $tradition->name . ": @_";
232 return;
233 }
234 return $result;
235}