do our indirect exclusions too
[scpubgit/stemmatology.git] / analysis / script / analyze.pl
CommitLineData
a23b3715 1#!/usr/bin/env perl
2
3use feature 'say';
4use lib 'lib';
5use strict;
6use warnings;
5c2fdc10 7use Getopt::Long;
a23b3715 8use Text::CSV_XS;
9use Text::Tradition::Analysis qw/ run_analysis /;
10use Text::Tradition::Directory;
5c2fdc10 11use TryCatch;
a23b3715 12
13binmode STDOUT, ':utf8';
14binmode STDERR, ':utf8';
15
5c2fdc10 16my( $dsn, $dbuser, $dbpass );
3bce8f64 17my $filename = 'analysis.csv';
5c2fdc10 18GetOptions(
19 'dsn=s' => \$dsn,
20 'u|user=s' => \$dbuser,
21 'p|pass=s' => \$dbpass,
3bce8f64 22 'f|file=s' => \$filename
5c2fdc10 23);
24
25my %dbopts = ( dsn => $dsn );
26if( $dbuser || $dbpass ) {
27 $dbopts{extra_args} = { user => $dbuser, password => $dbpass }
28}
29
30my $dir = Text::Tradition::Directory->new( %dbopts );
31my $scope = $dir->new_scope();
a23b3715 32my $lookfor = shift @ARGV || '';
ee26c4d9 33my %collapse;
5c2fdc10 34if( @ARGV ) {
35 say "Merging relationship types @ARGV";
36 map { $collapse{$_} = 1 } @ARGV;
37}
a23b3715 38
3ec8b047 39my @relation_types = grep { !$collapse{$_} }
ee26c4d9 40 qw/ orthographic spelling grammatical lexical transposition repetition
3bce8f64 41 uncertain other addition deletion wordsimilar unknown source /;
a23b3715 42
43my @resultfields = qw/
3bce8f64 44 text_name loc_total loc_singleton multiparent loc_totalvariant loc_genealogical loc_genvariant
6a5f4aa7 45 loc_conflict loc_conflictvariant loc_reverted loc_revertvariant /;
ee26c4d9 46map { push( @resultfields, "gen_$_", "con_$_", "rev_$_" ) } @relation_types;
a23b3715 47
48my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } );
3bce8f64 49open my $fh, ">:encoding(UTF-8)", $filename or die "$filename: $!";
a23b3715 50if( $csv->combine( @resultfields ) ) {
51 say $fh $csv->string;
52} else {
53 say "combine() failed on argument: " . $csv->error_input;
54}
55
56foreach my $tinfo( $dir->traditionlist ) {
57 next if $tinfo->{'name'} eq 'xxxxx';
5c2fdc10 58 next if $tinfo->{'name'} =~ /158/;
59 next if $tinfo->{'name'} =~ /Heinrichi part/;
60 if( $lookfor ) {
61 next unless $tinfo->{'id'} eq $lookfor
62 || $tinfo->{'name'} =~ /$lookfor/;
63 }
a23b3715 64 my $tradition = $dir->lookup( $tinfo->{'id'} );
5c2fdc10 65 next unless $tradition->stemma_count;
a23b3715 66 say "Analyzing tradition " . $tradition->name;
5c2fdc10 67 ## HACK
68 my $MAXRANK;
69 if( $tradition->name =~ /Chronicle/ ) {
70 $MAXRANK = $tradition->collation->reading('L1545')->rank;
71 }
a23b3715 72 my %datahash;
73 # Initialize everything with zeroes
74 map { $datahash{$_} = 0 } @resultfields;
75 # Put in the real text ID and name
76 $datahash{text_id} = $tinfo->{'id'};
77 $datahash{text_name} = $tradition->name;
78
79 # Run the analysis for each row in @rows
ead7032e 80 my %opts = (
81 exclude_type1 => 1,
ee26c4d9 82 merge_types => [ 'punctuation' ] );
83 if( keys %collapse ) {
84 push( @{$opts{merge_types}}, keys %collapse );
a23b3715 85 }
86
5c2fdc10 87 my $result;
6a5f4aa7 88 try {
5c2fdc10 89 $result = run_analysis( $tradition, %opts );
6a5f4aa7 90 } catch {
91 say "Analysis run failed on tradition " . $tradition->name . ": @_";
92 next;
93 }
a23b3715 94 $datahash{loc_total} = $result->{variant_count};
3bce8f64 95 $datahash{multiparent} = 0;
6a5f4aa7 96 #$datahash{loc_genealogical} = $result->{genealogical_count};
97 #$datahash{loc_conflictvariant} = $result->{conflict_count};
98 #$datahash{loc_revertvariant} = $result->{reversion_count};
99 # Get the totals by location and by variant as we go.
a23b3715 100 my $totalvariant = 0;
6a5f4aa7 101 my $singleton = 0;
102 my $genloc = 0;
a23b3715 103 my $genvariant = 0;
3ec8b047 104 my $conflictloc = 0;
6a5f4aa7 105 my $conflictvariant = 0;
3ec8b047 106 my $revertloc = 0;
6a5f4aa7 107 my $revertvariant = 0;
108 my $msgd; # for the HACK
a23b3715 109 my @unknown;
110 foreach my $loc ( @{$result->{variants}} ) {
111 # A transition is the relationship type between parent and child.
112 # Find each genealogical transition and get the relationship type (if any)
113 # Find each non-genealogical transition and get the relationship type (if any)
3ec8b047 114 my( $loc_conflict, $loc_reversion );
ee26c4d9 115 if( exists $loc->{unsolved} ) {
6a5f4aa7 116 # Not solved; remove it from the total.
117 say "Skipping unsolved location at " . $loc->{id};
118 $datahash{loc_total}--;
ee26c4d9 119 next;
5c2fdc10 120 } elsif( $MAXRANK && $loc->{id} > $MAXRANK ) {
6a5f4aa7 121 # HACK until Chronicle tagging is done
122 say "Skipping ranks above $MAXRANK"
5c2fdc10 123 unless $msgd;
124 $msgd = 1;
125 next;
ee26c4d9 126 }
a23b3715 127 foreach my $rdghash( @{$loc->{readings}} ) {
128 # Weed out singletons
6a5f4aa7 129 $totalvariant++;
a23b3715 130 my @roots = @{$rdghash->{independent_occurrence}};
6a5f4aa7 131 if( @roots == 1 && !$rdghash->{'followed'} && !$rdghash->{'not_followed'}
132 && !$rdghash->{'follow_unknown'} ) {
133 $singleton++;
134 next;
135 }
a23b3715 136 my $rdg = $tradition->collation->reading( $rdghash->{readingid} );
3ec8b047 137 my $type;
138 if( $rdghash->{'is_conflict'} ) {
139 $type = 'conflict';
140 $loc_conflict = 1;
6a5f4aa7 141 $conflictvariant++;
3ec8b047 142 } elsif( $rdghash->{'is_reverted'} ) {
143 $type = 'reverted';
144 $loc_reversion = 1;
6a5f4aa7 145 $revertvariant++;
3ec8b047 146 } elsif( @roots == 1 ) {
147 $type = 'genealogical';
148 $genvariant++;
149 } else {
150 warn "Reading $rdg neither conflict, genealogical, nor reverted. What?";
151 $type = 'ERROR';
152 }
153 my $typekey = substr( $type, 0, 3 ) . '_';
154
155 # Add relation stats for reading parents. If the reading is reverted,
156 # treat it as genealogical for the parent.
3bce8f64 157 my $phash = $type eq 'reverted'
158 ? $rdghash->{'reversion_parents'} : $rdghash->{'source_parents'};
3ec8b047 159 _add_reading_relations( $rdghash->{'readingid'}, $loc->{'id'}, $rdg,
3bce8f64 160 $type, $phash, \%datahash, \@unknown );
3ec8b047 161 }
162 if( $loc_conflict ) {
163 $conflictloc++;
164 } elsif( $loc_reversion ) {
165 $revertloc++;
6a5f4aa7 166 } else {
167 $genloc++;
a23b3715 168 }
169 }
170
171 # Add in the sums for the whole location
6a5f4aa7 172 $datahash{loc_totalvariant} = $totalvariant;
173 $datahash{loc_genealogical} = $genloc;
174 $datahash{loc_genvariant} = $genvariant;
175 $datahash{loc_conflict} = $conflictloc;
176 $datahash{loc_conflictvariant} = $conflictvariant;
177 $datahash{loc_reverted} = $revertloc;
178 $datahash{loc_revertvariant} = $revertvariant;
179 $datahash{loc_singleton} = $singleton;
180 $datahash{percent_genealogical} = $datahash{loc_genealogical} / $datahash{loc_total};
181 $datahash{percent_genvariant} = $genvariant / $totalvariant;
182 $datahash{percent_genorrevert} = ( $genvariant + $datahash{loc_revertvariant} ) / $totalvariant;
a23b3715 183
184 # Write them out to CSV.
185 my @csvalues = map { $datahash{$_} } @resultfields;
186 if( $csv->combine( @csvalues ) ) {
187 say $fh $csv->string;
188 } else {
189 say "combine() failed on argument: " . $csv->error_input;
190 }
5c2fdc10 191 map { say STDERR sprintf( "Unknown transition %s -> %s (%s) :%s", @$_ ) } @unknown;
a23b3715 192}
193
194close $fh;
3ec8b047 195
196sub _add_reading_relations {
197 my( $rid, $rank, $robj, $type, $parenthash, $datahash, $unknown ) = @_;
3bce8f64 198 my @kp = keys ( %$parenthash );
199 unless( @kp ) {
200 _increment_typekey( $datahash, $type, 'source' );
201 return;
202 }
203 if( @kp > 1 ) {
204 $datahash->{multiparent} = @kp - 1;
205 }
206 foreach my $p ( @kp ) {
3ec8b047 207 my $pdata = $parenthash->{$p};
208 my $relation;
209 if( $pdata->{relation} ) {
3bce8f64 210 $relation = $pdata->{relation}->{transposed}
211 ? 'transposition' : $pdata->{relation}->{type};
3ec8b047 212 } else {
213 $relation = 'unknown';
214 if( !$robj ) {
215 say "Unknown relation on missing reading object $rid at rank $rank";
216 } elsif( !$pdata ) {
217 say "Unknown relation on missing parent object for $rid at rank $rank";
218 } else {
219 push( @$unknown, [ $pdata->{label}, $robj->id, $robj->text, $type ] );
220 }
221 }
3bce8f64 222 _increment_typekey( $datahash, $type, $relation );
223 }
224}
225
226sub _increment_typekey {
227 my( $datahash, $type, $relation ) = @_;
228 my $typekey = substr( $type, 0, 3 ) . "_$relation";
229 unless( exists $datahash->{$typekey} ) {
230 $DB::single = 1;
231 warn "No field for $typekey";
3ec8b047 232 }
3bce8f64 233 $datahash->{$typekey}++;
234# # TODO distinguish parent-bad vs. rdg-bad
235# if( $robj && $robj->grammar_invalid ) {
236# $datahash->{$typekey.'_ungramm'} = 1;
237# } elsif( $robj && $robj->is_nonsense ) {
238# $datahash->{$typekey.'_nonsense'} = 1;
239# }
5c2fdc10 240}