another script to exclude relatioship types in turn from the analysis
[scpubgit/stemmatology.git] / analysis / script / exclude.pl
CommitLineData
31573367 1#!/usr/bin/env perl
2
3use feature 'say';
4use lib 'lib';
5use strict;
6use warnings;
7use Getopt::Long;
8use Text::CSV_XS;
9use Text::Tradition::Analysis qw/ run_analysis /;
10use Text::Tradition::Directory;
11use TryCatch;
12
13binmode STDOUT, ':utf8';
14binmode STDERR, ':utf8';
15
16my( $dsn, $dbuser, $dbpass );
17my $filename = 'analysis.csv';
18GetOptions(
19 'dsn=s' => \$dsn,
20 'u|user=s' => \$dbuser,
21 'p|pass=s' => \$dbpass,
22 'f|file=s' => \$filename
23);
24
25my %dbopts = ( dsn => $dsn );
26if( $dbuser || $dbpass ) {
27 $dbopts{extra_args} = { user => $dbuser, password => $dbpass }
28}
29
30my $dir = Text::Tradition::Directory->new( %dbopts );
31my $scope = $dir->new_scope();
32my $lookfor = shift @ARGV || '';
33my %collapse;
34if( @ARGV ) {
35 say "Merging relationship types @ARGV";
36 map { $collapse{$_} = 1 } @ARGV;
37}
38
39## Set up the relationship types we will exclude in turn. False means "run
40## analysis with basic set of exclusions", i.e. orth/spelling/punct, and exclude
41## the variants in question later. True means "explicitly exclude this type too
42## at analysis time."
43my %relation_types = (
44 sameword => undef,
45 grammatical => 1,
46 lexical => 1,
47 uncertain => 1,
48 other => 1,
49 addition => undef,
50 deletion => undef
51);
52
53# Set up the things we want to calculate for each text
54my @calcs = qw/ total genealogical excoincidental reverted exgenealogical coincidental /;
55my @resultfields = ( 'text_name' );
56foreach my $rt ( keys %relation_types ) {
57 foreach my $cc ( @calcs ) {
58 push( @resultfields, sprintf( "%s_ex_%s", $cc, $rt ) );
59 }
60}
61
62my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } );
63open my $fh, ">:encoding(UTF-8)", $filename or die "$filename: $!";
64if( $csv->combine( @resultfields ) ) {
65 say $fh $csv->string;
66} else {
67 say "combine() failed on argument: " . $csv->error_input;
68}
69
70foreach my $tinfo( $dir->traditionlist ) {
71 next if $tinfo->{'name'} eq 'xxxxx';
72 next if $tinfo->{'name'} =~ /158/;
73 next if $tinfo->{'name'} =~ /Heinrichi part/;
74 if( $lookfor ) {
75 next unless $tinfo->{'id'} eq $lookfor
76 || $tinfo->{'name'} =~ /$lookfor/;
77 }
78 my $tradition = $dir->lookup( $tinfo->{'id'} );
79 next unless $tradition->stemma_count;
80 say "Analyzing tradition " . $tradition->name;
81 ## HACK
82 my $MAXRANK;
83 if( $tradition->name =~ /Chronicle/ ) {
84 $MAXRANK = $tradition->collation->reading('L1545')->rank;
85 }
86 my %datahash;
87 # Initialize everything with zeroes
88 map { $datahash{$_} = 0 } @resultfields;
89 # Put in the real text ID and name
90 $datahash{text_id} = $tinfo->{'id'};
91 $datahash{text_name} = $tradition->name;
92
93 # Run the analysis for each row in @rows
94 my $vanilla; # Store the run with no extra exclusions
95 my $result;
96 foreach my $type ( keys %relation_types ) {
97 say "...calculating on exclusion of $type";
98 if( $relation_types{$type} ) {
99 $result = run_exclude( $tradition, $type );
100 } elsif( !$vanilla ) {
101 $result = run_exclude( $tradition );
102 $vanilla = $result;
103 } else {
104 $result = $vanilla;
105 }
106
107 # Get the totals by location and by variant as we go.
108 my $totalvariant = 0;
109 my $singleton = 0;
110 my $genvariant = 0;
111 my $conflictvariant = 0;
112 my $revertvariant = 0;
113 my $msgd; # for the HACK
114 my @unknown;
115 foreach my $loc ( @{$result->{variants}} ) {
116 # A transition is the relationship type between parent and child.
117 # Find each genealogical transition
118 # Find each non-genealogical transition
119 if( exists $loc->{unsolved} ) {
120 # Not solved; remove it from the total.
121 say "Skipping unsolved location at " . $loc->{id};
122 next;
123 } elsif( $MAXRANK && $loc->{id} > $MAXRANK ) {
124 # HACK until Chronicle tagging is done
125 say "Skipping ranks above $MAXRANK"
126 unless $msgd;
127 $msgd = 1;
128 next;
129 }
130 foreach my $rdghash( @{$loc->{readings}} ) {
131 # Weed out singletons
132 $totalvariant++;
133 my @roots = @{$rdghash->{independent_occurrence}};
134 if( @roots == 1 && !$rdghash->{'followed'} && !$rdghash->{'not_followed'}
135 && !$rdghash->{'follow_unknown'} ) {
136 $singleton++;
137 next;
138 }
139 my $type;
140 if( $rdghash->{'is_conflict'} ) {
141 $type = 'conflict';
142 $conflictvariant++;
143 } elsif( $rdghash->{'is_reverted'} ) {
144 $type = 'reverted';
145 $revertvariant++;
146 } elsif( @roots == 1 ) {
147 $type = 'genealogical';
148 $genvariant++;
149 } else {
150 warn 'Reading ' . $rdghash->{readingid} . ' neither conflict, genealogical, nor reverted. What?';
151 $type = 'ERROR';
152 }
153 }
154 }
155
156 # Add in the sums for the whole location
157 $datahash{"total_$type"} = $totalvariant - $singleton;
158 $datahash{"genealogical_ex_$type"} = $genvariant;
159 $datahash{"reverted_ex_$type"} = $revertvariant;
160 $datahash{"coincidental_ex_$type"} = $conflictvariant;
161 $datahash{"excoincidental_ex_type"} = $genvariant + $revertvariant;
162 $datahash{"exgenealogical_ex_type"} = $conflictvariant + $revertvariant;
163 }
164
165 # Write them out to CSV.
166 my @csvalues = map { $datahash{$_} } @resultfields;
167 if( $csv->combine( @csvalues ) ) {
168 say $fh $csv->string;
169 } else {
170 say "combine() failed on argument: " . $csv->error_input;
171 }
172}
173
174close $fh;
175
176sub run_exclude {
177 my( $tradition, $type ) = @_;
178 my $merge = [ qw/ orthographic spelling punctuation / ];
179 if( $type && $relation_types{$type} ) {
180 push( @$merge, $type );
181 }
182
183 my $result;
184 try {
185 $result = run_analysis( $tradition, exclude_type1 => 1,
186 merge_types => $merge );
187 } catch {
188 say "Analysis run failed on tradition " . $tradition->name . ": @_";
189 return;
190 }
191 return $result;
192}