notice and save changes to graph name / stemma identifier (tla/stemmaweb#28)
[scpubgit/stemmatology.git] / analysis / script / exclude.pl
CommitLineData
31573367 1#!/usr/bin/env perl
2
3use feature 'say';
4use lib 'lib';
5use strict;
6use warnings;
7use Getopt::Long;
8use Text::CSV_XS;
9use Text::Tradition::Analysis qw/ run_analysis /;
10use Text::Tradition::Directory;
11use TryCatch;
12
13binmode STDOUT, ':utf8';
14binmode STDERR, ':utf8';
15
16my( $dsn, $dbuser, $dbpass );
6ad2c61a 17my $filename = 'exclusions.csv';
31573367 18GetOptions(
19 'dsn=s' => \$dsn,
20 'u|user=s' => \$dbuser,
21 'p|pass=s' => \$dbpass,
22 'f|file=s' => \$filename
23);
24
25my %dbopts = ( dsn => $dsn );
26if( $dbuser || $dbpass ) {
27 $dbopts{extra_args} = { user => $dbuser, password => $dbpass }
28}
29
30my $dir = Text::Tradition::Directory->new( %dbopts );
31my $scope = $dir->new_scope();
32my $lookfor = shift @ARGV || '';
31573367 33
34## Set up the relationship types we will exclude in turn. False means "run
35## analysis with basic set of exclusions", i.e. orth/spelling/punct, and exclude
36## the variants in question later. True means "explicitly exclude this type too
37## at analysis time."
861e4c6d 38my @relation_types = qw/ none sameword grammatical lexical uncertain other
6ad2c61a 39 addition deletion transposition /;
31573367 40
41# Set up the things we want to calculate for each text
42my @calcs = qw/ total genealogical excoincidental reverted exgenealogical coincidental /;
43my @resultfields = ( 'text_name' );
6ad2c61a 44foreach my $rt ( @relation_types ) {
31573367 45 foreach my $cc ( @calcs ) {
46 push( @resultfields, sprintf( "%s_ex_%s", $cc, $rt ) );
47 }
48}
49
50my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } );
51open my $fh, ">:encoding(UTF-8)", $filename or die "$filename: $!";
52if( $csv->combine( @resultfields ) ) {
53 say $fh $csv->string;
54} else {
55 say "combine() failed on argument: " . $csv->error_input;
56}
57
58foreach my $tinfo( $dir->traditionlist ) {
59 next if $tinfo->{'name'} eq 'xxxxx';
60 next if $tinfo->{'name'} =~ /158/;
61 next if $tinfo->{'name'} =~ /Heinrichi part/;
62 if( $lookfor ) {
63 next unless $tinfo->{'id'} eq $lookfor
64 || $tinfo->{'name'} =~ /$lookfor/;
65 }
66 my $tradition = $dir->lookup( $tinfo->{'id'} );
67 next unless $tradition->stemma_count;
68 say "Analyzing tradition " . $tradition->name;
69 ## HACK
70 my $MAXRANK;
71 if( $tradition->name =~ /Chronicle/ ) {
72 $MAXRANK = $tradition->collation->reading('L1545')->rank;
73 }
74 my %datahash;
75 # Initialize everything with zeroes
76 map { $datahash{$_} = 0 } @resultfields;
77 # Put in the real text ID and name
78 $datahash{text_id} = $tinfo->{'id'};
79 $datahash{text_name} = $tradition->name;
80
861e4c6d 81 my $fullresult;
82 my $noorthresult;
6ad2c61a 83 try {
861e4c6d 84 $fullresult = run_analysis( $tradition, exclude_type1 => 1 );
85 $noorthresult = run_analysis( $tradition, exclude_type1 => 1,
6ad2c61a 86 merge_types => [ qw/ orthographic spelling punctuation / ] );
87 } catch {
88 say "Analysis run failed on tradition " . $tradition->name . ": @_";
89 return;
90 }
91 foreach my $rtype ( @relation_types ) {
aee39255 92 say "...calculating on exclusion of $rtype";
861e4c6d 93 my $result = $rtype eq 'none' ? $fullresult : $noorthresult;
31573367 94
95 # Get the totals by location and by variant as we go.
96 my $totalvariant = 0;
97 my $singleton = 0;
98 my $genvariant = 0;
99 my $conflictvariant = 0;
100 my $revertvariant = 0;
101 my $msgd; # for the HACK
102 my @unknown;
103 foreach my $loc ( @{$result->{variants}} ) {
104 # A transition is the relationship type between parent and child.
105 # Find each genealogical transition
106 # Find each non-genealogical transition
107 if( exists $loc->{unsolved} ) {
108 # Not solved; remove it from the total.
109 say "Skipping unsolved location at " . $loc->{id};
110 next;
111 } elsif( $MAXRANK && $loc->{id} > $MAXRANK ) {
112 # HACK until Chronicle tagging is done
113 say "Skipping ranks above $MAXRANK"
114 unless $msgd;
115 $msgd = 1;
116 next;
117 }
118 foreach my $rdghash( @{$loc->{readings}} ) {
119 # Weed out singletons
120 $totalvariant++;
121 my @roots = @{$rdghash->{independent_occurrence}};
122 if( @roots == 1 && !$rdghash->{'followed'} && !$rdghash->{'not_followed'}
123 && !$rdghash->{'follow_unknown'} ) {
124 $singleton++;
125 next;
126 }
127 my $type;
128 if( $rdghash->{'is_conflict'} ) {
129 $type = 'conflict';
31573367 130 } elsif( $rdghash->{'is_reverted'} ) {
131 $type = 'reverted';
31573367 132 } elsif( @roots == 1 ) {
133 $type = 'genealogical';
31573367 134 } else {
135 warn 'Reading ' . $rdghash->{readingid} . ' neither conflict, genealogical, nor reverted. What?';
136 $type = 'ERROR';
137 }
aee39255 138 # Get the relationship type stats for reading parents.
139 my $rdg = $tradition->collation->reading( $rdghash->{readingid} );
140
141 my $phash = $type eq 'reverted'
142 ? $rdghash->{'reversion_parents'} : $rdghash->{'source_parents'};
6ad2c61a 143 my @rels = _get_reading_relations( $rdghash->{'readingid'}, $loc->{'id'},
aee39255 144 $rdg, $type, $phash, \%datahash, \@unknown );
145 # If this is one of our exclusions, take it out of the total.
6ad2c61a 146 foreach my $rel ( @rels ) {
6ad2c61a 147 if( $rel eq $rtype ) {
148 $totalvariant--;
149 next;
150 } else {
aee39255 151 # Otherwise add the variant type to our count.
6ad2c61a 152 $conflictvariant++ if $type eq 'conflict';
153 $revertvariant++ if $type eq 'reverted';
154 $genvariant++ if $type eq 'genealogical';
155 }
156 }
31573367 157 }
158 }
159
160 # Add in the sums for the whole location
6ad2c61a 161 $datahash{"total_ex_$rtype"} = $totalvariant - $singleton;
aee39255 162 $datahash{"genealogical_ex_$rtype"} = $genvariant;
163 $datahash{"reverted_ex_$rtype"} = $revertvariant;
164 $datahash{"coincidental_ex_$rtype"} = $conflictvariant;
165 $datahash{"excoincidental_ex_$rtype"} = $genvariant + $revertvariant;
6ad2c61a 166 $datahash{"exgenealogical_ex_$rtype"} = $conflictvariant + $revertvariant;
31573367 167 }
168
169 # Write them out to CSV.
170 my @csvalues = map { $datahash{$_} } @resultfields;
171 if( $csv->combine( @csvalues ) ) {
172 say $fh $csv->string;
173 } else {
174 say "combine() failed on argument: " . $csv->error_input;
175 }
176}
177
178close $fh;
179
aee39255 180sub _get_reading_relations {
181 my( $rid, $rank, $robj, $type, $parenthash, $datahash, $unknown ) = @_;
182 my @kp = keys ( %$parenthash );
861e4c6d 183 return ( 'source' ) unless @kp; # In case there is no parent reading to relate.
184
6ad2c61a 185 my @rels;
aee39255 186 foreach my $p ( @kp ) {
187 my $pdata = $parenthash->{$p};
188 my $relation;
189 if( $pdata->{relation} ) {
190 $relation = $pdata->{relation}->{transposed}
191 ? 'transposition' : $pdata->{relation}->{type};
192 } else {
193 $relation = 'unknown';
194 if( !$robj ) {
195 say "Unknown relation on missing reading object $rid at rank $rank";
196 } elsif( !$pdata ) {
197 say "Unknown relation on missing parent object for $rid at rank $rank";
198 } else {
199 push( @$unknown, [ $pdata->{label}, $robj->id, $robj->text, $type ] );
200 }
201 }
6ad2c61a 202 push( @rels, $relation );
aee39255 203 }
6ad2c61a 204 return @rels;
aee39255 205}
206