Commit | Line | Data |
31573367 |
1 | #!/usr/bin/env perl |
2 | |
3 | use feature 'say'; |
4 | use lib 'lib'; |
5 | use strict; |
6 | use warnings; |
7 | use Getopt::Long; |
8 | use Text::CSV_XS; |
9 | use Text::Tradition::Analysis qw/ run_analysis /; |
10 | use Text::Tradition::Directory; |
11 | use TryCatch; |
12 | |
13 | binmode STDOUT, ':utf8'; |
14 | binmode STDERR, ':utf8'; |
15 | |
16 | my( $dsn, $dbuser, $dbpass ); |
17 | my $filename = 'analysis.csv'; |
18 | GetOptions( |
19 | 'dsn=s' => \$dsn, |
20 | 'u|user=s' => \$dbuser, |
21 | 'p|pass=s' => \$dbpass, |
22 | 'f|file=s' => \$filename |
23 | ); |
24 | |
25 | my %dbopts = ( dsn => $dsn ); |
26 | if( $dbuser || $dbpass ) { |
27 | $dbopts{extra_args} = { user => $dbuser, password => $dbpass } |
28 | } |
29 | |
30 | my $dir = Text::Tradition::Directory->new( %dbopts ); |
31 | my $scope = $dir->new_scope(); |
32 | my $lookfor = shift @ARGV || ''; |
33 | my %collapse; |
34 | if( @ARGV ) { |
35 | say "Merging relationship types @ARGV"; |
36 | map { $collapse{$_} = 1 } @ARGV; |
37 | } |
38 | |
39 | ## Set up the relationship types we will exclude in turn. False means "run |
40 | ## analysis with basic set of exclusions", i.e. orth/spelling/punct, and exclude |
41 | ## the variants in question later. True means "explicitly exclude this type too |
42 | ## at analysis time." |
43 | my %relation_types = ( |
44 | sameword => undef, |
45 | grammatical => 1, |
46 | lexical => 1, |
47 | uncertain => 1, |
48 | other => 1, |
49 | addition => undef, |
50 | deletion => undef |
51 | ); |
52 | |
53 | # Set up the things we want to calculate for each text |
54 | my @calcs = qw/ total genealogical excoincidental reverted exgenealogical coincidental /; |
55 | my @resultfields = ( 'text_name' ); |
56 | foreach my $rt ( keys %relation_types ) { |
57 | foreach my $cc ( @calcs ) { |
58 | push( @resultfields, sprintf( "%s_ex_%s", $cc, $rt ) ); |
59 | } |
60 | } |
61 | |
62 | my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } ); |
63 | open my $fh, ">:encoding(UTF-8)", $filename or die "$filename: $!"; |
64 | if( $csv->combine( @resultfields ) ) { |
65 | say $fh $csv->string; |
66 | } else { |
67 | say "combine() failed on argument: " . $csv->error_input; |
68 | } |
69 | |
70 | foreach my $tinfo( $dir->traditionlist ) { |
71 | next if $tinfo->{'name'} eq 'xxxxx'; |
72 | next if $tinfo->{'name'} =~ /158/; |
73 | next if $tinfo->{'name'} =~ /Heinrichi part/; |
74 | if( $lookfor ) { |
75 | next unless $tinfo->{'id'} eq $lookfor |
76 | || $tinfo->{'name'} =~ /$lookfor/; |
77 | } |
78 | my $tradition = $dir->lookup( $tinfo->{'id'} ); |
79 | next unless $tradition->stemma_count; |
80 | say "Analyzing tradition " . $tradition->name; |
81 | ## HACK |
82 | my $MAXRANK; |
83 | if( $tradition->name =~ /Chronicle/ ) { |
84 | $MAXRANK = $tradition->collation->reading('L1545')->rank; |
85 | } |
86 | my %datahash; |
87 | # Initialize everything with zeroes |
88 | map { $datahash{$_} = 0 } @resultfields; |
89 | # Put in the real text ID and name |
90 | $datahash{text_id} = $tinfo->{'id'}; |
91 | $datahash{text_name} = $tradition->name; |
92 | |
93 | # Run the analysis for each row in @rows |
94 | my $vanilla; # Store the run with no extra exclusions |
95 | my $result; |
aee39255 |
96 | foreach my $rtype ( keys %relation_types ) { |
97 | say "...calculating on exclusion of $rtype"; |
98 | if( $relation_types{$rtype} ) { |
99 | $result = run_exclude( $tradition, $rtype ); |
31573367 |
100 | } elsif( !$vanilla ) { |
101 | $result = run_exclude( $tradition ); |
102 | $vanilla = $result; |
103 | } else { |
104 | $result = $vanilla; |
105 | } |
106 | |
107 | # Get the totals by location and by variant as we go. |
108 | my $totalvariant = 0; |
109 | my $singleton = 0; |
110 | my $genvariant = 0; |
111 | my $conflictvariant = 0; |
112 | my $revertvariant = 0; |
113 | my $msgd; # for the HACK |
114 | my @unknown; |
115 | foreach my $loc ( @{$result->{variants}} ) { |
116 | # A transition is the relationship type between parent and child. |
117 | # Find each genealogical transition |
118 | # Find each non-genealogical transition |
119 | if( exists $loc->{unsolved} ) { |
120 | # Not solved; remove it from the total. |
121 | say "Skipping unsolved location at " . $loc->{id}; |
122 | next; |
123 | } elsif( $MAXRANK && $loc->{id} > $MAXRANK ) { |
124 | # HACK until Chronicle tagging is done |
125 | say "Skipping ranks above $MAXRANK" |
126 | unless $msgd; |
127 | $msgd = 1; |
128 | next; |
129 | } |
130 | foreach my $rdghash( @{$loc->{readings}} ) { |
131 | # Weed out singletons |
132 | $totalvariant++; |
133 | my @roots = @{$rdghash->{independent_occurrence}}; |
134 | if( @roots == 1 && !$rdghash->{'followed'} && !$rdghash->{'not_followed'} |
135 | && !$rdghash->{'follow_unknown'} ) { |
136 | $singleton++; |
137 | next; |
138 | } |
139 | my $type; |
140 | if( $rdghash->{'is_conflict'} ) { |
141 | $type = 'conflict'; |
31573367 |
142 | } elsif( $rdghash->{'is_reverted'} ) { |
143 | $type = 'reverted'; |
31573367 |
144 | } elsif( @roots == 1 ) { |
145 | $type = 'genealogical'; |
31573367 |
146 | } else { |
147 | warn 'Reading ' . $rdghash->{readingid} . ' neither conflict, genealogical, nor reverted. What?'; |
148 | $type = 'ERROR'; |
149 | } |
aee39255 |
150 | # Get the relationship type stats for reading parents. |
151 | my $rdg = $tradition->collation->reading( $rdghash->{readingid} ); |
152 | |
153 | my $phash = $type eq 'reverted' |
154 | ? $rdghash->{'reversion_parents'} : $rdghash->{'source_parents'}; |
155 | my $rel = _get_reading_relations( $rdghash->{'readingid'}, $loc->{'id'}, |
156 | $rdg, $type, $phash, \%datahash, \@unknown ); |
157 | # If this is one of our exclusions, take it out of the total. |
158 | if( $rel eq $rtype ) { |
159 | $totalvariant--; |
160 | next; |
161 | } |
162 | # Otherwise add the variant type to our count. |
163 | $conflictvariant++ if $type eq 'conflict'; |
164 | $revertvariant++ if $type eq 'reverted'; |
165 | $genvariant++ if $type eq 'genealogical'; |
31573367 |
166 | } |
167 | } |
168 | |
169 | # Add in the sums for the whole location |
aee39255 |
170 | $datahash{"total_$rtype"} = $totalvariant - $singleton; |
171 | $datahash{"genealogical_ex_$rtype"} = $genvariant; |
172 | $datahash{"reverted_ex_$rtype"} = $revertvariant; |
173 | $datahash{"coincidental_ex_$rtype"} = $conflictvariant; |
174 | $datahash{"excoincidental_ex_$rtype"} = $genvariant + $revertvariant; |
31573367 |
175 | $datahash{"exgenealogical_ex_type"} = $conflictvariant + $revertvariant; |
176 | } |
177 | |
178 | # Write them out to CSV. |
179 | my @csvalues = map { $datahash{$_} } @resultfields; |
180 | if( $csv->combine( @csvalues ) ) { |
181 | say $fh $csv->string; |
182 | } else { |
183 | say "combine() failed on argument: " . $csv->error_input; |
184 | } |
185 | } |
186 | |
187 | close $fh; |
188 | |
aee39255 |
189 | sub _get_reading_relations { |
190 | my( $rid, $rank, $robj, $type, $parenthash, $datahash, $unknown ) = @_; |
191 | my @kp = keys ( %$parenthash ); |
192 | unless( @kp ) { |
193 | _increment_typekey( $datahash, $type, 'source' ); |
194 | return; |
195 | } |
196 | if( @kp > 1 ) { |
197 | $datahash->{multiparent} = @kp - 1; |
198 | } |
199 | foreach my $p ( @kp ) { |
200 | my $pdata = $parenthash->{$p}; |
201 | my $relation; |
202 | if( $pdata->{relation} ) { |
203 | $relation = $pdata->{relation}->{transposed} |
204 | ? 'transposition' : $pdata->{relation}->{type}; |
205 | } else { |
206 | $relation = 'unknown'; |
207 | if( !$robj ) { |
208 | say "Unknown relation on missing reading object $rid at rank $rank"; |
209 | } elsif( !$pdata ) { |
210 | say "Unknown relation on missing parent object for $rid at rank $rank"; |
211 | } else { |
212 | push( @$unknown, [ $pdata->{label}, $robj->id, $robj->text, $type ] ); |
213 | } |
214 | } |
215 | return $relation; |
216 | } |
217 | } |
218 | |
31573367 |
219 | sub run_exclude { |
220 | my( $tradition, $type ) = @_; |
221 | my $merge = [ qw/ orthographic spelling punctuation / ]; |
222 | if( $type && $relation_types{$type} ) { |
223 | push( @$merge, $type ); |
224 | } |
225 | |
226 | my $result; |
227 | try { |
228 | $result = run_analysis( $tradition, exclude_type1 => 1, |
229 | merge_types => $merge ); |
230 | } catch { |
231 | say "Analysis run failed on tradition " . $tradition->name . ": @_"; |
232 | return; |
233 | } |
234 | return $result; |
235 | } |