Commit | Line | Data |
a23b3715 |
1 | #!/usr/bin/env perl |
2 | |
3 | use feature 'say'; |
4 | use lib 'lib'; |
5 | use strict; |
6 | use warnings; |
5c2fdc10 |
7 | use Getopt::Long; |
a23b3715 |
8 | use Text::CSV_XS; |
9 | use Text::Tradition::Analysis qw/ run_analysis /; |
10 | use Text::Tradition::Directory; |
5c2fdc10 |
11 | use TryCatch; |
a23b3715 |
12 | |
13 | binmode STDOUT, ':utf8'; |
14 | binmode STDERR, ':utf8'; |
15 | |
5c2fdc10 |
16 | my( $dsn, $dbuser, $dbpass ); |
3bce8f64 |
17 | my $filename = 'analysis.csv'; |
5c2fdc10 |
18 | GetOptions( |
19 | 'dsn=s' => \$dsn, |
20 | 'u|user=s' => \$dbuser, |
21 | 'p|pass=s' => \$dbpass, |
3bce8f64 |
22 | 'f|file=s' => \$filename |
5c2fdc10 |
23 | ); |
24 | |
25 | my %dbopts = ( dsn => $dsn ); |
26 | if( $dbuser || $dbpass ) { |
27 | $dbopts{extra_args} = { user => $dbuser, password => $dbpass } |
28 | } |
29 | |
30 | my $dir = Text::Tradition::Directory->new( %dbopts ); |
31 | my $scope = $dir->new_scope(); |
a23b3715 |
32 | my $lookfor = shift @ARGV || ''; |
ee26c4d9 |
33 | my %collapse; |
5c2fdc10 |
34 | if( @ARGV ) { |
35 | say "Merging relationship types @ARGV"; |
36 | map { $collapse{$_} = 1 } @ARGV; |
37 | } |
a23b3715 |
38 | |
3ec8b047 |
39 | my @relation_types = grep { !$collapse{$_} } |
ee26c4d9 |
40 | qw/ orthographic spelling grammatical lexical transposition repetition |
3bce8f64 |
41 | uncertain other addition deletion wordsimilar unknown source /; |
a23b3715 |
42 | |
43 | my @resultfields = qw/ |
3bce8f64 |
44 | text_name loc_total loc_singleton multiparent loc_totalvariant loc_genealogical loc_genvariant |
6a5f4aa7 |
45 | loc_conflict loc_conflictvariant loc_reverted loc_revertvariant /; |
ee26c4d9 |
46 | map { push( @resultfields, "gen_$_", "con_$_", "rev_$_" ) } @relation_types; |
a23b3715 |
47 | |
48 | my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } ); |
3bce8f64 |
49 | open my $fh, ">:encoding(UTF-8)", $filename or die "$filename: $!"; |
a23b3715 |
50 | if( $csv->combine( @resultfields ) ) { |
51 | say $fh $csv->string; |
52 | } else { |
53 | say "combine() failed on argument: " . $csv->error_input; |
54 | } |
55 | |
56 | foreach my $tinfo( $dir->traditionlist ) { |
57 | next if $tinfo->{'name'} eq 'xxxxx'; |
5c2fdc10 |
58 | next if $tinfo->{'name'} =~ /158/; |
59 | next if $tinfo->{'name'} =~ /Heinrichi part/; |
60 | if( $lookfor ) { |
61 | next unless $tinfo->{'id'} eq $lookfor |
62 | || $tinfo->{'name'} =~ /$lookfor/; |
63 | } |
a23b3715 |
64 | my $tradition = $dir->lookup( $tinfo->{'id'} ); |
5c2fdc10 |
65 | next unless $tradition->stemma_count; |
a23b3715 |
66 | say "Analyzing tradition " . $tradition->name; |
5c2fdc10 |
67 | ## HACK |
68 | my $MAXRANK; |
69 | if( $tradition->name =~ /Chronicle/ ) { |
70 | $MAXRANK = $tradition->collation->reading('L1545')->rank; |
71 | } |
a23b3715 |
72 | my %datahash; |
73 | # Initialize everything with zeroes |
74 | map { $datahash{$_} = 0 } @resultfields; |
75 | # Put in the real text ID and name |
76 | $datahash{text_id} = $tinfo->{'id'}; |
77 | $datahash{text_name} = $tradition->name; |
78 | |
79 | # Run the analysis for each row in @rows |
ead7032e |
80 | my %opts = ( |
81 | exclude_type1 => 1, |
ee26c4d9 |
82 | merge_types => [ 'punctuation' ] ); |
83 | if( keys %collapse ) { |
84 | push( @{$opts{merge_types}}, keys %collapse ); |
a23b3715 |
85 | } |
86 | |
5c2fdc10 |
87 | my $result; |
6a5f4aa7 |
88 | try { |
5c2fdc10 |
89 | $result = run_analysis( $tradition, %opts ); |
6a5f4aa7 |
90 | } catch { |
91 | say "Analysis run failed on tradition " . $tradition->name . ": @_"; |
92 | next; |
93 | } |
a23b3715 |
94 | $datahash{loc_total} = $result->{variant_count}; |
3bce8f64 |
95 | $datahash{multiparent} = 0; |
6a5f4aa7 |
96 | #$datahash{loc_genealogical} = $result->{genealogical_count}; |
97 | #$datahash{loc_conflictvariant} = $result->{conflict_count}; |
98 | #$datahash{loc_revertvariant} = $result->{reversion_count}; |
99 | # Get the totals by location and by variant as we go. |
a23b3715 |
100 | my $totalvariant = 0; |
6a5f4aa7 |
101 | my $singleton = 0; |
102 | my $genloc = 0; |
a23b3715 |
103 | my $genvariant = 0; |
3ec8b047 |
104 | my $conflictloc = 0; |
6a5f4aa7 |
105 | my $conflictvariant = 0; |
3ec8b047 |
106 | my $revertloc = 0; |
6a5f4aa7 |
107 | my $revertvariant = 0; |
108 | my $msgd; # for the HACK |
a23b3715 |
109 | my @unknown; |
110 | foreach my $loc ( @{$result->{variants}} ) { |
111 | # A transition is the relationship type between parent and child. |
112 | # Find each genealogical transition and get the relationship type (if any) |
113 | # Find each non-genealogical transition and get the relationship type (if any) |
3ec8b047 |
114 | my( $loc_conflict, $loc_reversion ); |
ee26c4d9 |
115 | if( exists $loc->{unsolved} ) { |
6a5f4aa7 |
116 | # Not solved; remove it from the total. |
117 | say "Skipping unsolved location at " . $loc->{id}; |
118 | $datahash{loc_total}--; |
ee26c4d9 |
119 | next; |
5c2fdc10 |
120 | } elsif( $MAXRANK && $loc->{id} > $MAXRANK ) { |
6a5f4aa7 |
121 | # HACK until Chronicle tagging is done |
122 | say "Skipping ranks above $MAXRANK" |
5c2fdc10 |
123 | unless $msgd; |
124 | $msgd = 1; |
125 | next; |
ee26c4d9 |
126 | } |
a23b3715 |
127 | foreach my $rdghash( @{$loc->{readings}} ) { |
128 | # Weed out singletons |
6a5f4aa7 |
129 | $totalvariant++; |
a23b3715 |
130 | my @roots = @{$rdghash->{independent_occurrence}}; |
6a5f4aa7 |
131 | if( @roots == 1 && !$rdghash->{'followed'} && !$rdghash->{'not_followed'} |
132 | && !$rdghash->{'follow_unknown'} ) { |
133 | $singleton++; |
134 | next; |
135 | } |
a23b3715 |
136 | my $rdg = $tradition->collation->reading( $rdghash->{readingid} ); |
3ec8b047 |
137 | my $type; |
138 | if( $rdghash->{'is_conflict'} ) { |
139 | $type = 'conflict'; |
140 | $loc_conflict = 1; |
6a5f4aa7 |
141 | $conflictvariant++; |
3ec8b047 |
142 | } elsif( $rdghash->{'is_reverted'} ) { |
143 | $type = 'reverted'; |
144 | $loc_reversion = 1; |
6a5f4aa7 |
145 | $revertvariant++; |
3ec8b047 |
146 | } elsif( @roots == 1 ) { |
147 | $type = 'genealogical'; |
148 | $genvariant++; |
149 | } else { |
150 | warn "Reading $rdg neither conflict, genealogical, nor reverted. What?"; |
151 | $type = 'ERROR'; |
152 | } |
153 | my $typekey = substr( $type, 0, 3 ) . '_'; |
154 | |
155 | # Add relation stats for reading parents. If the reading is reverted, |
156 | # treat it as genealogical for the parent. |
3bce8f64 |
157 | my $phash = $type eq 'reverted' |
158 | ? $rdghash->{'reversion_parents'} : $rdghash->{'source_parents'}; |
3ec8b047 |
159 | _add_reading_relations( $rdghash->{'readingid'}, $loc->{'id'}, $rdg, |
3bce8f64 |
160 | $type, $phash, \%datahash, \@unknown ); |
3ec8b047 |
161 | } |
162 | if( $loc_conflict ) { |
163 | $conflictloc++; |
164 | } elsif( $loc_reversion ) { |
165 | $revertloc++; |
6a5f4aa7 |
166 | } else { |
167 | $genloc++; |
a23b3715 |
168 | } |
169 | } |
170 | |
171 | # Add in the sums for the whole location |
6a5f4aa7 |
172 | $datahash{loc_totalvariant} = $totalvariant; |
173 | $datahash{loc_genealogical} = $genloc; |
174 | $datahash{loc_genvariant} = $genvariant; |
175 | $datahash{loc_conflict} = $conflictloc; |
176 | $datahash{loc_conflictvariant} = $conflictvariant; |
177 | $datahash{loc_reverted} = $revertloc; |
178 | $datahash{loc_revertvariant} = $revertvariant; |
179 | $datahash{loc_singleton} = $singleton; |
180 | $datahash{percent_genealogical} = $datahash{loc_genealogical} / $datahash{loc_total}; |
181 | $datahash{percent_genvariant} = $genvariant / $totalvariant; |
182 | $datahash{percent_genorrevert} = ( $genvariant + $datahash{loc_revertvariant} ) / $totalvariant; |
a23b3715 |
183 | |
184 | # Write them out to CSV. |
185 | my @csvalues = map { $datahash{$_} } @resultfields; |
186 | if( $csv->combine( @csvalues ) ) { |
187 | say $fh $csv->string; |
188 | } else { |
189 | say "combine() failed on argument: " . $csv->error_input; |
190 | } |
5c2fdc10 |
191 | map { say STDERR sprintf( "Unknown transition %s -> %s (%s) :%s", @$_ ) } @unknown; |
a23b3715 |
192 | } |
193 | |
194 | close $fh; |
3ec8b047 |
195 | |
196 | sub _add_reading_relations { |
197 | my( $rid, $rank, $robj, $type, $parenthash, $datahash, $unknown ) = @_; |
3bce8f64 |
198 | my @kp = keys ( %$parenthash ); |
199 | unless( @kp ) { |
200 | _increment_typekey( $datahash, $type, 'source' ); |
201 | return; |
202 | } |
203 | if( @kp > 1 ) { |
204 | $datahash->{multiparent} = @kp - 1; |
205 | } |
206 | foreach my $p ( @kp ) { |
3ec8b047 |
207 | my $pdata = $parenthash->{$p}; |
208 | my $relation; |
209 | if( $pdata->{relation} ) { |
3bce8f64 |
210 | $relation = $pdata->{relation}->{transposed} |
211 | ? 'transposition' : $pdata->{relation}->{type}; |
3ec8b047 |
212 | } else { |
213 | $relation = 'unknown'; |
214 | if( !$robj ) { |
215 | say "Unknown relation on missing reading object $rid at rank $rank"; |
216 | } elsif( !$pdata ) { |
217 | say "Unknown relation on missing parent object for $rid at rank $rank"; |
218 | } else { |
219 | push( @$unknown, [ $pdata->{label}, $robj->id, $robj->text, $type ] ); |
220 | } |
221 | } |
3bce8f64 |
222 | _increment_typekey( $datahash, $type, $relation ); |
223 | } |
224 | } |
225 | |
226 | sub _increment_typekey { |
227 | my( $datahash, $type, $relation ) = @_; |
228 | my $typekey = substr( $type, 0, 3 ) . "_$relation"; |
229 | unless( exists $datahash->{$typekey} ) { |
230 | $DB::single = 1; |
231 | warn "No field for $typekey"; |
3ec8b047 |
232 | } |
3bce8f64 |
233 | $datahash->{$typekey}++; |
234 | # # TODO distinguish parent-bad vs. rdg-bad |
235 | # if( $robj && $robj->grammar_invalid ) { |
236 | # $datahash->{$typekey.'_ungramm'} = 1; |
237 | # } elsif( $robj && $robj->is_nonsense ) { |
238 | # $datahash->{$typekey.'_nonsense'} = 1; |
239 | # } |
5c2fdc10 |
240 | } |