Commit | Line | Data |
31573367 |
1 | #!/usr/bin/env perl |
2 | |
3 | use feature 'say'; |
4 | use lib 'lib'; |
5 | use strict; |
6 | use warnings; |
7 | use Getopt::Long; |
8 | use Text::CSV_XS; |
9 | use Text::Tradition::Analysis qw/ run_analysis /; |
10 | use Text::Tradition::Directory; |
11 | use TryCatch; |
12 | |
13 | binmode STDOUT, ':utf8'; |
14 | binmode STDERR, ':utf8'; |
15 | |
16 | my( $dsn, $dbuser, $dbpass ); |
6ad2c61a |
17 | my $filename = 'exclusions.csv'; |
31573367 |
18 | GetOptions( |
19 | 'dsn=s' => \$dsn, |
20 | 'u|user=s' => \$dbuser, |
21 | 'p|pass=s' => \$dbpass, |
22 | 'f|file=s' => \$filename |
23 | ); |
24 | |
25 | my %dbopts = ( dsn => $dsn ); |
26 | if( $dbuser || $dbpass ) { |
27 | $dbopts{extra_args} = { user => $dbuser, password => $dbpass } |
28 | } |
29 | |
30 | my $dir = Text::Tradition::Directory->new( %dbopts ); |
31 | my $scope = $dir->new_scope(); |
32 | my $lookfor = shift @ARGV || ''; |
31573367 |
33 | |
34 | ## Set up the relationship types we will exclude in turn. False means "run |
35 | ## analysis with basic set of exclusions", i.e. orth/spelling/punct, and exclude |
36 | ## the variants in question later. True means "explicitly exclude this type too |
37 | ## at analysis time." |
861e4c6d |
38 | my @relation_types = qw/ none sameword grammatical lexical uncertain other |
6ad2c61a |
39 | addition deletion transposition /; |
31573367 |
40 | |
41 | # Set up the things we want to calculate for each text |
42 | my @calcs = qw/ total genealogical excoincidental reverted exgenealogical coincidental /; |
43 | my @resultfields = ( 'text_name' ); |
6ad2c61a |
44 | foreach my $rt ( @relation_types ) { |
31573367 |
45 | foreach my $cc ( @calcs ) { |
46 | push( @resultfields, sprintf( "%s_ex_%s", $cc, $rt ) ); |
47 | } |
48 | } |
49 | |
50 | my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } ); |
51 | open my $fh, ">:encoding(UTF-8)", $filename or die "$filename: $!"; |
52 | if( $csv->combine( @resultfields ) ) { |
53 | say $fh $csv->string; |
54 | } else { |
55 | say "combine() failed on argument: " . $csv->error_input; |
56 | } |
57 | |
58 | foreach my $tinfo( $dir->traditionlist ) { |
59 | next if $tinfo->{'name'} eq 'xxxxx'; |
60 | next if $tinfo->{'name'} =~ /158/; |
61 | next if $tinfo->{'name'} =~ /Heinrichi part/; |
62 | if( $lookfor ) { |
63 | next unless $tinfo->{'id'} eq $lookfor |
64 | || $tinfo->{'name'} =~ /$lookfor/; |
65 | } |
66 | my $tradition = $dir->lookup( $tinfo->{'id'} ); |
67 | next unless $tradition->stemma_count; |
68 | say "Analyzing tradition " . $tradition->name; |
69 | ## HACK |
70 | my $MAXRANK; |
71 | if( $tradition->name =~ /Chronicle/ ) { |
72 | $MAXRANK = $tradition->collation->reading('L1545')->rank; |
73 | } |
74 | my %datahash; |
75 | # Initialize everything with zeroes |
76 | map { $datahash{$_} = 0 } @resultfields; |
77 | # Put in the real text ID and name |
78 | $datahash{text_id} = $tinfo->{'id'}; |
79 | $datahash{text_name} = $tradition->name; |
80 | |
861e4c6d |
81 | my $fullresult; |
82 | my $noorthresult; |
6ad2c61a |
83 | try { |
861e4c6d |
84 | $fullresult = run_analysis( $tradition, exclude_type1 => 1 ); |
85 | $noorthresult = run_analysis( $tradition, exclude_type1 => 1, |
6ad2c61a |
86 | merge_types => [ qw/ orthographic spelling punctuation / ] ); |
87 | } catch { |
88 | say "Analysis run failed on tradition " . $tradition->name . ": @_"; |
89 | return; |
90 | } |
91 | foreach my $rtype ( @relation_types ) { |
aee39255 |
92 | say "...calculating on exclusion of $rtype"; |
861e4c6d |
93 | my $result = $rtype eq 'none' ? $fullresult : $noorthresult; |
31573367 |
94 | |
95 | # Get the totals by location and by variant as we go. |
96 | my $totalvariant = 0; |
97 | my $singleton = 0; |
98 | my $genvariant = 0; |
99 | my $conflictvariant = 0; |
100 | my $revertvariant = 0; |
101 | my $msgd; # for the HACK |
102 | my @unknown; |
103 | foreach my $loc ( @{$result->{variants}} ) { |
104 | # A transition is the relationship type between parent and child. |
105 | # Find each genealogical transition |
106 | # Find each non-genealogical transition |
107 | if( exists $loc->{unsolved} ) { |
108 | # Not solved; remove it from the total. |
109 | say "Skipping unsolved location at " . $loc->{id}; |
110 | next; |
111 | } elsif( $MAXRANK && $loc->{id} > $MAXRANK ) { |
112 | # HACK until Chronicle tagging is done |
113 | say "Skipping ranks above $MAXRANK" |
114 | unless $msgd; |
115 | $msgd = 1; |
116 | next; |
117 | } |
118 | foreach my $rdghash( @{$loc->{readings}} ) { |
119 | # Weed out singletons |
120 | $totalvariant++; |
121 | my @roots = @{$rdghash->{independent_occurrence}}; |
122 | if( @roots == 1 && !$rdghash->{'followed'} && !$rdghash->{'not_followed'} |
123 | && !$rdghash->{'follow_unknown'} ) { |
124 | $singleton++; |
125 | next; |
126 | } |
127 | my $type; |
128 | if( $rdghash->{'is_conflict'} ) { |
129 | $type = 'conflict'; |
31573367 |
130 | } elsif( $rdghash->{'is_reverted'} ) { |
131 | $type = 'reverted'; |
31573367 |
132 | } elsif( @roots == 1 ) { |
133 | $type = 'genealogical'; |
31573367 |
134 | } else { |
135 | warn 'Reading ' . $rdghash->{readingid} . ' neither conflict, genealogical, nor reverted. What?'; |
136 | $type = 'ERROR'; |
137 | } |
aee39255 |
138 | # Get the relationship type stats for reading parents. |
139 | my $rdg = $tradition->collation->reading( $rdghash->{readingid} ); |
140 | |
141 | my $phash = $type eq 'reverted' |
142 | ? $rdghash->{'reversion_parents'} : $rdghash->{'source_parents'}; |
6ad2c61a |
143 | my @rels = _get_reading_relations( $rdghash->{'readingid'}, $loc->{'id'}, |
aee39255 |
144 | $rdg, $type, $phash, \%datahash, \@unknown ); |
145 | # If this is one of our exclusions, take it out of the total. |
6ad2c61a |
146 | foreach my $rel ( @rels ) { |
6ad2c61a |
147 | if( $rel eq $rtype ) { |
148 | $totalvariant--; |
149 | next; |
150 | } else { |
aee39255 |
151 | # Otherwise add the variant type to our count. |
6ad2c61a |
152 | $conflictvariant++ if $type eq 'conflict'; |
153 | $revertvariant++ if $type eq 'reverted'; |
154 | $genvariant++ if $type eq 'genealogical'; |
155 | } |
156 | } |
31573367 |
157 | } |
158 | } |
159 | |
160 | # Add in the sums for the whole location |
6ad2c61a |
161 | $datahash{"total_ex_$rtype"} = $totalvariant - $singleton; |
aee39255 |
162 | $datahash{"genealogical_ex_$rtype"} = $genvariant; |
163 | $datahash{"reverted_ex_$rtype"} = $revertvariant; |
164 | $datahash{"coincidental_ex_$rtype"} = $conflictvariant; |
165 | $datahash{"excoincidental_ex_$rtype"} = $genvariant + $revertvariant; |
6ad2c61a |
166 | $datahash{"exgenealogical_ex_$rtype"} = $conflictvariant + $revertvariant; |
31573367 |
167 | } |
168 | |
169 | # Write them out to CSV. |
170 | my @csvalues = map { $datahash{$_} } @resultfields; |
171 | if( $csv->combine( @csvalues ) ) { |
172 | say $fh $csv->string; |
173 | } else { |
174 | say "combine() failed on argument: " . $csv->error_input; |
175 | } |
176 | } |
177 | |
178 | close $fh; |
179 | |
aee39255 |
180 | sub _get_reading_relations { |
181 | my( $rid, $rank, $robj, $type, $parenthash, $datahash, $unknown ) = @_; |
182 | my @kp = keys ( %$parenthash ); |
861e4c6d |
183 | return ( 'source' ) unless @kp; # In case there is no parent reading to relate. |
184 | |
6ad2c61a |
185 | my @rels; |
aee39255 |
186 | foreach my $p ( @kp ) { |
187 | my $pdata = $parenthash->{$p}; |
188 | my $relation; |
189 | if( $pdata->{relation} ) { |
190 | $relation = $pdata->{relation}->{transposed} |
191 | ? 'transposition' : $pdata->{relation}->{type}; |
192 | } else { |
193 | $relation = 'unknown'; |
194 | if( !$robj ) { |
195 | say "Unknown relation on missing reading object $rid at rank $rank"; |
196 | } elsif( !$pdata ) { |
197 | say "Unknown relation on missing parent object for $rid at rank $rank"; |
198 | } else { |
199 | push( @$unknown, [ $pdata->{label}, $robj->id, $robj->text, $type ] ); |
200 | } |
201 | } |
6ad2c61a |
202 | push( @rels, $relation ); |
aee39255 |
203 | } |
6ad2c61a |
204 | return @rels; |
aee39255 |
205 | } |
206 | |