Commit | Line | Data |
31573367 |
1 | #!/usr/bin/env perl |
2 | |
3 | use feature 'say'; |
4 | use lib 'lib'; |
5 | use strict; |
6 | use warnings; |
7 | use Getopt::Long; |
8 | use Text::CSV_XS; |
9 | use Text::Tradition::Analysis qw/ run_analysis /; |
10 | use Text::Tradition::Directory; |
11 | use TryCatch; |
12 | |
13 | binmode STDOUT, ':utf8'; |
14 | binmode STDERR, ':utf8'; |
15 | |
16 | my( $dsn, $dbuser, $dbpass ); |
17 | my $filename = 'analysis.csv'; |
18 | GetOptions( |
19 | 'dsn=s' => \$dsn, |
20 | 'u|user=s' => \$dbuser, |
21 | 'p|pass=s' => \$dbpass, |
22 | 'f|file=s' => \$filename |
23 | ); |
24 | |
25 | my %dbopts = ( dsn => $dsn ); |
26 | if( $dbuser || $dbpass ) { |
27 | $dbopts{extra_args} = { user => $dbuser, password => $dbpass } |
28 | } |
29 | |
30 | my $dir = Text::Tradition::Directory->new( %dbopts ); |
31 | my $scope = $dir->new_scope(); |
32 | my $lookfor = shift @ARGV || ''; |
33 | my %collapse; |
34 | if( @ARGV ) { |
35 | say "Merging relationship types @ARGV"; |
36 | map { $collapse{$_} = 1 } @ARGV; |
37 | } |
38 | |
39 | ## Set up the relationship types we will exclude in turn. False means "run |
40 | ## analysis with basic set of exclusions", i.e. orth/spelling/punct, and exclude |
41 | ## the variants in question later. True means "explicitly exclude this type too |
42 | ## at analysis time." |
43 | my %relation_types = ( |
44 | sameword => undef, |
45 | grammatical => 1, |
46 | lexical => 1, |
47 | uncertain => 1, |
48 | other => 1, |
49 | addition => undef, |
50 | deletion => undef |
51 | ); |
52 | |
53 | # Set up the things we want to calculate for each text |
54 | my @calcs = qw/ total genealogical excoincidental reverted exgenealogical coincidental /; |
55 | my @resultfields = ( 'text_name' ); |
56 | foreach my $rt ( keys %relation_types ) { |
57 | foreach my $cc ( @calcs ) { |
58 | push( @resultfields, sprintf( "%s_ex_%s", $cc, $rt ) ); |
59 | } |
60 | } |
61 | |
62 | my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } ); |
63 | open my $fh, ">:encoding(UTF-8)", $filename or die "$filename: $!"; |
64 | if( $csv->combine( @resultfields ) ) { |
65 | say $fh $csv->string; |
66 | } else { |
67 | say "combine() failed on argument: " . $csv->error_input; |
68 | } |
69 | |
70 | foreach my $tinfo( $dir->traditionlist ) { |
71 | next if $tinfo->{'name'} eq 'xxxxx'; |
72 | next if $tinfo->{'name'} =~ /158/; |
73 | next if $tinfo->{'name'} =~ /Heinrichi part/; |
74 | if( $lookfor ) { |
75 | next unless $tinfo->{'id'} eq $lookfor |
76 | || $tinfo->{'name'} =~ /$lookfor/; |
77 | } |
78 | my $tradition = $dir->lookup( $tinfo->{'id'} ); |
79 | next unless $tradition->stemma_count; |
80 | say "Analyzing tradition " . $tradition->name; |
81 | ## HACK |
82 | my $MAXRANK; |
83 | if( $tradition->name =~ /Chronicle/ ) { |
84 | $MAXRANK = $tradition->collation->reading('L1545')->rank; |
85 | } |
86 | my %datahash; |
87 | # Initialize everything with zeroes |
88 | map { $datahash{$_} = 0 } @resultfields; |
89 | # Put in the real text ID and name |
90 | $datahash{text_id} = $tinfo->{'id'}; |
91 | $datahash{text_name} = $tradition->name; |
92 | |
93 | # Run the analysis for each row in @rows |
94 | my $vanilla; # Store the run with no extra exclusions |
95 | my $result; |
96 | foreach my $type ( keys %relation_types ) { |
97 | say "...calculating on exclusion of $type"; |
98 | if( $relation_types{$type} ) { |
99 | $result = run_exclude( $tradition, $type ); |
100 | } elsif( !$vanilla ) { |
101 | $result = run_exclude( $tradition ); |
102 | $vanilla = $result; |
103 | } else { |
104 | $result = $vanilla; |
105 | } |
106 | |
107 | # Get the totals by location and by variant as we go. |
108 | my $totalvariant = 0; |
109 | my $singleton = 0; |
110 | my $genvariant = 0; |
111 | my $conflictvariant = 0; |
112 | my $revertvariant = 0; |
113 | my $msgd; # for the HACK |
114 | my @unknown; |
115 | foreach my $loc ( @{$result->{variants}} ) { |
116 | # A transition is the relationship type between parent and child. |
117 | # Find each genealogical transition |
118 | # Find each non-genealogical transition |
119 | if( exists $loc->{unsolved} ) { |
120 | # Not solved; remove it from the total. |
121 | say "Skipping unsolved location at " . $loc->{id}; |
122 | next; |
123 | } elsif( $MAXRANK && $loc->{id} > $MAXRANK ) { |
124 | # HACK until Chronicle tagging is done |
125 | say "Skipping ranks above $MAXRANK" |
126 | unless $msgd; |
127 | $msgd = 1; |
128 | next; |
129 | } |
130 | foreach my $rdghash( @{$loc->{readings}} ) { |
131 | # Weed out singletons |
132 | $totalvariant++; |
133 | my @roots = @{$rdghash->{independent_occurrence}}; |
134 | if( @roots == 1 && !$rdghash->{'followed'} && !$rdghash->{'not_followed'} |
135 | && !$rdghash->{'follow_unknown'} ) { |
136 | $singleton++; |
137 | next; |
138 | } |
139 | my $type; |
140 | if( $rdghash->{'is_conflict'} ) { |
141 | $type = 'conflict'; |
142 | $conflictvariant++; |
143 | } elsif( $rdghash->{'is_reverted'} ) { |
144 | $type = 'reverted'; |
145 | $revertvariant++; |
146 | } elsif( @roots == 1 ) { |
147 | $type = 'genealogical'; |
148 | $genvariant++; |
149 | } else { |
150 | warn 'Reading ' . $rdghash->{readingid} . ' neither conflict, genealogical, nor reverted. What?'; |
151 | $type = 'ERROR'; |
152 | } |
153 | } |
154 | } |
155 | |
156 | # Add in the sums for the whole location |
157 | $datahash{"total_$type"} = $totalvariant - $singleton; |
158 | $datahash{"genealogical_ex_$type"} = $genvariant; |
159 | $datahash{"reverted_ex_$type"} = $revertvariant; |
160 | $datahash{"coincidental_ex_$type"} = $conflictvariant; |
161 | $datahash{"excoincidental_ex_type"} = $genvariant + $revertvariant; |
162 | $datahash{"exgenealogical_ex_type"} = $conflictvariant + $revertvariant; |
163 | } |
164 | |
165 | # Write them out to CSV. |
166 | my @csvalues = map { $datahash{$_} } @resultfields; |
167 | if( $csv->combine( @csvalues ) ) { |
168 | say $fh $csv->string; |
169 | } else { |
170 | say "combine() failed on argument: " . $csv->error_input; |
171 | } |
172 | } |
173 | |
174 | close $fh; |
175 | |
176 | sub run_exclude { |
177 | my( $tradition, $type ) = @_; |
178 | my $merge = [ qw/ orthographic spelling punctuation / ]; |
179 | if( $type && $relation_types{$type} ) { |
180 | push( @$merge, $type ); |
181 | } |
182 | |
183 | my $result; |
184 | try { |
185 | $result = run_analysis( $tradition, exclude_type1 => 1, |
186 | merge_types => $merge ); |
187 | } catch { |
188 | say "Analysis run failed on tradition " . $tradition->name . ": @_"; |
189 | return; |
190 | } |
191 | return $result; |
192 | } |