Commit | Line | Data |
a23b3715 |
1 | #!/usr/bin/env perl |
2 | |
3 | use feature 'say'; |
4 | use lib 'lib'; |
5 | use strict; |
6 | use warnings; |
7 | use Text::CSV_XS; |
8 | use Text::Tradition::Analysis qw/ run_analysis /; |
9 | use Text::Tradition::Directory; |
10 | |
11 | binmode STDOUT, ':utf8'; |
12 | binmode STDERR, ':utf8'; |
13 | |
14 | my $dir = Text::Tradition::Directory->new( |
15 | 'dsn' => 'dbi:SQLite:dbname=stemmaweb/db/traditions.db', |
16 | ); |
3ec8b047 |
17 | my $calcdsn = 'dbi:SQLite:dbname=db/graphs.db'; |
a23b3715 |
18 | |
19 | my $scope = $dir->new_scope(); |
20 | my $lookfor = shift @ARGV || ''; |
ead7032e |
21 | my @collapse = @ARGV; |
a23b3715 |
22 | |
3ec8b047 |
23 | my @relation_types = grep { !$collapse{$_} } |
24 | qw/ orthographic spelling grammatical lexical transposition addition deletion |
25 | wordsimilar unknown /; |
a23b3715 |
26 | |
27 | my @resultfields = qw/ |
28 | text_name loc_total loc_totalvariant loc_genealogical loc_genvariant |
3ec8b047 |
29 | loc_conflict loc_conflictvariant loc_reverted loc_revertvariant |
30 | percent_genealogical percent_genvariant percent_genorrevert /; |
a23b3715 |
31 | map { push( @resultfields, "gen_$_", "gen_${_}_nonsense", "gen_${_}_ungramm" ) } |
32 | @relation_types; |
33 | map { push( @resultfields, "con_$_", "con_${_}_nonsense", "con_${_}_ungramm" ) } |
34 | @relation_types; |
3ec8b047 |
35 | map { push( @resultfields, "rev_$_", "rev_${_}_nonsense", "rev_${_}_ungramm" ) } |
36 | @relation_types; |
a23b3715 |
37 | map { push( @resultfields, "percent_gen_$_", "percent_${_}_gen" ) } @relation_types; |
3ec8b047 |
38 | map { push( @resultfields, "percent_con_$_", "percent_rev_$_" ) } @relation_types; |
a23b3715 |
39 | |
40 | my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } ); |
41 | open my $fh, ">:encoding(UTF-8)", "analysis.csv" or die "analysis.csv: $!"; |
42 | if( $csv->combine( @resultfields ) ) { |
43 | say $fh $csv->string; |
44 | } else { |
45 | say "combine() failed on argument: " . $csv->error_input; |
46 | } |
47 | |
48 | foreach my $tinfo( $dir->traditionlist ) { |
49 | next if $tinfo->{'name'} eq 'xxxxx'; |
50 | next unless $tinfo->{'id'} eq $lookfor |
51 | || $tinfo->{'name'} =~ /$lookfor/; |
52 | my $tradition = $dir->lookup( $tinfo->{'id'} ); |
53 | say "Analyzing tradition " . $tradition->name; |
54 | my %datahash; |
55 | # Initialize everything with zeroes |
56 | map { $datahash{$_} = 0 } @resultfields; |
57 | # Put in the real text ID and name |
58 | $datahash{text_id} = $tinfo->{'id'}; |
59 | $datahash{text_name} = $tradition->name; |
60 | |
61 | # Run the analysis for each row in @rows |
ead7032e |
62 | my %opts = ( |
63 | exclude_type1 => 1, |
64 | merge_types => [ 'punctuation' ], |
65 | calcdsn => $calcdsn ); |
66 | if( @collapse ) { |
67 | push( @{$opts{merge_types}}, @collapse ); |
a23b3715 |
68 | } |
69 | |
70 | my $result = run_analysis( $tradition, %opts ); |
71 | $datahash{loc_total} = $result->{variant_count}; |
72 | $datahash{loc_genealogical} = $result->{genealogical_count}; |
a23b3715 |
73 | $datahash{loc_conflictvariant} = $result->{conflict_count}; |
3ec8b047 |
74 | $datahash{loc_revertvariant} = $result->{reversion_count}; |
75 | # Get the number of total and genealogical variants, and number of |
76 | # conflicted/reverted locations, as we go below. |
a23b3715 |
77 | my $totalvariant = 0; |
78 | my $genvariant = 0; |
3ec8b047 |
79 | my $conflictloc = 0; |
80 | my $revertloc = 0; |
a23b3715 |
81 | my @unknown; |
82 | foreach my $loc ( @{$result->{variants}} ) { |
83 | # A transition is the relationship type between parent and child. |
84 | # Find each genealogical transition and get the relationship type (if any) |
85 | # Find each non-genealogical transition and get the relationship type (if any) |
3ec8b047 |
86 | my( $loc_conflict, $loc_reversion ); |
a23b3715 |
87 | foreach my $rdghash( @{$loc->{readings}} ) { |
88 | # Weed out singletons |
89 | my @roots = @{$rdghash->{independent_occurrence}}; |
90 | next if @roots == 1 && !$rdghash->{'followed'} && !$rdghash->{'not_followed'} |
91 | && !$rdghash->{'follow_unknown'}; |
a23b3715 |
92 | my $rdg = $tradition->collation->reading( $rdghash->{readingid} ); |
3ec8b047 |
93 | my $type; |
94 | if( $rdghash->{'is_conflict'} ) { |
95 | $type = 'conflict'; |
96 | $loc_conflict = 1; |
97 | } elsif( $rdghash->{'is_reverted'} ) { |
98 | $type = 'reverted'; |
99 | $loc_reversion = 1; |
100 | } elsif( @roots == 1 ) { |
101 | $type = 'genealogical'; |
102 | $genvariant++; |
103 | } else { |
104 | warn "Reading $rdg neither conflict, genealogical, nor reverted. What?"; |
105 | $type = 'ERROR'; |
106 | } |
107 | my $typekey = substr( $type, 0, 3 ) . '_'; |
108 | |
109 | # Add relation stats for reading parents. If the reading is reverted, |
110 | # treat it as genealogical for the parent. |
111 | _add_reading_relations( $rdghash->{'readingid'}, $loc->{'id'}, $rdg, |
112 | ( $type eq 'reverted' ? 'genealogical' : $type ), |
be590045 |
113 | $rdghash->{'source_parents'}, \%datahash, \@unknown ); |
3ec8b047 |
114 | # Add relation stats for reading reversions if they exist. |
115 | if( $type eq 'reverted' ) { |
116 | # Get relationship between reverted readings and their non-matching |
117 | # parents. |
118 | _add_reading_relations( $rdghash->{'readingid'}, $loc->{'id'}, $rdg, |
119 | $type, $rdghash->{'reversion_parents'}, \%datahash, \@unknown ); |
a23b3715 |
120 | } |
3ec8b047 |
121 | |
a23b3715 |
122 | $totalvariant++; |
3ec8b047 |
123 | } |
124 | if( $loc_conflict ) { |
125 | $conflictloc++; |
126 | } elsif( $loc_reversion ) { |
127 | $revertloc++; |
a23b3715 |
128 | } |
129 | } |
130 | |
131 | # Add in the sums for the whole location |
132 | $datahash{'loc_genvariant'} = $genvariant; |
133 | $datahash{'loc_totalvariant'} = $totalvariant; |
3ec8b047 |
134 | $datahash{'loc_conflict'} = $conflictloc; |
135 | $datahash{'loc_reverted'} = $revertloc; |
a23b3715 |
136 | $datahash{'percent_genealogical'} = $datahash{loc_genealogical} / $datahash{loc_total}; |
137 | $datahash{'percent_genvariant'} = $genvariant / $totalvariant; |
3ec8b047 |
138 | $datahash{'percent_genorrevert'} = ( $genvariant + $datahash{loc_revertvariant} ) / $totalvariant; |
a23b3715 |
139 | foreach my $type ( @relation_types ) { |
140 | $datahash{"percent_gen_$type"} = $datahash{"gen_$type"} / $totalvariant; |
141 | $datahash{"percent_con_$type"} = $datahash{"con_$type"} / $totalvariant; |
142 | $datahash{"percent_${type}_gen"} = |
143 | $datahash{"gen_$type"} + $datahash{"con_$type"} == 0 ? 0 : |
144 | $datahash{"gen_$type"} / ( $datahash{"gen_$type"} + $datahash{"con_$type"} ); |
145 | } |
146 | |
147 | # Write them out to CSV. |
148 | my @csvalues = map { $datahash{$_} } @resultfields; |
149 | if( $csv->combine( @csvalues ) ) { |
150 | say $fh $csv->string; |
151 | } else { |
152 | say "combine() failed on argument: " . $csv->error_input; |
153 | } |
154 | map { say sprintf( "Unknown transition %s -> %s (%s) :%s", @$_ ) } @unknown; |
155 | } |
156 | |
157 | close $fh; |
3ec8b047 |
158 | |
159 | sub _add_reading_relations { |
160 | my( $rid, $rank, $robj, $type, $parenthash, $datahash, $unknown ) = @_; |
161 | foreach my $p ( keys %$parenthash ) { |
162 | my $pdata = $parenthash->{$p}; |
163 | my $relation; |
164 | if( $pdata->{relation} ) { |
165 | $relation = $pdata->{relation}->{type}; |
166 | } else { |
167 | $relation = 'unknown'; |
168 | if( !$robj ) { |
169 | say "Unknown relation on missing reading object $rid at rank $rank"; |
170 | } elsif( !$pdata ) { |
171 | say "Unknown relation on missing parent object for $rid at rank $rank"; |
172 | } else { |
173 | push( @$unknown, [ $pdata->{label}, $robj->id, $robj->text, $type ] ); |
174 | } |
175 | } |
176 | my $typekey = substr( $type, 0, 3 ) . "_$relation"; |
177 | $datahash->{$typekey}++; |
178 | ## TODO distinguish parent-bad vs. rdg-bad |
179 | if( $robj && $robj->grammar_invalid ) { |
180 | $datahash->{$typekey.'_ungramm'} = 1; |
181 | } elsif( $robj && $robj->is_nonsense ) { |
182 | $datahash->{$typekey.'_nonsense'} = 1; |
183 | } |
184 | } |
185 | } |