Commit | Line | Data |
a23b3715 |
1 | #!/usr/bin/env perl |
2 | |
3 | use feature 'say'; |
4 | use lib 'lib'; |
5 | use strict; |
6 | use warnings; |
7 | use Text::CSV_XS; |
8 | use Text::Tradition::Analysis qw/ run_analysis /; |
9 | use Text::Tradition::Directory; |
10 | |
11 | binmode STDOUT, ':utf8'; |
12 | binmode STDERR, ':utf8'; |
13 | |
14 | my $dir = Text::Tradition::Directory->new( |
15 | 'dsn' => 'dbi:SQLite:dbname=stemmaweb/db/traditions.db', |
16 | ); |
3ec8b047 |
17 | my $calcdsn = 'dbi:SQLite:dbname=db/graphs.db'; |
a23b3715 |
18 | |
19 | my $scope = $dir->new_scope(); |
20 | my $lookfor = shift @ARGV || ''; |
3ec8b047 |
21 | my %collapse; |
22 | map { $collapse{$_} = 1 } @ARGV; |
a23b3715 |
23 | |
3ec8b047 |
24 | my @relation_types = grep { !$collapse{$_} } |
25 | qw/ orthographic spelling grammatical lexical transposition addition deletion |
26 | wordsimilar unknown /; |
a23b3715 |
27 | |
28 | my @resultfields = qw/ |
29 | text_name loc_total loc_totalvariant loc_genealogical loc_genvariant |
3ec8b047 |
30 | loc_conflict loc_conflictvariant loc_reverted loc_revertvariant |
31 | percent_genealogical percent_genvariant percent_genorrevert /; |
a23b3715 |
32 | map { push( @resultfields, "gen_$_", "gen_${_}_nonsense", "gen_${_}_ungramm" ) } |
33 | @relation_types; |
34 | map { push( @resultfields, "con_$_", "con_${_}_nonsense", "con_${_}_ungramm" ) } |
35 | @relation_types; |
3ec8b047 |
36 | map { push( @resultfields, "rev_$_", "rev_${_}_nonsense", "rev_${_}_ungramm" ) } |
37 | @relation_types; |
a23b3715 |
38 | map { push( @resultfields, "percent_gen_$_", "percent_${_}_gen" ) } @relation_types; |
3ec8b047 |
39 | map { push( @resultfields, "percent_con_$_", "percent_rev_$_" ) } @relation_types; |
a23b3715 |
40 | |
41 | my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } ); |
42 | open my $fh, ">:encoding(UTF-8)", "analysis.csv" or die "analysis.csv: $!"; |
43 | if( $csv->combine( @resultfields ) ) { |
44 | say $fh $csv->string; |
45 | } else { |
46 | say "combine() failed on argument: " . $csv->error_input; |
47 | } |
48 | |
49 | foreach my $tinfo( $dir->traditionlist ) { |
50 | next if $tinfo->{'name'} eq 'xxxxx'; |
51 | next unless $tinfo->{'id'} eq $lookfor |
52 | || $tinfo->{'name'} =~ /$lookfor/; |
53 | my $tradition = $dir->lookup( $tinfo->{'id'} ); |
54 | say "Analyzing tradition " . $tradition->name; |
55 | my %datahash; |
56 | # Initialize everything with zeroes |
57 | map { $datahash{$_} = 0 } @resultfields; |
58 | # Put in the real text ID and name |
59 | $datahash{text_id} = $tinfo->{'id'}; |
60 | $datahash{text_name} = $tradition->name; |
61 | |
62 | # Run the analysis for each row in @rows |
3ec8b047 |
63 | my %opts = ( exclude_type1 => 1, calcdsn => $calcdsn ); |
64 | if( keys %collapse ) { |
65 | $opts{merge_types} = [ keys %collapse ]; |
a23b3715 |
66 | } |
67 | |
68 | my $result = run_analysis( $tradition, %opts ); |
69 | $datahash{loc_total} = $result->{variant_count}; |
70 | $datahash{loc_genealogical} = $result->{genealogical_count}; |
a23b3715 |
71 | $datahash{loc_conflictvariant} = $result->{conflict_count}; |
3ec8b047 |
72 | $datahash{loc_revertvariant} = $result->{reversion_count}; |
73 | # Get the number of total and genealogical variants, and number of |
74 | # conflicted/reverted locations, as we go below. |
a23b3715 |
75 | my $totalvariant = 0; |
76 | my $genvariant = 0; |
3ec8b047 |
77 | my $conflictloc = 0; |
78 | my $revertloc = 0; |
a23b3715 |
79 | my @unknown; |
80 | foreach my $loc ( @{$result->{variants}} ) { |
81 | # A transition is the relationship type between parent and child. |
82 | # Find each genealogical transition and get the relationship type (if any) |
83 | # Find each non-genealogical transition and get the relationship type (if any) |
3ec8b047 |
84 | my( $loc_conflict, $loc_reversion ); |
a23b3715 |
85 | foreach my $rdghash( @{$loc->{readings}} ) { |
86 | # Weed out singletons |
87 | my @roots = @{$rdghash->{independent_occurrence}}; |
88 | next if @roots == 1 && !$rdghash->{'followed'} && !$rdghash->{'not_followed'} |
89 | && !$rdghash->{'follow_unknown'}; |
90 | # TODO Weed out punctuation |
91 | my $rdg = $tradition->collation->reading( $rdghash->{readingid} ); |
3ec8b047 |
92 | my $type; |
93 | if( $rdghash->{'is_conflict'} ) { |
94 | $type = 'conflict'; |
95 | $loc_conflict = 1; |
96 | } elsif( $rdghash->{'is_reverted'} ) { |
97 | $type = 'reverted'; |
98 | $loc_reversion = 1; |
99 | } elsif( @roots == 1 ) { |
100 | $type = 'genealogical'; |
101 | $genvariant++; |
102 | } else { |
103 | warn "Reading $rdg neither conflict, genealogical, nor reverted. What?"; |
104 | $type = 'ERROR'; |
105 | } |
106 | my $typekey = substr( $type, 0, 3 ) . '_'; |
107 | |
108 | # Add relation stats for reading parents. If the reading is reverted, |
109 | # treat it as genealogical for the parent. |
110 | _add_reading_relations( $rdghash->{'readingid'}, $loc->{'id'}, $rdg, |
111 | ( $type eq 'reverted' ? 'genealogical' : $type ), |
be590045 |
112 | $rdghash->{'source_parents'}, \%datahash, \@unknown ); |
3ec8b047 |
113 | # Add relation stats for reading reversions if they exist. |
114 | if( $type eq 'reverted' ) { |
115 | # Get relationship between reverted readings and their non-matching |
116 | # parents. |
117 | _add_reading_relations( $rdghash->{'readingid'}, $loc->{'id'}, $rdg, |
118 | $type, $rdghash->{'reversion_parents'}, \%datahash, \@unknown ); |
a23b3715 |
119 | } |
3ec8b047 |
120 | |
a23b3715 |
121 | $totalvariant++; |
3ec8b047 |
122 | } |
123 | if( $loc_conflict ) { |
124 | $conflictloc++; |
125 | } elsif( $loc_reversion ) { |
126 | $revertloc++; |
a23b3715 |
127 | } |
128 | } |
129 | |
130 | # Add in the sums for the whole location |
131 | $datahash{'loc_genvariant'} = $genvariant; |
132 | $datahash{'loc_totalvariant'} = $totalvariant; |
3ec8b047 |
133 | $datahash{'loc_conflict'} = $conflictloc; |
134 | $datahash{'loc_reverted'} = $revertloc; |
a23b3715 |
135 | $datahash{'percent_genealogical'} = $datahash{loc_genealogical} / $datahash{loc_total}; |
136 | $datahash{'percent_genvariant'} = $genvariant / $totalvariant; |
3ec8b047 |
137 | $datahash{'percent_genorrevert'} = ( $genvariant + $datahash{loc_revertvariant} ) / $totalvariant; |
a23b3715 |
138 | foreach my $type ( @relation_types ) { |
139 | $datahash{"percent_gen_$type"} = $datahash{"gen_$type"} / $totalvariant; |
140 | $datahash{"percent_con_$type"} = $datahash{"con_$type"} / $totalvariant; |
141 | $datahash{"percent_${type}_gen"} = |
142 | $datahash{"gen_$type"} + $datahash{"con_$type"} == 0 ? 0 : |
143 | $datahash{"gen_$type"} / ( $datahash{"gen_$type"} + $datahash{"con_$type"} ); |
144 | } |
145 | |
146 | # Write them out to CSV. |
147 | my @csvalues = map { $datahash{$_} } @resultfields; |
148 | if( $csv->combine( @csvalues ) ) { |
149 | say $fh $csv->string; |
150 | } else { |
151 | say "combine() failed on argument: " . $csv->error_input; |
152 | } |
153 | map { say sprintf( "Unknown transition %s -> %s (%s) :%s", @$_ ) } @unknown; |
154 | } |
155 | |
156 | close $fh; |
3ec8b047 |
157 | |
158 | sub _add_reading_relations { |
159 | my( $rid, $rank, $robj, $type, $parenthash, $datahash, $unknown ) = @_; |
160 | foreach my $p ( keys %$parenthash ) { |
161 | my $pdata = $parenthash->{$p}; |
162 | my $relation; |
163 | if( $pdata->{relation} ) { |
164 | $relation = $pdata->{relation}->{type}; |
165 | } else { |
166 | $relation = 'unknown'; |
167 | if( !$robj ) { |
168 | say "Unknown relation on missing reading object $rid at rank $rank"; |
169 | } elsif( !$pdata ) { |
170 | say "Unknown relation on missing parent object for $rid at rank $rank"; |
171 | } else { |
172 | push( @$unknown, [ $pdata->{label}, $robj->id, $robj->text, $type ] ); |
173 | } |
174 | } |
175 | my $typekey = substr( $type, 0, 3 ) . "_$relation"; |
176 | $datahash->{$typekey}++; |
177 | ## TODO distinguish parent-bad vs. rdg-bad |
178 | if( $robj && $robj->grammar_invalid ) { |
179 | $datahash->{$typekey.'_ungramm'} = 1; |
180 | } elsif( $robj && $robj->is_nonsense ) { |
181 | $datahash->{$typekey.'_nonsense'} = 1; |
182 | } |
183 | } |
184 | } |