small nomenclature rationalization; save reversion roots
[scpubgit/stemmatology.git] / script / analyze.pl
CommitLineData
a23b3715 1#!/usr/bin/env perl
2
3use feature 'say';
4use lib 'lib';
5use strict;
6use warnings;
7use Text::CSV_XS;
8use Text::Tradition::Analysis qw/ run_analysis /;
9use Text::Tradition::Directory;
10
11binmode STDOUT, ':utf8';
12binmode STDERR, ':utf8';
13
14my $dir = Text::Tradition::Directory->new(
15 'dsn' => 'dbi:SQLite:dbname=stemmaweb/db/traditions.db',
16 );
3ec8b047 17my $calcdsn = 'dbi:SQLite:dbname=db/graphs.db';
a23b3715 18
19my $scope = $dir->new_scope();
20my $lookfor = shift @ARGV || '';
3ec8b047 21my %collapse;
22map { $collapse{$_} = 1 } @ARGV;
a23b3715 23
3ec8b047 24my @relation_types = grep { !$collapse{$_} }
25 qw/ orthographic spelling grammatical lexical transposition addition deletion
26 wordsimilar unknown /;
a23b3715 27
28my @resultfields = qw/
29 text_name loc_total loc_totalvariant loc_genealogical loc_genvariant
3ec8b047 30 loc_conflict loc_conflictvariant loc_reverted loc_revertvariant
31 percent_genealogical percent_genvariant percent_genorrevert /;
a23b3715 32map { push( @resultfields, "gen_$_", "gen_${_}_nonsense", "gen_${_}_ungramm" ) }
33 @relation_types;
34map { push( @resultfields, "con_$_", "con_${_}_nonsense", "con_${_}_ungramm" ) }
35 @relation_types;
3ec8b047 36map { push( @resultfields, "rev_$_", "rev_${_}_nonsense", "rev_${_}_ungramm" ) }
37 @relation_types;
a23b3715 38map { push( @resultfields, "percent_gen_$_", "percent_${_}_gen" ) } @relation_types;
3ec8b047 39map { push( @resultfields, "percent_con_$_", "percent_rev_$_" ) } @relation_types;
a23b3715 40
41my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } );
42open my $fh, ">:encoding(UTF-8)", "analysis.csv" or die "analysis.csv: $!";
43if( $csv->combine( @resultfields ) ) {
44 say $fh $csv->string;
45} else {
46 say "combine() failed on argument: " . $csv->error_input;
47}
48
49foreach my $tinfo( $dir->traditionlist ) {
50 next if $tinfo->{'name'} eq 'xxxxx';
51 next unless $tinfo->{'id'} eq $lookfor
52 || $tinfo->{'name'} =~ /$lookfor/;
53 my $tradition = $dir->lookup( $tinfo->{'id'} );
54 say "Analyzing tradition " . $tradition->name;
55 my %datahash;
56 # Initialize everything with zeroes
57 map { $datahash{$_} = 0 } @resultfields;
58 # Put in the real text ID and name
59 $datahash{text_id} = $tinfo->{'id'};
60 $datahash{text_name} = $tradition->name;
61
62 # Run the analysis for each row in @rows
3ec8b047 63 my %opts = ( exclude_type1 => 1, calcdsn => $calcdsn );
64 if( keys %collapse ) {
65 $opts{merge_types} = [ keys %collapse ];
a23b3715 66 }
67
68 my $result = run_analysis( $tradition, %opts );
69 $datahash{loc_total} = $result->{variant_count};
70 $datahash{loc_genealogical} = $result->{genealogical_count};
a23b3715 71 $datahash{loc_conflictvariant} = $result->{conflict_count};
3ec8b047 72 $datahash{loc_revertvariant} = $result->{reversion_count};
73 # Get the number of total and genealogical variants, and number of
74 # conflicted/reverted locations, as we go below.
a23b3715 75 my $totalvariant = 0;
76 my $genvariant = 0;
3ec8b047 77 my $conflictloc = 0;
78 my $revertloc = 0;
a23b3715 79 my @unknown;
80 foreach my $loc ( @{$result->{variants}} ) {
81 # A transition is the relationship type between parent and child.
82 # Find each genealogical transition and get the relationship type (if any)
83 # Find each non-genealogical transition and get the relationship type (if any)
3ec8b047 84 my( $loc_conflict, $loc_reversion );
a23b3715 85 foreach my $rdghash( @{$loc->{readings}} ) {
86 # Weed out singletons
87 my @roots = @{$rdghash->{independent_occurrence}};
88 next if @roots == 1 && !$rdghash->{'followed'} && !$rdghash->{'not_followed'}
89 && !$rdghash->{'follow_unknown'};
90 # TODO Weed out punctuation
91 my $rdg = $tradition->collation->reading( $rdghash->{readingid} );
3ec8b047 92 my $type;
93 if( $rdghash->{'is_conflict'} ) {
94 $type = 'conflict';
95 $loc_conflict = 1;
96 } elsif( $rdghash->{'is_reverted'} ) {
97 $type = 'reverted';
98 $loc_reversion = 1;
99 } elsif( @roots == 1 ) {
100 $type = 'genealogical';
101 $genvariant++;
102 } else {
103 warn "Reading $rdg neither conflict, genealogical, nor reverted. What?";
104 $type = 'ERROR';
105 }
106 my $typekey = substr( $type, 0, 3 ) . '_';
107
108 # Add relation stats for reading parents. If the reading is reverted,
109 # treat it as genealogical for the parent.
110 _add_reading_relations( $rdghash->{'readingid'}, $loc->{'id'}, $rdg,
111 ( $type eq 'reverted' ? 'genealogical' : $type ),
be590045 112 $rdghash->{'source_parents'}, \%datahash, \@unknown );
3ec8b047 113 # Add relation stats for reading reversions if they exist.
114 if( $type eq 'reverted' ) {
115 # Get relationship between reverted readings and their non-matching
116 # parents.
117 _add_reading_relations( $rdghash->{'readingid'}, $loc->{'id'}, $rdg,
118 $type, $rdghash->{'reversion_parents'}, \%datahash, \@unknown );
a23b3715 119 }
3ec8b047 120
a23b3715 121 $totalvariant++;
3ec8b047 122 }
123 if( $loc_conflict ) {
124 $conflictloc++;
125 } elsif( $loc_reversion ) {
126 $revertloc++;
a23b3715 127 }
128 }
129
130 # Add in the sums for the whole location
131 $datahash{'loc_genvariant'} = $genvariant;
132 $datahash{'loc_totalvariant'} = $totalvariant;
3ec8b047 133 $datahash{'loc_conflict'} = $conflictloc;
134 $datahash{'loc_reverted'} = $revertloc;
a23b3715 135 $datahash{'percent_genealogical'} = $datahash{loc_genealogical} / $datahash{loc_total};
136 $datahash{'percent_genvariant'} = $genvariant / $totalvariant;
3ec8b047 137 $datahash{'percent_genorrevert'} = ( $genvariant + $datahash{loc_revertvariant} ) / $totalvariant;
a23b3715 138 foreach my $type ( @relation_types ) {
139 $datahash{"percent_gen_$type"} = $datahash{"gen_$type"} / $totalvariant;
140 $datahash{"percent_con_$type"} = $datahash{"con_$type"} / $totalvariant;
141 $datahash{"percent_${type}_gen"} =
142 $datahash{"gen_$type"} + $datahash{"con_$type"} == 0 ? 0 :
143 $datahash{"gen_$type"} / ( $datahash{"gen_$type"} + $datahash{"con_$type"} );
144 }
145
146 # Write them out to CSV.
147 my @csvalues = map { $datahash{$_} } @resultfields;
148 if( $csv->combine( @csvalues ) ) {
149 say $fh $csv->string;
150 } else {
151 say "combine() failed on argument: " . $csv->error_input;
152 }
153 map { say sprintf( "Unknown transition %s -> %s (%s) :%s", @$_ ) } @unknown;
154}
155
156close $fh;
3ec8b047 157
158sub _add_reading_relations {
159 my( $rid, $rank, $robj, $type, $parenthash, $datahash, $unknown ) = @_;
160 foreach my $p ( keys %$parenthash ) {
161 my $pdata = $parenthash->{$p};
162 my $relation;
163 if( $pdata->{relation} ) {
164 $relation = $pdata->{relation}->{type};
165 } else {
166 $relation = 'unknown';
167 if( !$robj ) {
168 say "Unknown relation on missing reading object $rid at rank $rank";
169 } elsif( !$pdata ) {
170 say "Unknown relation on missing parent object for $rid at rank $rank";
171 } else {
172 push( @$unknown, [ $pdata->{label}, $robj->id, $robj->text, $type ] );
173 }
174 }
175 my $typekey = substr( $type, 0, 3 ) . "_$relation";
176 $datahash->{$typekey}++;
177 ## TODO distinguish parent-bad vs. rdg-bad
178 if( $robj && $robj->grammar_invalid ) {
179 $datahash->{$typekey.'_ungramm'} = 1;
180 } elsif( $robj && $robj->is_nonsense ) {
181 $datahash->{$typekey.'_nonsense'} = 1;
182 }
183 }
184}