fix stemma test
[scpubgit/stemmatology.git] / script / analyze.pl
CommitLineData
a23b3715 1#!/usr/bin/env perl
2
3use feature 'say';
4use lib 'lib';
5use strict;
6use warnings;
7use Text::CSV_XS;
8use Text::Tradition::Analysis qw/ run_analysis /;
9use Text::Tradition::Directory;
10
11binmode STDOUT, ':utf8';
12binmode STDERR, ':utf8';
13
14my $dir = Text::Tradition::Directory->new(
15 'dsn' => 'dbi:SQLite:dbname=stemmaweb/db/traditions.db',
16 );
3ec8b047 17my $calcdsn = 'dbi:SQLite:dbname=db/graphs.db';
a23b3715 18
19my $scope = $dir->new_scope();
20my $lookfor = shift @ARGV || '';
ead7032e 21my @collapse = @ARGV;
a23b3715 22
3ec8b047 23my @relation_types = grep { !$collapse{$_} }
24 qw/ orthographic spelling grammatical lexical transposition addition deletion
25 wordsimilar unknown /;
a23b3715 26
27my @resultfields = qw/
28 text_name loc_total loc_totalvariant loc_genealogical loc_genvariant
3ec8b047 29 loc_conflict loc_conflictvariant loc_reverted loc_revertvariant
30 percent_genealogical percent_genvariant percent_genorrevert /;
a23b3715 31map { push( @resultfields, "gen_$_", "gen_${_}_nonsense", "gen_${_}_ungramm" ) }
32 @relation_types;
33map { push( @resultfields, "con_$_", "con_${_}_nonsense", "con_${_}_ungramm" ) }
34 @relation_types;
3ec8b047 35map { push( @resultfields, "rev_$_", "rev_${_}_nonsense", "rev_${_}_ungramm" ) }
36 @relation_types;
a23b3715 37map { push( @resultfields, "percent_gen_$_", "percent_${_}_gen" ) } @relation_types;
3ec8b047 38map { push( @resultfields, "percent_con_$_", "percent_rev_$_" ) } @relation_types;
a23b3715 39
40my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } );
41open my $fh, ">:encoding(UTF-8)", "analysis.csv" or die "analysis.csv: $!";
42if( $csv->combine( @resultfields ) ) {
43 say $fh $csv->string;
44} else {
45 say "combine() failed on argument: " . $csv->error_input;
46}
47
48foreach my $tinfo( $dir->traditionlist ) {
49 next if $tinfo->{'name'} eq 'xxxxx';
50 next unless $tinfo->{'id'} eq $lookfor
51 || $tinfo->{'name'} =~ /$lookfor/;
52 my $tradition = $dir->lookup( $tinfo->{'id'} );
53 say "Analyzing tradition " . $tradition->name;
54 my %datahash;
55 # Initialize everything with zeroes
56 map { $datahash{$_} = 0 } @resultfields;
57 # Put in the real text ID and name
58 $datahash{text_id} = $tinfo->{'id'};
59 $datahash{text_name} = $tradition->name;
60
61 # Run the analysis for each row in @rows
ead7032e 62 my %opts = (
63 exclude_type1 => 1,
64 merge_types => [ 'punctuation' ],
65 calcdsn => $calcdsn );
66 if( @collapse ) {
67 push( @{$opts{merge_types}}, @collapse );
a23b3715 68 }
69
70 my $result = run_analysis( $tradition, %opts );
71 $datahash{loc_total} = $result->{variant_count};
72 $datahash{loc_genealogical} = $result->{genealogical_count};
a23b3715 73 $datahash{loc_conflictvariant} = $result->{conflict_count};
3ec8b047 74 $datahash{loc_revertvariant} = $result->{reversion_count};
75 # Get the number of total and genealogical variants, and number of
76 # conflicted/reverted locations, as we go below.
a23b3715 77 my $totalvariant = 0;
78 my $genvariant = 0;
3ec8b047 79 my $conflictloc = 0;
80 my $revertloc = 0;
a23b3715 81 my @unknown;
82 foreach my $loc ( @{$result->{variants}} ) {
83 # A transition is the relationship type between parent and child.
84 # Find each genealogical transition and get the relationship type (if any)
85 # Find each non-genealogical transition and get the relationship type (if any)
3ec8b047 86 my( $loc_conflict, $loc_reversion );
a23b3715 87 foreach my $rdghash( @{$loc->{readings}} ) {
88 # Weed out singletons
89 my @roots = @{$rdghash->{independent_occurrence}};
90 next if @roots == 1 && !$rdghash->{'followed'} && !$rdghash->{'not_followed'}
91 && !$rdghash->{'follow_unknown'};
a23b3715 92 my $rdg = $tradition->collation->reading( $rdghash->{readingid} );
3ec8b047 93 my $type;
94 if( $rdghash->{'is_conflict'} ) {
95 $type = 'conflict';
96 $loc_conflict = 1;
97 } elsif( $rdghash->{'is_reverted'} ) {
98 $type = 'reverted';
99 $loc_reversion = 1;
100 } elsif( @roots == 1 ) {
101 $type = 'genealogical';
102 $genvariant++;
103 } else {
104 warn "Reading $rdg neither conflict, genealogical, nor reverted. What?";
105 $type = 'ERROR';
106 }
107 my $typekey = substr( $type, 0, 3 ) . '_';
108
109 # Add relation stats for reading parents. If the reading is reverted,
110 # treat it as genealogical for the parent.
111 _add_reading_relations( $rdghash->{'readingid'}, $loc->{'id'}, $rdg,
112 ( $type eq 'reverted' ? 'genealogical' : $type ),
be590045 113 $rdghash->{'source_parents'}, \%datahash, \@unknown );
3ec8b047 114 # Add relation stats for reading reversions if they exist.
115 if( $type eq 'reverted' ) {
116 # Get relationship between reverted readings and their non-matching
117 # parents.
118 _add_reading_relations( $rdghash->{'readingid'}, $loc->{'id'}, $rdg,
119 $type, $rdghash->{'reversion_parents'}, \%datahash, \@unknown );
a23b3715 120 }
3ec8b047 121
a23b3715 122 $totalvariant++;
3ec8b047 123 }
124 if( $loc_conflict ) {
125 $conflictloc++;
126 } elsif( $loc_reversion ) {
127 $revertloc++;
a23b3715 128 }
129 }
130
131 # Add in the sums for the whole location
132 $datahash{'loc_genvariant'} = $genvariant;
133 $datahash{'loc_totalvariant'} = $totalvariant;
3ec8b047 134 $datahash{'loc_conflict'} = $conflictloc;
135 $datahash{'loc_reverted'} = $revertloc;
a23b3715 136 $datahash{'percent_genealogical'} = $datahash{loc_genealogical} / $datahash{loc_total};
137 $datahash{'percent_genvariant'} = $genvariant / $totalvariant;
3ec8b047 138 $datahash{'percent_genorrevert'} = ( $genvariant + $datahash{loc_revertvariant} ) / $totalvariant;
a23b3715 139 foreach my $type ( @relation_types ) {
140 $datahash{"percent_gen_$type"} = $datahash{"gen_$type"} / $totalvariant;
141 $datahash{"percent_con_$type"} = $datahash{"con_$type"} / $totalvariant;
142 $datahash{"percent_${type}_gen"} =
143 $datahash{"gen_$type"} + $datahash{"con_$type"} == 0 ? 0 :
144 $datahash{"gen_$type"} / ( $datahash{"gen_$type"} + $datahash{"con_$type"} );
145 }
146
147 # Write them out to CSV.
148 my @csvalues = map { $datahash{$_} } @resultfields;
149 if( $csv->combine( @csvalues ) ) {
150 say $fh $csv->string;
151 } else {
152 say "combine() failed on argument: " . $csv->error_input;
153 }
154 map { say sprintf( "Unknown transition %s -> %s (%s) :%s", @$_ ) } @unknown;
155}
156
157close $fh;
3ec8b047 158
159sub _add_reading_relations {
160 my( $rid, $rank, $robj, $type, $parenthash, $datahash, $unknown ) = @_;
161 foreach my $p ( keys %$parenthash ) {
162 my $pdata = $parenthash->{$p};
163 my $relation;
164 if( $pdata->{relation} ) {
165 $relation = $pdata->{relation}->{type};
166 } else {
167 $relation = 'unknown';
168 if( !$robj ) {
169 say "Unknown relation on missing reading object $rid at rank $rank";
170 } elsif( !$pdata ) {
171 say "Unknown relation on missing parent object for $rid at rank $rank";
172 } else {
173 push( @$unknown, [ $pdata->{label}, $robj->id, $robj->text, $type ] );
174 }
175 }
176 my $typekey = substr( $type, 0, 3 ) . "_$relation";
177 $datahash->{$typekey}++;
178 ## TODO distinguish parent-bad vs. rdg-bad
179 if( $robj && $robj->grammar_invalid ) {
180 $datahash->{$typekey.'_ungramm'} = 1;
181 } elsif( $robj && $robj->is_nonsense ) {
182 $datahash->{$typekey.'_nonsense'} = 1;
183 }
184 }
185}