analysis script for upcoming presentation
[scpubgit/stemmatology.git] / script / analyze.pl
1 #!/usr/bin/env perl
2
3 use feature 'say';
4 use lib 'lib';
5 use strict;
6 use warnings;
7 use Text::CSV_XS;
8 use Text::Tradition::Analysis qw/ run_analysis /;
9 use Text::Tradition::Directory;
10
11 binmode STDOUT, ':utf8';
12 binmode STDERR, ':utf8';
13
14 my $dir = Text::Tradition::Directory->new(
15     'dsn' => 'dbi:SQLite:dbname=stemmaweb/db/traditions.db',
16     );
17
18 my $scope = $dir->new_scope();
19 my $lookfor = shift @ARGV || '';
20 my $collapse = [ @ARGV ];
21
22 my @relation_types = qw/ orthographic spelling grammatical lexical
23         transposition addition deletion wordsimilar unknown /;
24
25 my @resultfields = qw/
26         text_name loc_total loc_totalvariant loc_genealogical loc_genvariant 
27         loc_conflict loc_conflictvariant percent_genealogical percent_genvariant /;
28 map { push( @resultfields, "gen_$_", "gen_${_}_nonsense", "gen_${_}_ungramm" ) }
29         @relation_types;
30 map { push( @resultfields, "con_$_", "con_${_}_nonsense", "con_${_}_ungramm" ) }
31         @relation_types;
32 map { push( @resultfields, "percent_gen_$_", "percent_${_}_gen" ) } @relation_types;
33 map { push( @resultfields, "percent_con_$_" ) } @relation_types;
34         
35 my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } );
36 open my $fh, ">:encoding(UTF-8)", "analysis.csv" or die "analysis.csv: $!";
37 if( $csv->combine( @resultfields ) ) {
38         say $fh $csv->string;
39 } else {
40         say "combine() failed on argument: " . $csv->error_input;
41 }
42
43 foreach my $tinfo( $dir->traditionlist ) {
44         next if $tinfo->{'name'} eq 'xxxxx';
45         next unless $tinfo->{'id'} eq $lookfor
46                 || $tinfo->{'name'} =~ /$lookfor/;
47         my $tradition = $dir->lookup( $tinfo->{'id'} );
48         say "Analyzing tradition " . $tradition->name;
49         my %datahash;
50         # Initialize everything with zeroes
51         map { $datahash{$_} = 0 } @resultfields;
52         # Put in the real text ID and name
53         $datahash{text_id} = $tinfo->{'id'};
54         $datahash{text_name} = $tradition->name;
55         
56         # Run the analysis for each row in @rows
57         my %opts = ( exclude_type1 => 1 );
58         if( @$collapse ) {
59                 $opts{merge_types} = $collapse;
60         }
61         
62         my $result = run_analysis( $tradition, %opts );
63         $datahash{loc_total} = $result->{variant_count};
64         $datahash{loc_genealogical} = $result->{genealogical_count};
65         $datahash{loc_conflict} = $result->{variant_count} - $result->{genealogical_count};
66         $datahash{loc_conflictvariant} = $result->{conflict_count};
67         # Get the number of total and genealogical variants as we go below.
68         my $totalvariant = 0;
69         my $genvariant = 0;
70         my @unknown;
71         foreach my $loc ( @{$result->{variants}} ) {
72                 # A transition is the relationship type between parent and child.
73                 # Find each genealogical transition and get the relationship type (if any)
74                 # Find each non-genealogical transition and get the relationship type (if any)
75                 # Someday, look for reversions
76                 foreach my $rdghash( @{$loc->{readings}} ) {
77                         # Weed out singletons
78                         my @roots = @{$rdghash->{independent_occurrence}};
79                         next if @roots == 1 && !$rdghash->{'followed'} && !$rdghash->{'not_followed'}
80                                 && !$rdghash->{'follow_unknown'};
81                         # TODO Weed out punctuation
82                         my $rdg = $tradition->collation->reading( $rdghash->{readingid} );
83                         my $typekey = @roots == 1 ? 'gen_' : 'con_';
84                         foreach my $p ( keys %{$rdghash->{reading_parents}} ) {
85                                 my $pdata = $rdghash->{reading_parents}->{$p};
86                                 my $relation;
87                                 if( $pdata->{relation} ) {
88                                         $relation = $pdata->{relation}->{type};
89                                 } else {
90                                         $relation = 'unknown';
91                                         if( !$rdg ) {
92                                                 say "Unknown relation on missing reading object " 
93                                                         . $rdghash->{readingid} . " at rank " . $loc->{id};
94                                         } elsif( !$pdata ) {
95                                                 say "Unknown relation on missing parent object for " 
96                                                         . $rdghash->{readingid} . " at rank " . $loc->{id};
97                                         
98                                         } else {
99                                                 push( @unknown, [ $pdata->{label}, $rdg->id, $rdg->text, 
100                                                         ( @roots == 1 ? 'genealogical' : 'conflicting' ) ] );
101                                         }
102                                 }
103                                 $typekey .= $relation;
104                                 $datahash{$typekey}++;
105                                 ## TODO distinguish parent-bad vs. rdg-bad
106                                 if( $rdg && $rdg->grammar_invalid ) {
107                                         $datahash{$typekey.'_ungramm'} = 1;
108                                 } elsif( $rdg && $rdg->is_nonsense ) {
109                                         $datahash{$typekey.'_nonsense'} = 1;
110                                 }
111                         }
112                         $totalvariant++;
113                         $genvariant++ if @roots == 1;
114                 }
115         }
116         
117         # Add in the sums for the whole location
118         $datahash{'loc_genvariant'} = $genvariant;      
119         $datahash{'loc_totalvariant'} = $totalvariant;
120         $datahash{'percent_genealogical'} = $datahash{loc_genealogical} / $datahash{loc_total};
121         $datahash{'percent_genvariant'} = $genvariant / $totalvariant;
122         foreach my $type ( @relation_types ) {
123                 $datahash{"percent_gen_$type"} = $datahash{"gen_$type"} / $totalvariant;
124                 $datahash{"percent_con_$type"} = $datahash{"con_$type"} / $totalvariant;
125                 $datahash{"percent_${type}_gen"} = 
126                         $datahash{"gen_$type"} + $datahash{"con_$type"} == 0 ? 0 :
127                         $datahash{"gen_$type"} / ( $datahash{"gen_$type"} + $datahash{"con_$type"} );
128         }
129         
130         # Write them out to CSV.
131         my @csvalues = map { $datahash{$_} } @resultfields;
132         if( $csv->combine( @csvalues ) ) {
133                 say $fh $csv->string;
134         } else {
135                 say "combine() failed on argument: " . $csv->error_input;
136         }
137         map { say sprintf( "Unknown transition %s -> %s (%s) :%s", @$_ ) } @unknown;
138 }
139
140 close $fh;