analysis script for upcoming presentation
[scpubgit/stemmatology.git] / script / analyze.pl
CommitLineData
a23b3715 1#!/usr/bin/env perl
2
3use feature 'say';
4use lib 'lib';
5use strict;
6use warnings;
7use Text::CSV_XS;
8use Text::Tradition::Analysis qw/ run_analysis /;
9use Text::Tradition::Directory;
10
11binmode STDOUT, ':utf8';
12binmode STDERR, ':utf8';
13
14my $dir = Text::Tradition::Directory->new(
15 'dsn' => 'dbi:SQLite:dbname=stemmaweb/db/traditions.db',
16 );
17
18my $scope = $dir->new_scope();
19my $lookfor = shift @ARGV || '';
20my $collapse = [ @ARGV ];
21
22my @relation_types = qw/ orthographic spelling grammatical lexical
23 transposition addition deletion wordsimilar unknown /;
24
25my @resultfields = qw/
26 text_name loc_total loc_totalvariant loc_genealogical loc_genvariant
27 loc_conflict loc_conflictvariant percent_genealogical percent_genvariant /;
28map { push( @resultfields, "gen_$_", "gen_${_}_nonsense", "gen_${_}_ungramm" ) }
29 @relation_types;
30map { push( @resultfields, "con_$_", "con_${_}_nonsense", "con_${_}_ungramm" ) }
31 @relation_types;
32map { push( @resultfields, "percent_gen_$_", "percent_${_}_gen" ) } @relation_types;
33map { push( @resultfields, "percent_con_$_" ) } @relation_types;
34
35my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } );
36open my $fh, ">:encoding(UTF-8)", "analysis.csv" or die "analysis.csv: $!";
37if( $csv->combine( @resultfields ) ) {
38 say $fh $csv->string;
39} else {
40 say "combine() failed on argument: " . $csv->error_input;
41}
42
43foreach my $tinfo( $dir->traditionlist ) {
44 next if $tinfo->{'name'} eq 'xxxxx';
45 next unless $tinfo->{'id'} eq $lookfor
46 || $tinfo->{'name'} =~ /$lookfor/;
47 my $tradition = $dir->lookup( $tinfo->{'id'} );
48 say "Analyzing tradition " . $tradition->name;
49 my %datahash;
50 # Initialize everything with zeroes
51 map { $datahash{$_} = 0 } @resultfields;
52 # Put in the real text ID and name
53 $datahash{text_id} = $tinfo->{'id'};
54 $datahash{text_name} = $tradition->name;
55
56 # Run the analysis for each row in @rows
57 my %opts = ( exclude_type1 => 1 );
58 if( @$collapse ) {
59 $opts{merge_types} = $collapse;
60 }
61
62 my $result = run_analysis( $tradition, %opts );
63 $datahash{loc_total} = $result->{variant_count};
64 $datahash{loc_genealogical} = $result->{genealogical_count};
65 $datahash{loc_conflict} = $result->{variant_count} - $result->{genealogical_count};
66 $datahash{loc_conflictvariant} = $result->{conflict_count};
67 # Get the number of total and genealogical variants as we go below.
68 my $totalvariant = 0;
69 my $genvariant = 0;
70 my @unknown;
71 foreach my $loc ( @{$result->{variants}} ) {
72 # A transition is the relationship type between parent and child.
73 # Find each genealogical transition and get the relationship type (if any)
74 # Find each non-genealogical transition and get the relationship type (if any)
75 # Someday, look for reversions
76 foreach my $rdghash( @{$loc->{readings}} ) {
77 # Weed out singletons
78 my @roots = @{$rdghash->{independent_occurrence}};
79 next if @roots == 1 && !$rdghash->{'followed'} && !$rdghash->{'not_followed'}
80 && !$rdghash->{'follow_unknown'};
81 # TODO Weed out punctuation
82 my $rdg = $tradition->collation->reading( $rdghash->{readingid} );
83 my $typekey = @roots == 1 ? 'gen_' : 'con_';
84 foreach my $p ( keys %{$rdghash->{reading_parents}} ) {
85 my $pdata = $rdghash->{reading_parents}->{$p};
86 my $relation;
87 if( $pdata->{relation} ) {
88 $relation = $pdata->{relation}->{type};
89 } else {
90 $relation = 'unknown';
91 if( !$rdg ) {
92 say "Unknown relation on missing reading object "
93 . $rdghash->{readingid} . " at rank " . $loc->{id};
94 } elsif( !$pdata ) {
95 say "Unknown relation on missing parent object for "
96 . $rdghash->{readingid} . " at rank " . $loc->{id};
97
98 } else {
99 push( @unknown, [ $pdata->{label}, $rdg->id, $rdg->text,
100 ( @roots == 1 ? 'genealogical' : 'conflicting' ) ] );
101 }
102 }
103 $typekey .= $relation;
104 $datahash{$typekey}++;
105 ## TODO distinguish parent-bad vs. rdg-bad
106 if( $rdg && $rdg->grammar_invalid ) {
107 $datahash{$typekey.'_ungramm'} = 1;
108 } elsif( $rdg && $rdg->is_nonsense ) {
109 $datahash{$typekey.'_nonsense'} = 1;
110 }
111 }
112 $totalvariant++;
113 $genvariant++ if @roots == 1;
114 }
115 }
116
117 # Add in the sums for the whole location
118 $datahash{'loc_genvariant'} = $genvariant;
119 $datahash{'loc_totalvariant'} = $totalvariant;
120 $datahash{'percent_genealogical'} = $datahash{loc_genealogical} / $datahash{loc_total};
121 $datahash{'percent_genvariant'} = $genvariant / $totalvariant;
122 foreach my $type ( @relation_types ) {
123 $datahash{"percent_gen_$type"} = $datahash{"gen_$type"} / $totalvariant;
124 $datahash{"percent_con_$type"} = $datahash{"con_$type"} / $totalvariant;
125 $datahash{"percent_${type}_gen"} =
126 $datahash{"gen_$type"} + $datahash{"con_$type"} == 0 ? 0 :
127 $datahash{"gen_$type"} / ( $datahash{"gen_$type"} + $datahash{"con_$type"} );
128 }
129
130 # Write them out to CSV.
131 my @csvalues = map { $datahash{$_} } @resultfields;
132 if( $csv->combine( @csvalues ) ) {
133 say $fh $csv->string;
134 } else {
135 say "combine() failed on argument: " . $csv->error_input;
136 }
137 map { say sprintf( "Unknown transition %s -> %s (%s) :%s", @$_ ) } @unknown;
138}
139
140close $fh;