notice and save changes to graph name / stemma identifier (tla/stemmaweb#28)
[scpubgit/stemmatology.git] / analysis / script / analyze.pl
CommitLineData
a23b3715 1#!/usr/bin/env perl
2
3use feature 'say';
4use lib 'lib';
5use strict;
6use warnings;
5c2fdc10 7use Getopt::Long;
a23b3715 8use Text::CSV_XS;
9use Text::Tradition::Analysis qw/ run_analysis /;
10use Text::Tradition::Directory;
5c2fdc10 11use TryCatch;
a23b3715 12
13binmode STDOUT, ':utf8';
14binmode STDERR, ':utf8';
15
5c2fdc10 16my( $dsn, $dbuser, $dbpass );
3bce8f64 17my $filename = 'analysis.csv';
5c2fdc10 18GetOptions(
19 'dsn=s' => \$dsn,
20 'u|user=s' => \$dbuser,
21 'p|pass=s' => \$dbpass,
3bce8f64 22 'f|file=s' => \$filename
5c2fdc10 23);
24
25my %dbopts = ( dsn => $dsn );
26if( $dbuser || $dbpass ) {
27 $dbopts{extra_args} = { user => $dbuser, password => $dbpass }
28}
29
30my $dir = Text::Tradition::Directory->new( %dbopts );
31my $scope = $dir->new_scope();
a23b3715 32my $lookfor = shift @ARGV || '';
ee26c4d9 33my %collapse;
5c2fdc10 34if( @ARGV ) {
35 say "Merging relationship types @ARGV";
36 map { $collapse{$_} = 1 } @ARGV;
37}
a23b3715 38
3ec8b047 39my @relation_types = grep { !$collapse{$_} }
ee26c4d9 40 qw/ orthographic spelling grammatical lexical transposition repetition
3bce8f64 41 uncertain other addition deletion wordsimilar unknown source /;
a23b3715 42
43my @resultfields = qw/
3bce8f64 44 text_name loc_total loc_singleton multiparent loc_totalvariant loc_genealogical loc_genvariant
6a5f4aa7 45 loc_conflict loc_conflictvariant loc_reverted loc_revertvariant /;
ee26c4d9 46map { push( @resultfields, "gen_$_", "con_$_", "rev_$_" ) } @relation_types;
a23b3715 47
48my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } );
3bce8f64 49open my $fh, ">:encoding(UTF-8)", $filename or die "$filename: $!";
a23b3715 50if( $csv->combine( @resultfields ) ) {
51 say $fh $csv->string;
52} else {
53 say "combine() failed on argument: " . $csv->error_input;
54}
55
56foreach my $tinfo( $dir->traditionlist ) {
57 next if $tinfo->{'name'} eq 'xxxxx';
5c2fdc10 58 next if $tinfo->{'name'} =~ /158/;
59 next if $tinfo->{'name'} =~ /Heinrichi part/;
60 if( $lookfor ) {
61 next unless $tinfo->{'id'} eq $lookfor
62 || $tinfo->{'name'} =~ /$lookfor/;
63 }
a23b3715 64 my $tradition = $dir->lookup( $tinfo->{'id'} );
5c2fdc10 65 next unless $tradition->stemma_count;
a23b3715 66 say "Analyzing tradition " . $tradition->name;
5c2fdc10 67 ## HACK
68 my $MAXRANK;
69 if( $tradition->name =~ /Chronicle/ ) {
70 $MAXRANK = $tradition->collation->reading('L1545')->rank;
71 }
a23b3715 72 my %datahash;
73 # Initialize everything with zeroes
74 map { $datahash{$_} = 0 } @resultfields;
75 # Put in the real text ID and name
76 $datahash{text_id} = $tinfo->{'id'};
77 $datahash{text_name} = $tradition->name;
78
79 # Run the analysis for each row in @rows
ead7032e 80 my %opts = (
81 exclude_type1 => 1,
ee26c4d9 82 merge_types => [ 'punctuation' ] );
83 if( keys %collapse ) {
84 push( @{$opts{merge_types}}, keys %collapse );
a23b3715 85 }
86
5c2fdc10 87 my $result;
6a5f4aa7 88 try {
5c2fdc10 89 $result = run_analysis( $tradition, %opts );
6a5f4aa7 90 } catch {
91 say "Analysis run failed on tradition " . $tradition->name . ": @_";
92 next;
93 }
a23b3715 94 $datahash{loc_total} = $result->{variant_count};
3bce8f64 95 $datahash{multiparent} = 0;
6a5f4aa7 96 #$datahash{loc_genealogical} = $result->{genealogical_count};
97 #$datahash{loc_conflictvariant} = $result->{conflict_count};
98 #$datahash{loc_revertvariant} = $result->{reversion_count};
99 # Get the totals by location and by variant as we go.
a23b3715 100 my $totalvariant = 0;
6a5f4aa7 101 my $singleton = 0;
102 my $genloc = 0;
a23b3715 103 my $genvariant = 0;
3ec8b047 104 my $conflictloc = 0;
6a5f4aa7 105 my $conflictvariant = 0;
3ec8b047 106 my $revertloc = 0;
6a5f4aa7 107 my $revertvariant = 0;
108 my $msgd; # for the HACK
a23b3715 109 my @unknown;
110 foreach my $loc ( @{$result->{variants}} ) {
111 # A transition is the relationship type between parent and child.
112 # Find each genealogical transition and get the relationship type (if any)
113 # Find each non-genealogical transition and get the relationship type (if any)
3ec8b047 114 my( $loc_conflict, $loc_reversion );
ee26c4d9 115 if( exists $loc->{unsolved} ) {
6a5f4aa7 116 # Not solved; remove it from the total.
117 say "Skipping unsolved location at " . $loc->{id};
118 $datahash{loc_total}--;
ee26c4d9 119 next;
5c2fdc10 120 } elsif( $MAXRANK && $loc->{id} > $MAXRANK ) {
6a5f4aa7 121 # HACK until Chronicle tagging is done
122 say "Skipping ranks above $MAXRANK"
5c2fdc10 123 unless $msgd;
124 $msgd = 1;
125 next;
ee26c4d9 126 }
a23b3715 127 foreach my $rdghash( @{$loc->{readings}} ) {
128 # Weed out singletons
6a5f4aa7 129 $totalvariant++;
a23b3715 130 my @roots = @{$rdghash->{independent_occurrence}};
6a5f4aa7 131 if( @roots == 1 && !$rdghash->{'followed'} && !$rdghash->{'not_followed'}
132 && !$rdghash->{'follow_unknown'} ) {
133 $singleton++;
134 next;
135 }
a23b3715 136 my $rdg = $tradition->collation->reading( $rdghash->{readingid} );
3ec8b047 137 my $type;
138 if( $rdghash->{'is_conflict'} ) {
139 $type = 'conflict';
140 $loc_conflict = 1;
6a5f4aa7 141 $conflictvariant++;
3ec8b047 142 } elsif( $rdghash->{'is_reverted'} ) {
143 $type = 'reverted';
144 $loc_reversion = 1;
6a5f4aa7 145 $revertvariant++;
3ec8b047 146 } elsif( @roots == 1 ) {
147 $type = 'genealogical';
148 $genvariant++;
149 } else {
150 warn "Reading $rdg neither conflict, genealogical, nor reverted. What?";
151 $type = 'ERROR';
152 }
153 my $typekey = substr( $type, 0, 3 ) . '_';
154
155 # Add relation stats for reading parents. If the reading is reverted,
861e4c6d 156 # the 'parent' is the reversion parent rather than the parents of the
157 # reading source.
3bce8f64 158 my $phash = $type eq 'reverted'
159 ? $rdghash->{'reversion_parents'} : $rdghash->{'source_parents'};
3ec8b047 160 _add_reading_relations( $rdghash->{'readingid'}, $loc->{'id'}, $rdg,
3bce8f64 161 $type, $phash, \%datahash, \@unknown );
3ec8b047 162 }
163 if( $loc_conflict ) {
164 $conflictloc++;
165 } elsif( $loc_reversion ) {
166 $revertloc++;
6a5f4aa7 167 } else {
168 $genloc++;
a23b3715 169 }
170 }
171
172 # Add in the sums for the whole location
6a5f4aa7 173 $datahash{loc_totalvariant} = $totalvariant;
174 $datahash{loc_genealogical} = $genloc;
175 $datahash{loc_genvariant} = $genvariant;
176 $datahash{loc_conflict} = $conflictloc;
177 $datahash{loc_conflictvariant} = $conflictvariant;
178 $datahash{loc_reverted} = $revertloc;
179 $datahash{loc_revertvariant} = $revertvariant;
180 $datahash{loc_singleton} = $singleton;
181 $datahash{percent_genealogical} = $datahash{loc_genealogical} / $datahash{loc_total};
182 $datahash{percent_genvariant} = $genvariant / $totalvariant;
183 $datahash{percent_genorrevert} = ( $genvariant + $datahash{loc_revertvariant} ) / $totalvariant;
a23b3715 184
185 # Write them out to CSV.
186 my @csvalues = map { $datahash{$_} } @resultfields;
187 if( $csv->combine( @csvalues ) ) {
188 say $fh $csv->string;
189 } else {
190 say "combine() failed on argument: " . $csv->error_input;
191 }
5c2fdc10 192 map { say STDERR sprintf( "Unknown transition %s -> %s (%s) :%s", @$_ ) } @unknown;
a23b3715 193}
194
195close $fh;
3ec8b047 196
197sub _add_reading_relations {
198 my( $rid, $rank, $robj, $type, $parenthash, $datahash, $unknown ) = @_;
3bce8f64 199 my @kp = keys ( %$parenthash );
200 unless( @kp ) {
201 _increment_typekey( $datahash, $type, 'source' );
202 return;
203 }
204 if( @kp > 1 ) {
205 $datahash->{multiparent} = @kp - 1;
206 }
207 foreach my $p ( @kp ) {
3ec8b047 208 my $pdata = $parenthash->{$p};
209 my $relation;
210 if( $pdata->{relation} ) {
3bce8f64 211 $relation = $pdata->{relation}->{transposed}
212 ? 'transposition' : $pdata->{relation}->{type};
3ec8b047 213 } else {
214 $relation = 'unknown';
215 if( !$robj ) {
216 say "Unknown relation on missing reading object $rid at rank $rank";
217 } elsif( !$pdata ) {
218 say "Unknown relation on missing parent object for $rid at rank $rank";
219 } else {
220 push( @$unknown, [ $pdata->{label}, $robj->id, $robj->text, $type ] );
221 }
222 }
3bce8f64 223 _increment_typekey( $datahash, $type, $relation );
224 }
225}
226
227sub _increment_typekey {
228 my( $datahash, $type, $relation ) = @_;
229 my $typekey = substr( $type, 0, 3 ) . "_$relation";
230 unless( exists $datahash->{$typekey} ) {
231 $DB::single = 1;
232 warn "No field for $typekey";
3ec8b047 233 }
3bce8f64 234 $datahash->{$typekey}++;
235# # TODO distinguish parent-bad vs. rdg-bad
236# if( $robj && $robj->grammar_invalid ) {
237# $datahash->{$typekey.'_ungramm'} = 1;
238# } elsif( $robj && $robj->is_nonsense ) {
239# $datahash->{$typekey.'_nonsense'} = 1;
240# }
5c2fdc10 241}