Commit | Line | Data |
a23b3715 |
1 | #!/usr/bin/env perl |
2 | |
3 | use feature 'say'; |
4 | use lib 'lib'; |
5 | use strict; |
6 | use warnings; |
7 | use Text::CSV_XS; |
8 | use Text::Tradition::Analysis qw/ run_analysis /; |
9 | use Text::Tradition::Directory; |
10 | |
11 | binmode STDOUT, ':utf8'; |
12 | binmode STDERR, ':utf8'; |
13 | |
14 | my $dir = Text::Tradition::Directory->new( |
15 | 'dsn' => 'dbi:SQLite:dbname=stemmaweb/db/traditions.db', |
16 | ); |
17 | |
18 | my $scope = $dir->new_scope(); |
19 | my $lookfor = shift @ARGV || ''; |
20 | my $collapse = [ @ARGV ]; |
21 | |
22 | my @relation_types = qw/ orthographic spelling grammatical lexical |
23 | transposition addition deletion wordsimilar unknown /; |
24 | |
25 | my @resultfields = qw/ |
26 | text_name loc_total loc_totalvariant loc_genealogical loc_genvariant |
27 | loc_conflict loc_conflictvariant percent_genealogical percent_genvariant /; |
28 | map { push( @resultfields, "gen_$_", "gen_${_}_nonsense", "gen_${_}_ungramm" ) } |
29 | @relation_types; |
30 | map { push( @resultfields, "con_$_", "con_${_}_nonsense", "con_${_}_ungramm" ) } |
31 | @relation_types; |
32 | map { push( @resultfields, "percent_gen_$_", "percent_${_}_gen" ) } @relation_types; |
33 | map { push( @resultfields, "percent_con_$_" ) } @relation_types; |
34 | |
35 | my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } ); |
36 | open my $fh, ">:encoding(UTF-8)", "analysis.csv" or die "analysis.csv: $!"; |
37 | if( $csv->combine( @resultfields ) ) { |
38 | say $fh $csv->string; |
39 | } else { |
40 | say "combine() failed on argument: " . $csv->error_input; |
41 | } |
42 | |
43 | foreach my $tinfo( $dir->traditionlist ) { |
44 | next if $tinfo->{'name'} eq 'xxxxx'; |
45 | next unless $tinfo->{'id'} eq $lookfor |
46 | || $tinfo->{'name'} =~ /$lookfor/; |
47 | my $tradition = $dir->lookup( $tinfo->{'id'} ); |
48 | say "Analyzing tradition " . $tradition->name; |
49 | my %datahash; |
50 | # Initialize everything with zeroes |
51 | map { $datahash{$_} = 0 } @resultfields; |
52 | # Put in the real text ID and name |
53 | $datahash{text_id} = $tinfo->{'id'}; |
54 | $datahash{text_name} = $tradition->name; |
55 | |
56 | # Run the analysis for each row in @rows |
57 | my %opts = ( exclude_type1 => 1 ); |
58 | if( @$collapse ) { |
59 | $opts{merge_types} = $collapse; |
60 | } |
61 | |
62 | my $result = run_analysis( $tradition, %opts ); |
63 | $datahash{loc_total} = $result->{variant_count}; |
64 | $datahash{loc_genealogical} = $result->{genealogical_count}; |
65 | $datahash{loc_conflict} = $result->{variant_count} - $result->{genealogical_count}; |
66 | $datahash{loc_conflictvariant} = $result->{conflict_count}; |
67 | # Get the number of total and genealogical variants as we go below. |
68 | my $totalvariant = 0; |
69 | my $genvariant = 0; |
70 | my @unknown; |
71 | foreach my $loc ( @{$result->{variants}} ) { |
72 | # A transition is the relationship type between parent and child. |
73 | # Find each genealogical transition and get the relationship type (if any) |
74 | # Find each non-genealogical transition and get the relationship type (if any) |
75 | # Someday, look for reversions |
76 | foreach my $rdghash( @{$loc->{readings}} ) { |
77 | # Weed out singletons |
78 | my @roots = @{$rdghash->{independent_occurrence}}; |
79 | next if @roots == 1 && !$rdghash->{'followed'} && !$rdghash->{'not_followed'} |
80 | && !$rdghash->{'follow_unknown'}; |
81 | # TODO Weed out punctuation |
82 | my $rdg = $tradition->collation->reading( $rdghash->{readingid} ); |
83 | my $typekey = @roots == 1 ? 'gen_' : 'con_'; |
84 | foreach my $p ( keys %{$rdghash->{reading_parents}} ) { |
85 | my $pdata = $rdghash->{reading_parents}->{$p}; |
86 | my $relation; |
87 | if( $pdata->{relation} ) { |
88 | $relation = $pdata->{relation}->{type}; |
89 | } else { |
90 | $relation = 'unknown'; |
91 | if( !$rdg ) { |
92 | say "Unknown relation on missing reading object " |
93 | . $rdghash->{readingid} . " at rank " . $loc->{id}; |
94 | } elsif( !$pdata ) { |
95 | say "Unknown relation on missing parent object for " |
96 | . $rdghash->{readingid} . " at rank " . $loc->{id}; |
97 | |
98 | } else { |
99 | push( @unknown, [ $pdata->{label}, $rdg->id, $rdg->text, |
100 | ( @roots == 1 ? 'genealogical' : 'conflicting' ) ] ); |
101 | } |
102 | } |
103 | $typekey .= $relation; |
104 | $datahash{$typekey}++; |
105 | ## TODO distinguish parent-bad vs. rdg-bad |
106 | if( $rdg && $rdg->grammar_invalid ) { |
107 | $datahash{$typekey.'_ungramm'} = 1; |
108 | } elsif( $rdg && $rdg->is_nonsense ) { |
109 | $datahash{$typekey.'_nonsense'} = 1; |
110 | } |
111 | } |
112 | $totalvariant++; |
113 | $genvariant++ if @roots == 1; |
114 | } |
115 | } |
116 | |
117 | # Add in the sums for the whole location |
118 | $datahash{'loc_genvariant'} = $genvariant; |
119 | $datahash{'loc_totalvariant'} = $totalvariant; |
120 | $datahash{'percent_genealogical'} = $datahash{loc_genealogical} / $datahash{loc_total}; |
121 | $datahash{'percent_genvariant'} = $genvariant / $totalvariant; |
122 | foreach my $type ( @relation_types ) { |
123 | $datahash{"percent_gen_$type"} = $datahash{"gen_$type"} / $totalvariant; |
124 | $datahash{"percent_con_$type"} = $datahash{"con_$type"} / $totalvariant; |
125 | $datahash{"percent_${type}_gen"} = |
126 | $datahash{"gen_$type"} + $datahash{"con_$type"} == 0 ? 0 : |
127 | $datahash{"gen_$type"} / ( $datahash{"gen_$type"} + $datahash{"con_$type"} ); |
128 | } |
129 | |
130 | # Write them out to CSV. |
131 | my @csvalues = map { $datahash{$_} } @resultfields; |
132 | if( $csv->combine( @csvalues ) ) { |
133 | say $fh $csv->string; |
134 | } else { |
135 | say "combine() failed on argument: " . $csv->error_input; |
136 | } |
137 | map { say sprintf( "Unknown transition %s -> %s (%s) :%s", @$_ ) } @unknown; |
138 | } |
139 | |
140 | close $fh; |