8 use Text::Tradition::Analysis qw/ run_analysis /;
9 use Text::Tradition::Directory;
11 binmode STDOUT, ':utf8';
12 binmode STDERR, ':utf8';
14 my $dir = Text::Tradition::Directory->new(
15 'dsn' => 'dbi:SQLite:dbname=stemmaweb/db/traditions.db',
17 my $calcdsn = 'dbi:SQLite:dbname=db/graphs.db';
19 my $scope = $dir->new_scope();
20 my $lookfor = shift @ARGV || '';
22 map { $collapse{$_} = 1 } @ARGV;
24 my @relation_types = grep { !$collapse{$_} }
25 qw/ orthographic spelling grammatical lexical transposition addition deletion
26 wordsimilar unknown /;
28 my @resultfields = qw/
29 text_name loc_total loc_totalvariant loc_genealogical loc_genvariant
30 loc_conflict loc_conflictvariant loc_reverted loc_revertvariant
31 percent_genealogical percent_genvariant percent_genorrevert /;
32 map { push( @resultfields, "gen_$_", "gen_${_}_nonsense", "gen_${_}_ungramm" ) }
34 map { push( @resultfields, "con_$_", "con_${_}_nonsense", "con_${_}_ungramm" ) }
36 map { push( @resultfields, "rev_$_", "rev_${_}_nonsense", "rev_${_}_ungramm" ) }
38 map { push( @resultfields, "percent_gen_$_", "percent_${_}_gen" ) } @relation_types;
39 map { push( @resultfields, "percent_con_$_", "percent_rev_$_" ) } @relation_types;
41 my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } );
42 open my $fh, ">:encoding(UTF-8)", "analysis.csv" or die "analysis.csv: $!";
43 if( $csv->combine( @resultfields ) ) {
46 say "combine() failed on argument: " . $csv->error_input;
49 foreach my $tinfo( $dir->traditionlist ) {
50 next if $tinfo->{'name'} eq 'xxxxx';
51 next unless $tinfo->{'id'} eq $lookfor
52 || $tinfo->{'name'} =~ /$lookfor/;
53 my $tradition = $dir->lookup( $tinfo->{'id'} );
54 say "Analyzing tradition " . $tradition->name;
56 # Initialize everything with zeroes
57 map { $datahash{$_} = 0 } @resultfields;
58 # Put in the real text ID and name
59 $datahash{text_id} = $tinfo->{'id'};
60 $datahash{text_name} = $tradition->name;
62 # Run the analysis for each row in @rows
63 my %opts = ( exclude_type1 => 1, calcdsn => $calcdsn );
64 if( keys %collapse ) {
65 $opts{merge_types} = [ keys %collapse ];
68 my $result = run_analysis( $tradition, %opts );
69 $datahash{loc_total} = $result->{variant_count};
70 $datahash{loc_genealogical} = $result->{genealogical_count};
71 $datahash{loc_conflictvariant} = $result->{conflict_count};
72 $datahash{loc_revertvariant} = $result->{reversion_count};
73 # Get the number of total and genealogical variants, and number of
74 # conflicted/reverted locations, as we go below.
80 foreach my $loc ( @{$result->{variants}} ) {
81 # A transition is the relationship type between parent and child.
82 # Find each genealogical transition and get the relationship type (if any)
83 # Find each non-genealogical transition and get the relationship type (if any)
84 my( $loc_conflict, $loc_reversion );
85 foreach my $rdghash( @{$loc->{readings}} ) {
87 my @roots = @{$rdghash->{independent_occurrence}};
88 next if @roots == 1 && !$rdghash->{'followed'} && !$rdghash->{'not_followed'}
89 && !$rdghash->{'follow_unknown'};
90 # TODO Weed out punctuation
91 my $rdg = $tradition->collation->reading( $rdghash->{readingid} );
93 if( $rdghash->{'is_conflict'} ) {
96 } elsif( $rdghash->{'is_reverted'} ) {
99 } elsif( @roots == 1 ) {
100 $type = 'genealogical';
103 warn "Reading $rdg neither conflict, genealogical, nor reverted. What?";
106 my $typekey = substr( $type, 0, 3 ) . '_';
108 # Add relation stats for reading parents. If the reading is reverted,
109 # treat it as genealogical for the parent.
110 _add_reading_relations( $rdghash->{'readingid'}, $loc->{'id'}, $rdg,
111 ( $type eq 'reverted' ? 'genealogical' : $type ),
112 $rdghash->{'source_parents'}, \%datahash, \@unknown );
113 # Add relation stats for reading reversions if they exist.
114 if( $type eq 'reverted' ) {
115 # Get relationship between reverted readings and their non-matching
117 _add_reading_relations( $rdghash->{'readingid'}, $loc->{'id'}, $rdg,
118 $type, $rdghash->{'reversion_parents'}, \%datahash, \@unknown );
123 if( $loc_conflict ) {
125 } elsif( $loc_reversion ) {
130 # Add in the sums for the whole location
131 $datahash{'loc_genvariant'} = $genvariant;
132 $datahash{'loc_totalvariant'} = $totalvariant;
133 $datahash{'loc_conflict'} = $conflictloc;
134 $datahash{'loc_reverted'} = $revertloc;
135 $datahash{'percent_genealogical'} = $datahash{loc_genealogical} / $datahash{loc_total};
136 $datahash{'percent_genvariant'} = $genvariant / $totalvariant;
137 $datahash{'percent_genorrevert'} = ( $genvariant + $datahash{loc_revertvariant} ) / $totalvariant;
138 foreach my $type ( @relation_types ) {
139 $datahash{"percent_gen_$type"} = $datahash{"gen_$type"} / $totalvariant;
140 $datahash{"percent_con_$type"} = $datahash{"con_$type"} / $totalvariant;
141 $datahash{"percent_${type}_gen"} =
142 $datahash{"gen_$type"} + $datahash{"con_$type"} == 0 ? 0 :
143 $datahash{"gen_$type"} / ( $datahash{"gen_$type"} + $datahash{"con_$type"} );
146 # Write them out to CSV.
147 my @csvalues = map { $datahash{$_} } @resultfields;
148 if( $csv->combine( @csvalues ) ) {
149 say $fh $csv->string;
151 say "combine() failed on argument: " . $csv->error_input;
153 map { say sprintf( "Unknown transition %s -> %s (%s) :%s", @$_ ) } @unknown;
158 sub _add_reading_relations {
159 my( $rid, $rank, $robj, $type, $parenthash, $datahash, $unknown ) = @_;
160 foreach my $p ( keys %$parenthash ) {
161 my $pdata = $parenthash->{$p};
163 if( $pdata->{relation} ) {
164 $relation = $pdata->{relation}->{type};
166 $relation = 'unknown';
168 say "Unknown relation on missing reading object $rid at rank $rank";
170 say "Unknown relation on missing parent object for $rid at rank $rank";
172 push( @$unknown, [ $pdata->{label}, $robj->id, $robj->text, $type ] );
175 my $typekey = substr( $type, 0, 3 ) . "_$relation";
176 $datahash->{$typekey}++;
177 ## TODO distinguish parent-bad vs. rdg-bad
178 if( $robj && $robj->grammar_invalid ) {
179 $datahash->{$typekey.'_ungramm'} = 1;
180 } elsif( $robj && $robj->is_nonsense ) {
181 $datahash->{$typekey.'_nonsense'} = 1;