9 use Text::Tradition::Analysis qw/ run_analysis /;
10 use Text::Tradition::Directory;
13 binmode STDOUT, ':utf8';
14 binmode STDERR, ':utf8';
16 my( $dsn, $dbuser, $dbpass );
17 my $filename = 'analysis.csv';
20 'u|user=s' => \$dbuser,
21 'p|pass=s' => \$dbpass,
22 'f|file=s' => \$filename
25 my %dbopts = ( dsn => $dsn );
26 if( $dbuser || $dbpass ) {
27 $dbopts{extra_args} = { user => $dbuser, password => $dbpass }
30 my $dir = Text::Tradition::Directory->new( %dbopts );
31 my $scope = $dir->new_scope();
32 my $lookfor = shift @ARGV || '';
35 say "Merging relationship types @ARGV";
36 map { $collapse{$_} = 1 } @ARGV;
39 my @relation_types = grep { !$collapse{$_} }
40 qw/ orthographic spelling grammatical lexical transposition repetition
41 uncertain other addition deletion wordsimilar unknown source /;
43 my @resultfields = qw/
44 text_name loc_total loc_singleton multiparent loc_totalvariant loc_genealogical loc_genvariant
45 loc_conflict loc_conflictvariant loc_reverted loc_revertvariant /;
46 map { push( @resultfields, "gen_$_", "con_$_", "rev_$_" ) } @relation_types;
48 my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } );
49 open my $fh, ">:encoding(UTF-8)", $filename or die "$filename: $!";
50 if( $csv->combine( @resultfields ) ) {
53 say "combine() failed on argument: " . $csv->error_input;
56 foreach my $tinfo( $dir->traditionlist ) {
57 next if $tinfo->{'name'} eq 'xxxxx';
58 next if $tinfo->{'name'} =~ /158/;
59 next if $tinfo->{'name'} =~ /Heinrichi part/;
61 next unless $tinfo->{'id'} eq $lookfor
62 || $tinfo->{'name'} =~ /$lookfor/;
64 my $tradition = $dir->lookup( $tinfo->{'id'} );
65 next unless $tradition->stemma_count;
66 say "Analyzing tradition " . $tradition->name;
69 if( $tradition->name =~ /Chronicle/ ) {
70 $MAXRANK = $tradition->collation->reading('L1545')->rank;
73 # Initialize everything with zeroes
74 map { $datahash{$_} = 0 } @resultfields;
75 # Put in the real text ID and name
76 $datahash{text_id} = $tinfo->{'id'};
77 $datahash{text_name} = $tradition->name;
79 # Run the analysis for each row in @rows
82 merge_types => [ 'punctuation' ] );
83 if( keys %collapse ) {
84 push( @{$opts{merge_types}}, keys %collapse );
89 $result = run_analysis( $tradition, %opts );
91 say "Analysis run failed on tradition " . $tradition->name . ": @_";
94 $datahash{loc_total} = $result->{variant_count};
95 $datahash{multiparent} = 0;
96 #$datahash{loc_genealogical} = $result->{genealogical_count};
97 #$datahash{loc_conflictvariant} = $result->{conflict_count};
98 #$datahash{loc_revertvariant} = $result->{reversion_count};
99 # Get the totals by location and by variant as we go.
100 my $totalvariant = 0;
105 my $conflictvariant = 0;
107 my $revertvariant = 0;
108 my $msgd; # for the HACK
110 foreach my $loc ( @{$result->{variants}} ) {
111 # A transition is the relationship type between parent and child.
112 # Find each genealogical transition and get the relationship type (if any)
113 # Find each non-genealogical transition and get the relationship type (if any)
114 my( $loc_conflict, $loc_reversion );
115 if( exists $loc->{unsolved} ) {
116 # Not solved; remove it from the total.
117 say "Skipping unsolved location at " . $loc->{id};
118 $datahash{loc_total}--;
120 } elsif( $MAXRANK && $loc->{id} > $MAXRANK ) {
121 # HACK until Chronicle tagging is done
122 say "Skipping ranks above $MAXRANK"
127 foreach my $rdghash( @{$loc->{readings}} ) {
128 # Weed out singletons
130 my @roots = @{$rdghash->{independent_occurrence}};
131 if( @roots == 1 && !$rdghash->{'followed'} && !$rdghash->{'not_followed'}
132 && !$rdghash->{'follow_unknown'} ) {
136 my $rdg = $tradition->collation->reading( $rdghash->{readingid} );
138 if( $rdghash->{'is_conflict'} ) {
142 } elsif( $rdghash->{'is_reverted'} ) {
146 } elsif( @roots == 1 ) {
147 $type = 'genealogical';
150 warn "Reading $rdg neither conflict, genealogical, nor reverted. What?";
153 my $typekey = substr( $type, 0, 3 ) . '_';
155 # Add relation stats for reading parents. If the reading is reverted,
156 # treat it as genealogical for the parent.
157 my $phash = $type eq 'reverted'
158 ? $rdghash->{'reversion_parents'} : $rdghash->{'source_parents'};
159 _add_reading_relations( $rdghash->{'readingid'}, $loc->{'id'}, $rdg,
160 $type, $phash, \%datahash, \@unknown );
162 if( $loc_conflict ) {
164 } elsif( $loc_reversion ) {
171 # Add in the sums for the whole location
172 $datahash{loc_totalvariant} = $totalvariant;
173 $datahash{loc_genealogical} = $genloc;
174 $datahash{loc_genvariant} = $genvariant;
175 $datahash{loc_conflict} = $conflictloc;
176 $datahash{loc_conflictvariant} = $conflictvariant;
177 $datahash{loc_reverted} = $revertloc;
178 $datahash{loc_revertvariant} = $revertvariant;
179 $datahash{loc_singleton} = $singleton;
180 $datahash{percent_genealogical} = $datahash{loc_genealogical} / $datahash{loc_total};
181 $datahash{percent_genvariant} = $genvariant / $totalvariant;
182 $datahash{percent_genorrevert} = ( $genvariant + $datahash{loc_revertvariant} ) / $totalvariant;
184 # Write them out to CSV.
185 my @csvalues = map { $datahash{$_} } @resultfields;
186 if( $csv->combine( @csvalues ) ) {
187 say $fh $csv->string;
189 say "combine() failed on argument: " . $csv->error_input;
191 map { say STDERR sprintf( "Unknown transition %s -> %s (%s) :%s", @$_ ) } @unknown;
196 sub _add_reading_relations {
197 my( $rid, $rank, $robj, $type, $parenthash, $datahash, $unknown ) = @_;
198 my @kp = keys ( %$parenthash );
200 _increment_typekey( $datahash, $type, 'source' );
204 $datahash->{multiparent} = @kp - 1;
206 foreach my $p ( @kp ) {
207 my $pdata = $parenthash->{$p};
209 if( $pdata->{relation} ) {
210 $relation = $pdata->{relation}->{transposed}
211 ? 'transposition' : $pdata->{relation}->{type};
213 $relation = 'unknown';
215 say "Unknown relation on missing reading object $rid at rank $rank";
217 say "Unknown relation on missing parent object for $rid at rank $rank";
219 push( @$unknown, [ $pdata->{label}, $robj->id, $robj->text, $type ] );
222 _increment_typekey( $datahash, $type, $relation );
226 sub _increment_typekey {
227 my( $datahash, $type, $relation ) = @_;
228 my $typekey = substr( $type, 0, 3 ) . "_$relation";
229 unless( exists $datahash->{$typekey} ) {
231 warn "No field for $typekey";
233 $datahash->{$typekey}++;
234 # # TODO distinguish parent-bad vs. rdg-bad
235 # if( $robj && $robj->grammar_invalid ) {
236 # $datahash->{$typekey.'_ungramm'} = 1;
237 # } elsif( $robj && $robj->is_nonsense ) {
238 # $datahash->{$typekey.'_nonsense'} = 1;