9 use Text::Tradition::Analysis qw/ run_analysis /;
10 use Text::Tradition::Directory;
13 binmode STDOUT, ':utf8';
14 binmode STDERR, ':utf8';
16 my( $dsn, $dbuser, $dbpass );
17 my $filename = 'exclusions.csv';
20 'u|user=s' => \$dbuser,
21 'p|pass=s' => \$dbpass,
22 'f|file=s' => \$filename
25 my %dbopts = ( dsn => $dsn );
26 if( $dbuser || $dbpass ) {
27 $dbopts{extra_args} = { user => $dbuser, password => $dbpass }
30 my $dir = Text::Tradition::Directory->new( %dbopts );
31 my $scope = $dir->new_scope();
32 my $lookfor = shift @ARGV || '';
34 ## Set up the relationship types we will exclude in turn. False means "run
35 ## analysis with basic set of exclusions", i.e. orth/spelling/punct, and exclude
36 ## the variants in question later. True means "explicitly exclude this type too
38 my @relation_types = qw/ none sameword grammatical lexical uncertain other
39 addition deletion transposition /;
41 # Set up the things we want to calculate for each text
42 my @calcs = qw/ total genealogical excoincidental reverted exgenealogical coincidental /;
43 my @resultfields = ( 'text_name' );
44 foreach my $rt ( @relation_types ) {
45 foreach my $cc ( @calcs ) {
46 push( @resultfields, sprintf( "%s_ex_%s", $cc, $rt ) );
50 my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } );
51 open my $fh, ">:encoding(UTF-8)", $filename or die "$filename: $!";
52 if( $csv->combine( @resultfields ) ) {
55 say "combine() failed on argument: " . $csv->error_input;
58 foreach my $tinfo( $dir->traditionlist ) {
59 next if $tinfo->{'name'} eq 'xxxxx';
60 next if $tinfo->{'name'} =~ /158/;
61 next if $tinfo->{'name'} =~ /Heinrichi part/;
63 next unless $tinfo->{'id'} eq $lookfor
64 || $tinfo->{'name'} =~ /$lookfor/;
66 my $tradition = $dir->lookup( $tinfo->{'id'} );
67 next unless $tradition->stemma_count;
68 say "Analyzing tradition " . $tradition->name;
71 if( $tradition->name =~ /Chronicle/ ) {
72 $MAXRANK = $tradition->collation->reading('L1545')->rank;
75 # Initialize everything with zeroes
76 map { $datahash{$_} = 0 } @resultfields;
77 # Put in the real text ID and name
78 $datahash{text_id} = $tinfo->{'id'};
79 $datahash{text_name} = $tradition->name;
84 $fullresult = run_analysis( $tradition, exclude_type1 => 1 );
85 $noorthresult = run_analysis( $tradition, exclude_type1 => 1,
86 merge_types => [ qw/ orthographic spelling punctuation / ] );
88 say "Analysis run failed on tradition " . $tradition->name . ": @_";
91 foreach my $rtype ( @relation_types ) {
92 say "...calculating on exclusion of $rtype";
93 my $result = $rtype eq 'none' ? $fullresult : $noorthresult;
95 # Get the totals by location and by variant as we go.
99 my $conflictvariant = 0;
100 my $revertvariant = 0;
101 my $msgd; # for the HACK
103 foreach my $loc ( @{$result->{variants}} ) {
104 # A transition is the relationship type between parent and child.
105 # Find each genealogical transition
106 # Find each non-genealogical transition
107 if( exists $loc->{unsolved} ) {
108 # Not solved; remove it from the total.
109 say "Skipping unsolved location at " . $loc->{id};
111 } elsif( $MAXRANK && $loc->{id} > $MAXRANK ) {
112 # HACK until Chronicle tagging is done
113 say "Skipping ranks above $MAXRANK"
118 foreach my $rdghash( @{$loc->{readings}} ) {
119 # Weed out singletons
121 my @roots = @{$rdghash->{independent_occurrence}};
122 if( @roots == 1 && !$rdghash->{'followed'} && !$rdghash->{'not_followed'}
123 && !$rdghash->{'follow_unknown'} ) {
128 if( $rdghash->{'is_conflict'} ) {
130 } elsif( $rdghash->{'is_reverted'} ) {
132 } elsif( @roots == 1 ) {
133 $type = 'genealogical';
135 warn 'Reading ' . $rdghash->{readingid} . ' neither conflict, genealogical, nor reverted. What?';
138 # Get the relationship type stats for reading parents.
139 my $rdg = $tradition->collation->reading( $rdghash->{readingid} );
141 my $phash = $type eq 'reverted'
142 ? $rdghash->{'reversion_parents'} : $rdghash->{'source_parents'};
143 my @rels = _get_reading_relations( $rdghash->{'readingid'}, $loc->{'id'},
144 $rdg, $type, $phash, \%datahash, \@unknown );
145 # If this is one of our exclusions, take it out of the total.
146 foreach my $rel ( @rels ) {
147 if( $rel eq $rtype ) {
151 # Otherwise add the variant type to our count.
152 $conflictvariant++ if $type eq 'conflict';
153 $revertvariant++ if $type eq 'reverted';
154 $genvariant++ if $type eq 'genealogical';
160 # Add in the sums for the whole location
161 $datahash{"total_ex_$rtype"} = $totalvariant - $singleton;
162 $datahash{"genealogical_ex_$rtype"} = $genvariant;
163 $datahash{"reverted_ex_$rtype"} = $revertvariant;
164 $datahash{"coincidental_ex_$rtype"} = $conflictvariant;
165 $datahash{"excoincidental_ex_$rtype"} = $genvariant + $revertvariant;
166 $datahash{"exgenealogical_ex_$rtype"} = $conflictvariant + $revertvariant;
169 # Write them out to CSV.
170 my @csvalues = map { $datahash{$_} } @resultfields;
171 if( $csv->combine( @csvalues ) ) {
172 say $fh $csv->string;
174 say "combine() failed on argument: " . $csv->error_input;
180 sub _get_reading_relations {
181 my( $rid, $rank, $robj, $type, $parenthash, $datahash, $unknown ) = @_;
182 my @kp = keys ( %$parenthash );
183 return ( 'source' ) unless @kp; # In case there is no parent reading to relate.
186 foreach my $p ( @kp ) {
187 my $pdata = $parenthash->{$p};
189 if( $pdata->{relation} ) {
190 $relation = $pdata->{relation}->{transposed}
191 ? 'transposition' : $pdata->{relation}->{type};
193 $relation = 'unknown';
195 say "Unknown relation on missing reading object $rid at rank $rank";
197 say "Unknown relation on missing parent object for $rid at rank $rank";
199 push( @$unknown, [ $pdata->{label}, $robj->id, $robj->text, $type ] );
202 push( @rels, $relation );