make French morphology use Lingua objects; add tests
[scpubgit/stemmatology.git] / script / statistics.pl
1 #!/usr/bin/env perl
2
3 use strict;
4 use warnings;
5 use lib 'lib';
6 use feature 'say';
7 use Text::Tradition::Directory;
8 use Text::Tradition::Analysis qw/ run_analysis /;
9
10 binmode STDOUT, ':utf8';
11 binmode STDERR, ':utf8';
12 eval { no warnings; binmode $DB::OUT, ':utf8'; $DB::deep = 1000 };
13
14 my $args;
15 my $db = 'SQLite';
16 if( $ARGV[0] && $ARGV[0] eq 'mysql' ) {
17         $db = shift @ARGV;
18 }
19 if( $db eq 'mysql' ) {
20         $args = { 'dsn' => 'dbi:mysql:dbname=stemmaweb',
21                           'extra_args' => { 'user' => 'stemmaweb', 'password' => 'l@chmann' } };
22 } else {
23         $args = { 'dsn' => 'dbi:SQLite:dbname=stemmaweb/db/traditions.db' };
24 }
25 # the rest of @ARGV is tradition names
26
27 my $dir = Text::Tradition::Directory->new( $args );
28 my @traditions;
29 my @tlist = $dir->traditionlist;
30 if( @ARGV ) {
31         # Get only the traditions named.
32         foreach my $tid ( @tlist ) {
33                 push( @traditions, $tid->{'id'} )
34                         if grep { $tid->{'name'} =~ /\Q$_\E/ } @ARGV;
35         }
36 } else {
37         @traditions = map { $_->{'id'} } @tlist;
38 }
39
40 # Initialize our stats collection
41 my $omit = '(omitted)';
42
43 # Run the analysis of each tradition
44 # Look through the results
45 foreach my $tid ( @traditions ) {
46         my $scope = $dir->new_scope();
47         my $tradition = $dir->lookup( $tid );
48         printf( "\n**** TRADITION %s ****\n", $tradition->name );
49         my $c = $tradition->collation;
50         my $results = run_analysis( $tradition );
51         my %stats;
52         my %rels_found;
53         foreach my $row ( @{$results->{'variants'}} ) {
54                 # say sprintf( "=== Looking at rank %d (%s) ===", $row->{'id'},
55                 #       $row->{'genealogical'} ? 'genealogical' : 'not genealogical' );
56                 my $rdgdir = {};
57                 map { $rdgdir->{$_->{'readingid'}} = $_ } @{$row->{'readings'}};
58                 # Look for reading parents and the relationships between them.
59                 my %seen_rel;
60                 foreach my $rdg ( keys %$rdgdir ) {
61                         my $rhash = $rdgdir->{$rdg};
62                         my $parents = $rhash->{'reading_parents'};
63                         if( $parents && @$parents ) {
64                                 say sprintf( " - reading %s ( %s ) has %d possible origins",
65                                         $rdg, $rhash->{'text'}, scalar @$parents )
66                                         unless @$parents == 1;
67                                 foreach my $p ( @$parents ) {
68                                         # Is there a relationship here?
69                                         my $rel = $c->get_relationship( $rdg, $p );
70                                         my $type;
71                                         if( $rel ) {
72                                                 # Yes there is - get its type and figure stuff out.
73                                                 $type = $rel->type;
74                                         } elsif( $rdg eq $omit ) {
75                                                 $type = 'deletion';
76                                         } elsif( $p eq $omit ) {
77                                                 $type = 'addition';
78                                         } # TODO need to manage transposition
79                                         if( $type ) {
80                                                 # Note that there was an instability of this type
81                                                 $seen_rel{$type} = 1;
82                                                 $stats{$type} = {} unless exists $stats{$type};
83                                                 # Calculate, in this spot, how often the form shifted
84                                                 # vs. how often it stayed the same.
85                                                 # Add the number of times this form appeared
86                                                 add_to_hash( $stats{$type}, 'shifts', 
87                                                                          $rhash->{'independent_occurrence'} );
88                                                 # Add the number of times this form was followed
89                                                 add_to_hash( $stats{$type}, 'follows', $rhash->{'followed'} );
90                                                 # TODO work out whether not_followed gets included after iteration
91                                         }
92                                 } # foreach parent
93                                 
94                         } # if parents
95                         foreach my $k ( keys %seen_rel ) {
96                                 add_to_hash( \%rels_found, $k, 1 );
97                         }
98                 } # foreach rdg
99         }
100         # Print out the stats
101         # First see how stable the text was
102         my $total = $c->end->rank - 1;
103         say sprintf( "Total locations %d, total variant locations %d", 
104                                  $total, $results->{'variant_count'} );
105         say $results->{'genealogical_count'} 
106                 . " variant locations entirely followed the stemma";
107         say $results->{'conflict_count'}
108                 . " variant readings conflicted with the stemma";
109         foreach my $k ( keys %rels_found ) {
110                 my $shifts = $rels_found{$k};
111                 say "Had $shifts total $k-type shifts in $total locations";
112         }
113         foreach my $k ( keys %stats ) {
114                 say "\tType $k:";
115                 say sprintf( "\tUnstable readings shifted %d times, and were followed %d times",
116                                          $stats{$k}->{'shifts'}, $stats{$k}->{'follows'} );
117         }
118 }
119
120 sub add_to_hash {
121         my( $hash, $key, $num ) = @_;
122         unless( exists $hash->{$key} ) {
123                 $hash->{$key} = 0;
124         }
125         $hash->{$key} += $num;
126 }