add a stats script, evade a perl bug, go back to hackier a.c. wit handling
Tara L Andrews [Fri, 2 Mar 2012 15:53:45 +0000 (16:53 +0100)]
lib/Text/Tradition/Analysis.pm
script/statistics.pl [new file with mode: 0755]

index 7dd592a..a6c3948 100644 (file)
@@ -121,8 +121,8 @@ sub run_analysis {
        my $c = $tradition->collation;
 
        my $stemma_id = $opts{'stemma_id'} || 0;
-       my @ranks = @{$opts{'ranks'}} if ref( $opts{'ranks'} ) eq 'ARRAY';
-       my @collapse = @{$opts{'merge_types'}} if ref( $opts{'merge_types'} ) eq 'ARRAY';
+       my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : ();
+       my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : ();
 
        # Get the stemma        
        my $stemma = $tradition->stemma( $stemma_id );
@@ -194,22 +194,20 @@ by the witnesses listed in $groups->[$n].
 sub group_variants {
        my( $tradition, $rank, $lacunose, $collapse ) = @_;
        my $c = $tradition->collation;
-       # All the regexps here are to get rid of space characters in witness names.
        my $aclabel = $c->ac_label;
-       $aclabel =~ s/\s/_/g;
        # Get the alignment table readings
        my %readings_at_rank;
        my @gap_wits;
-       foreach my $tablewit ( @{$tradition->collation->alignment_table->{'alignment'}} ) {
+       foreach my $tablewit ( @{$c->alignment_table->{'alignment'}} ) {
                my $rdg = $tablewit->{'tokens'}->[$rank-1];
                my $wit = $tablewit->{'witness'};
-               $wit =~ s/\s/_/g;
+               $wit =~ s/^(.*)\Q$aclabel\E$/${1}_ac/;
                if( $rdg && $rdg->{'t'}->is_lacuna ) {
-                       _add_to_witlist( $wit, $lacunose, $aclabel );
+                       _add_to_witlist( $wit, $lacunose, '_ac' );
                } elsif( $rdg ) {
                        $readings_at_rank{$rdg->{'t'}->text} = $rdg->{'t'};
                } else {
-                       _add_to_witlist( $wit, \@gap_wits, $aclabel );
+                       _add_to_witlist( $wit, \@gap_wits, '_ac' );
                }
        }
        
@@ -219,12 +217,12 @@ sub group_variants {
                # Skip readings that have been collapsed into others.
                next if exists $grouped_readings{$rdg->id} && !$grouped_readings{$rdg->id};
                my @wits = $rdg->witnesses;
-               map { s/\s/_/g } @wits;
+               map { s/\Q$aclabel\E$/_ac/ } @wits;
                if( $collapse ) {
                        my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; };
                        foreach my $other ( $rdg->related_readings( $filter ) ) {
                                my @otherwits = $other->witnesses;
-                               map { s/\s/_/g } @otherwits;
+                               map { s/\Q$aclabel\E$/_ac/ } @otherwits;
                                push( @wits, @otherwits );
                                $grouped_readings{$other->id} = 0;
                        }
diff --git a/script/statistics.pl b/script/statistics.pl
new file mode 100755 (executable)
index 0000000..1490f8f
--- /dev/null
@@ -0,0 +1,126 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use lib 'lib';
+use feature 'say';
+use Text::Tradition::Directory;
+use Text::Tradition::Analysis qw/ run_analysis /;
+
+binmode STDOUT, ':utf8';
+binmode STDERR, ':utf8';
+eval { no warnings; binmode $DB::OUT, ':utf8'; $DB::deep = 1000 };
+
+my $args;
+my $db = 'SQLite';
+if( $ARGV[0] && $ARGV[0] eq 'mysql' ) {
+       $db = shift @ARGV;
+}
+if( $db eq 'mysql' ) {
+       $args = { 'dsn' => 'dbi:mysql:dbname=stemmaweb',
+                         'extra_args' => { 'user' => 'stemmaweb', 'password' => 'l@chmann' } };
+} else {
+       $args = { 'dsn' => 'dbi:SQLite:dbname=stemmaweb/db/traditions.db' };
+}
+# the rest of @ARGV is tradition names
+
+my $dir = Text::Tradition::Directory->new( $args );
+my @traditions;
+my @tlist = $dir->traditionlist;
+if( @ARGV ) {
+       # Get only the traditions named.
+       foreach my $tid ( @tlist ) {
+               push( @traditions, $tid->{'id'} )
+                       if grep { $tid->{'name'} =~ /\Q$_\E/ } @ARGV;
+       }
+} else {
+       @traditions = map { $_->{'id'} } @tlist;
+}
+
+# Initialize our stats collection
+my $omit = '(omitted)';
+
+# Run the analysis of each tradition
+# Look through the results
+foreach my $tid ( @traditions ) {
+       my $scope = $dir->new_scope();
+       my $tradition = $dir->lookup( $tid );
+       printf( "\n**** TRADITION %s ****\n", $tradition->name );
+       my $c = $tradition->collation;
+       my $results = run_analysis( $tradition );
+       my %stats;
+       my %rels_found;
+       foreach my $row ( @{$results->{'variants'}} ) {
+               # say sprintf( "=== Looking at rank %d (%s) ===", $row->{'id'},
+               #       $row->{'genealogical'} ? 'genealogical' : 'not genealogical' );
+               my $rdgdir = {};
+               map { $rdgdir->{$_->{'readingid'}} = $_ } @{$row->{'readings'}};
+               # Look for reading parents and the relationships between them.
+               my %seen_rel;
+               foreach my $rdg ( keys %$rdgdir ) {
+                       my $rhash = $rdgdir->{$rdg};
+                       my $parents = $rhash->{'reading_parents'};
+                       if( $parents && @$parents ) {
+                               say sprintf( " - reading %s ( %s ) has %d possible origins",
+                                       $rdg, $rhash->{'text'}, scalar @$parents )
+                                       unless @$parents == 1;
+                               foreach my $p ( @$parents ) {
+                                       # Is there a relationship here?
+                                       my $rel = $c->get_relationship( $rdg, $p );
+                                       my $type;
+                                       if( $rel ) {
+                                               # Yes there is - get its type and figure stuff out.
+                                               $type = $rel->type;
+                                       } elsif( $rdg eq $omit ) {
+                                               $type = 'deletion';
+                                       } elsif( $p eq $omit ) {
+                                               $type = 'addition';
+                                       } # TODO need to manage transposition
+                                       if( $type ) {
+                                               # Note that there was an instability of this type
+                                               $seen_rel{$type} = 1;
+                                               $stats{$type} = {} unless exists $stats{$type};
+                                               # Calculate, in this spot, how often the form shifted
+                                               # vs. how often it stayed the same.
+                                               # Add the number of times this form appeared
+                                               add_to_hash( $stats{$type}, 'shifts', 
+                                                                        $rhash->{'independent_occurrence'} );
+                                               # Add the number of times this form was followed
+                                               add_to_hash( $stats{$type}, 'follows', $rhash->{'followed'} );
+                                               # TODO work out whether not_followed gets included after iteration
+                                       }
+                               } # foreach parent
+                               
+                       } # if parents
+                       foreach my $k ( keys %seen_rel ) {
+                               add_to_hash( \%rels_found, $k, 1 );
+                       }
+               } # foreach rdg
+       }
+       # Print out the stats
+       # First see how stable the text was
+       my $total = $c->end->rank - 1;
+       say sprintf( "Total locations %d, total variant locations %d", 
+                                $total, $results->{'variant_count'} );
+       say $results->{'genealogical_count'} 
+               . " variant locations entirely followed the stemma";
+       say $results->{'conflict_count'}
+               . " variant readings conflicted with the stemma";
+       foreach my $k ( keys %rels_found ) {
+               my $shifts = $rels_found{$k};
+               say "Had $shifts total $k-type shifts in $total locations";
+       }
+       foreach my $k ( keys %stats ) {
+               say "\tType $k:";
+               say sprintf( "\tUnstable readings shifted %d times, and were followed %d times",
+                                        $stats{$k}->{'shifts'}, $stats{$k}->{'follows'} );
+       }
+}
+
+sub add_to_hash {
+       my( $hash, $key, $num ) = @_;
+       unless( exists $hash->{$key} ) {
+               $hash->{$key} = 0;
+       }
+       $hash->{$key} += $num;
+}
\ No newline at end of file