From: Tara L Andrews Date: Fri, 2 Mar 2012 15:53:45 +0000 (+0100) Subject: add a stats script, evade a perl bug, go back to hackier a.c. wit handling X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=1d73ecad0553ddaa59760867355ab975bb3186ed;p=scpubgit%2Fstemmatology.git add a stats script, evade a perl bug, go back to hackier a.c. wit handling --- diff --git a/lib/Text/Tradition/Analysis.pm b/lib/Text/Tradition/Analysis.pm index 7dd592a..a6c3948 100644 --- a/lib/Text/Tradition/Analysis.pm +++ b/lib/Text/Tradition/Analysis.pm @@ -121,8 +121,8 @@ sub run_analysis { my $c = $tradition->collation; my $stemma_id = $opts{'stemma_id'} || 0; - my @ranks = @{$opts{'ranks'}} if ref( $opts{'ranks'} ) eq 'ARRAY'; - my @collapse = @{$opts{'merge_types'}} if ref( $opts{'merge_types'} ) eq 'ARRAY'; + my @ranks = ref( $opts{'ranks'} ) eq 'ARRAY' ? @{$opts{'ranks'}} : (); + my @collapse = ref( $opts{'merge_types'} ) eq 'ARRAY' ? @{$opts{'merge_types'}} : (); # Get the stemma my $stemma = $tradition->stemma( $stemma_id ); @@ -194,22 +194,20 @@ by the witnesses listed in $groups->[$n]. sub group_variants { my( $tradition, $rank, $lacunose, $collapse ) = @_; my $c = $tradition->collation; - # All the regexps here are to get rid of space characters in witness names. my $aclabel = $c->ac_label; - $aclabel =~ s/\s/_/g; # Get the alignment table readings my %readings_at_rank; my @gap_wits; - foreach my $tablewit ( @{$tradition->collation->alignment_table->{'alignment'}} ) { + foreach my $tablewit ( @{$c->alignment_table->{'alignment'}} ) { my $rdg = $tablewit->{'tokens'}->[$rank-1]; my $wit = $tablewit->{'witness'}; - $wit =~ s/\s/_/g; + $wit =~ s/^(.*)\Q$aclabel\E$/${1}_ac/; if( $rdg && $rdg->{'t'}->is_lacuna ) { - _add_to_witlist( $wit, $lacunose, $aclabel ); + _add_to_witlist( $wit, $lacunose, '_ac' ); } elsif( $rdg ) { $readings_at_rank{$rdg->{'t'}->text} = $rdg->{'t'}; } else { - _add_to_witlist( $wit, \@gap_wits, $aclabel ); + _add_to_witlist( $wit, \@gap_wits, '_ac' ); } } @@ -219,12 +217,12 @@ sub group_variants { # Skip readings that have been collapsed into others. next if exists $grouped_readings{$rdg->id} && !$grouped_readings{$rdg->id}; my @wits = $rdg->witnesses; - map { s/\s/_/g } @wits; + map { s/\Q$aclabel\E$/_ac/ } @wits; if( $collapse ) { my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; }; foreach my $other ( $rdg->related_readings( $filter ) ) { my @otherwits = $other->witnesses; - map { s/\s/_/g } @otherwits; + map { s/\Q$aclabel\E$/_ac/ } @otherwits; push( @wits, @otherwits ); $grouped_readings{$other->id} = 0; } diff --git a/script/statistics.pl b/script/statistics.pl new file mode 100755 index 0000000..1490f8f --- /dev/null +++ b/script/statistics.pl @@ -0,0 +1,126 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use lib 'lib'; +use feature 'say'; +use Text::Tradition::Directory; +use Text::Tradition::Analysis qw/ run_analysis /; + +binmode STDOUT, ':utf8'; +binmode STDERR, ':utf8'; +eval { no warnings; binmode $DB::OUT, ':utf8'; $DB::deep = 1000 }; + +my $args; +my $db = 'SQLite'; +if( $ARGV[0] && $ARGV[0] eq 'mysql' ) { + $db = shift @ARGV; +} +if( $db eq 'mysql' ) { + $args = { 'dsn' => 'dbi:mysql:dbname=stemmaweb', + 'extra_args' => { 'user' => 'stemmaweb', 'password' => 'l@chmann' } }; +} else { + $args = { 'dsn' => 'dbi:SQLite:dbname=stemmaweb/db/traditions.db' }; +} +# the rest of @ARGV is tradition names + +my $dir = Text::Tradition::Directory->new( $args ); +my @traditions; +my @tlist = $dir->traditionlist; +if( @ARGV ) { + # Get only the traditions named. + foreach my $tid ( @tlist ) { + push( @traditions, $tid->{'id'} ) + if grep { $tid->{'name'} =~ /\Q$_\E/ } @ARGV; + } +} else { + @traditions = map { $_->{'id'} } @tlist; +} + +# Initialize our stats collection +my $omit = '(omitted)'; + +# Run the analysis of each tradition +# Look through the results +foreach my $tid ( @traditions ) { + my $scope = $dir->new_scope(); + my $tradition = $dir->lookup( $tid ); + printf( "\n**** TRADITION %s ****\n", $tradition->name ); + my $c = $tradition->collation; + my $results = run_analysis( $tradition ); + my %stats; + my %rels_found; + foreach my $row ( @{$results->{'variants'}} ) { + # say sprintf( "=== Looking at rank %d (%s) ===", $row->{'id'}, + # $row->{'genealogical'} ? 'genealogical' : 'not genealogical' ); + my $rdgdir = {}; + map { $rdgdir->{$_->{'readingid'}} = $_ } @{$row->{'readings'}}; + # Look for reading parents and the relationships between them. + my %seen_rel; + foreach my $rdg ( keys %$rdgdir ) { + my $rhash = $rdgdir->{$rdg}; + my $parents = $rhash->{'reading_parents'}; + if( $parents && @$parents ) { + say sprintf( " - reading %s ( %s ) has %d possible origins", + $rdg, $rhash->{'text'}, scalar @$parents ) + unless @$parents == 1; + foreach my $p ( @$parents ) { + # Is there a relationship here? + my $rel = $c->get_relationship( $rdg, $p ); + my $type; + if( $rel ) { + # Yes there is - get its type and figure stuff out. + $type = $rel->type; + } elsif( $rdg eq $omit ) { + $type = 'deletion'; + } elsif( $p eq $omit ) { + $type = 'addition'; + } # TODO need to manage transposition + if( $type ) { + # Note that there was an instability of this type + $seen_rel{$type} = 1; + $stats{$type} = {} unless exists $stats{$type}; + # Calculate, in this spot, how often the form shifted + # vs. how often it stayed the same. + # Add the number of times this form appeared + add_to_hash( $stats{$type}, 'shifts', + $rhash->{'independent_occurrence'} ); + # Add the number of times this form was followed + add_to_hash( $stats{$type}, 'follows', $rhash->{'followed'} ); + # TODO work out whether not_followed gets included after iteration + } + } # foreach parent + + } # if parents + foreach my $k ( keys %seen_rel ) { + add_to_hash( \%rels_found, $k, 1 ); + } + } # foreach rdg + } + # Print out the stats + # First see how stable the text was + my $total = $c->end->rank - 1; + say sprintf( "Total locations %d, total variant locations %d", + $total, $results->{'variant_count'} ); + say $results->{'genealogical_count'} + . " variant locations entirely followed the stemma"; + say $results->{'conflict_count'} + . " variant readings conflicted with the stemma"; + foreach my $k ( keys %rels_found ) { + my $shifts = $rels_found{$k}; + say "Had $shifts total $k-type shifts in $total locations"; + } + foreach my $k ( keys %stats ) { + say "\tType $k:"; + say sprintf( "\tUnstable readings shifted %d times, and were followed %d times", + $stats{$k}->{'shifts'}, $stats{$k}->{'follows'} ); + } +} + +sub add_to_hash { + my( $hash, $key, $num ) = @_; + unless( exists $hash->{$key} ) { + $hash->{$key} = 0; + } + $hash->{$key} += $num; +} \ No newline at end of file