From: Tara L Andrews <tla@mit.edu>
Date: Mon, 16 Jul 2012 23:24:00 +0000 (+0200)
Subject: analysis script for upcoming presentation
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?p=scpubgit%2Fstemmatology.git;a=commitdiff_plain;h=a23b3715cfb0f935843b2e0043fdbdeb0a0a706c

analysis script for upcoming presentation
---

diff --git a/script/analyze.pl b/script/analyze.pl
new file mode 100755
index 0000000..b34249c
--- /dev/null
+++ b/script/analyze.pl
@@ -0,0 +1,140 @@
+#!/usr/bin/env perl
+
+use feature 'say';
+use lib 'lib';
+use strict;
+use warnings;
+use Text::CSV_XS;
+use Text::Tradition::Analysis qw/ run_analysis /;
+use Text::Tradition::Directory;
+
+binmode STDOUT, ':utf8';
+binmode STDERR, ':utf8';
+
+my $dir = Text::Tradition::Directory->new(
+    'dsn' => 'dbi:SQLite:dbname=stemmaweb/db/traditions.db',
+    );
+
+my $scope = $dir->new_scope();
+my $lookfor = shift @ARGV || '';
+my $collapse = [ @ARGV ];
+
+my @relation_types = qw/ orthographic spelling grammatical lexical
+	transposition addition deletion wordsimilar unknown /;
+
+my @resultfields = qw/
+	text_name loc_total loc_totalvariant loc_genealogical loc_genvariant 
+	loc_conflict loc_conflictvariant percent_genealogical percent_genvariant /;
+map { push( @resultfields, "gen_$_", "gen_${_}_nonsense", "gen_${_}_ungramm" ) }
+	@relation_types;
+map { push( @resultfields, "con_$_", "con_${_}_nonsense", "con_${_}_ungramm" ) }
+	@relation_types;
+map { push( @resultfields, "percent_gen_$_", "percent_${_}_gen" ) } @relation_types;
+map { push( @resultfields, "percent_con_$_" ) } @relation_types;
+	
+my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } );
+open my $fh, ">:encoding(UTF-8)", "analysis.csv" or die "analysis.csv: $!";
+if( $csv->combine( @resultfields ) ) {
+	say $fh $csv->string;
+} else {
+	say "combine() failed on argument: " . $csv->error_input;
+}
+
+foreach my $tinfo( $dir->traditionlist ) {
+	next if $tinfo->{'name'} eq 'xxxxx';
+	next unless $tinfo->{'id'} eq $lookfor
+		|| $tinfo->{'name'} =~ /$lookfor/;
+	my $tradition = $dir->lookup( $tinfo->{'id'} );
+	say "Analyzing tradition " . $tradition->name;
+	my %datahash;
+	# Initialize everything with zeroes
+	map { $datahash{$_} = 0 } @resultfields;
+	# Put in the real text ID and name
+	$datahash{text_id} = $tinfo->{'id'};
+	$datahash{text_name} = $tradition->name;
+	
+	# Run the analysis for each row in @rows
+	my %opts = ( exclude_type1 => 1 );
+	if( @$collapse ) {
+		$opts{merge_types} = $collapse;
+	}
+	
+	my $result = run_analysis( $tradition, %opts );
+	$datahash{loc_total} = $result->{variant_count};
+	$datahash{loc_genealogical} = $result->{genealogical_count};
+	$datahash{loc_conflict} = $result->{variant_count} - $result->{genealogical_count};
+	$datahash{loc_conflictvariant} = $result->{conflict_count};
+	# Get the number of total and genealogical variants as we go below.
+	my $totalvariant = 0;
+	my $genvariant = 0;
+	my @unknown;
+	foreach my $loc ( @{$result->{variants}} ) {
+		# A transition is the relationship type between parent and child.
+		# Find each genealogical transition and get the relationship type (if any)
+		# Find each non-genealogical transition and get the relationship type (if any)
+		# Someday, look for reversions
+		foreach my $rdghash( @{$loc->{readings}} ) {
+			# Weed out singletons
+			my @roots = @{$rdghash->{independent_occurrence}};
+			next if @roots == 1 && !$rdghash->{'followed'} && !$rdghash->{'not_followed'}
+				&& !$rdghash->{'follow_unknown'};
+			# TODO Weed out punctuation
+			my $rdg = $tradition->collation->reading( $rdghash->{readingid} );
+			my $typekey = @roots == 1 ? 'gen_' : 'con_';
+			foreach my $p ( keys %{$rdghash->{reading_parents}} ) {
+				my $pdata = $rdghash->{reading_parents}->{$p};
+				my $relation;
+				if( $pdata->{relation} ) {
+					$relation = $pdata->{relation}->{type};
+				} else {
+					$relation = 'unknown';
+					if( !$rdg ) {
+						say "Unknown relation on missing reading object " 
+							. $rdghash->{readingid} . " at rank " . $loc->{id};
+					} elsif( !$pdata ) {
+						say "Unknown relation on missing parent object for " 
+							. $rdghash->{readingid} . " at rank " . $loc->{id};
+					
+					} else {
+						push( @unknown, [ $pdata->{label}, $rdg->id, $rdg->text, 
+							( @roots == 1 ? 'genealogical' : 'conflicting' ) ] );
+					}
+				}
+				$typekey .= $relation;
+				$datahash{$typekey}++;
+				## TODO distinguish parent-bad vs. rdg-bad
+				if( $rdg && $rdg->grammar_invalid ) {
+					$datahash{$typekey.'_ungramm'} = 1;
+				} elsif( $rdg && $rdg->is_nonsense ) {
+					$datahash{$typekey.'_nonsense'} = 1;
+				}
+			}
+			$totalvariant++;
+			$genvariant++ if @roots == 1;
+		}
+	}
+	
+	# Add in the sums for the whole location
+	$datahash{'loc_genvariant'} = $genvariant;	
+	$datahash{'loc_totalvariant'} = $totalvariant;
+	$datahash{'percent_genealogical'} = $datahash{loc_genealogical} / $datahash{loc_total};
+	$datahash{'percent_genvariant'} = $genvariant / $totalvariant;
+	foreach my $type ( @relation_types ) {
+		$datahash{"percent_gen_$type"} = $datahash{"gen_$type"} / $totalvariant;
+		$datahash{"percent_con_$type"} = $datahash{"con_$type"} / $totalvariant;
+		$datahash{"percent_${type}_gen"} = 
+			$datahash{"gen_$type"} + $datahash{"con_$type"} == 0 ? 0 :
+			$datahash{"gen_$type"} / ( $datahash{"gen_$type"} + $datahash{"con_$type"} );
+	}
+	
+	# Write them out to CSV.
+	my @csvalues = map { $datahash{$_} } @resultfields;
+	if( $csv->combine( @csvalues ) ) {
+		say $fh $csv->string;
+	} else {
+		say "combine() failed on argument: " . $csv->error_input;
+	}
+	map { say sprintf( "Unknown transition %s -> %s (%s) :%s", @$_ ) } @unknown;
+}
+
+close $fh;