From: Tara L Andrews Date: Mon, 29 Oct 2012 16:57:37 +0000 (+0100) Subject: another script to exclude relatioship types in turn from the analysis X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=3157336785d7a4a1e64ba00269cb139db335ea0f;p=scpubgit%2Fstemmatology.git another script to exclude relatioship types in turn from the analysis --- diff --git a/analysis/script/exclude.pl b/analysis/script/exclude.pl new file mode 100755 index 0000000..074cc86 --- /dev/null +++ b/analysis/script/exclude.pl @@ -0,0 +1,192 @@ +#!/usr/bin/env perl + +use feature 'say'; +use lib 'lib'; +use strict; +use warnings; +use Getopt::Long; +use Text::CSV_XS; +use Text::Tradition::Analysis qw/ run_analysis /; +use Text::Tradition::Directory; +use TryCatch; + +binmode STDOUT, ':utf8'; +binmode STDERR, ':utf8'; + +my( $dsn, $dbuser, $dbpass ); +my $filename = 'analysis.csv'; +GetOptions( + 'dsn=s' => \$dsn, + 'u|user=s' => \$dbuser, + 'p|pass=s' => \$dbpass, + 'f|file=s' => \$filename +); + +my %dbopts = ( dsn => $dsn ); +if( $dbuser || $dbpass ) { + $dbopts{extra_args} = { user => $dbuser, password => $dbpass } +} + +my $dir = Text::Tradition::Directory->new( %dbopts ); +my $scope = $dir->new_scope(); +my $lookfor = shift @ARGV || ''; +my %collapse; +if( @ARGV ) { + say "Merging relationship types @ARGV"; + map { $collapse{$_} = 1 } @ARGV; +} + +## Set up the relationship types we will exclude in turn. False means "run +## analysis with basic set of exclusions", i.e. orth/spelling/punct, and exclude +## the variants in question later. True means "explicitly exclude this type too +## at analysis time." +my %relation_types = ( + sameword => undef, + grammatical => 1, + lexical => 1, + uncertain => 1, + other => 1, + addition => undef, + deletion => undef +); + +# Set up the things we want to calculate for each text +my @calcs = qw/ total genealogical excoincidental reverted exgenealogical coincidental /; +my @resultfields = ( 'text_name' ); +foreach my $rt ( keys %relation_types ) { + foreach my $cc ( @calcs ) { + push( @resultfields, sprintf( "%s_ex_%s", $cc, $rt ) ); + } +} + +my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } ); +open my $fh, ">:encoding(UTF-8)", $filename or die "$filename: $!"; +if( $csv->combine( @resultfields ) ) { + say $fh $csv->string; +} else { + say "combine() failed on argument: " . $csv->error_input; +} + +foreach my $tinfo( $dir->traditionlist ) { + next if $tinfo->{'name'} eq 'xxxxx'; + next if $tinfo->{'name'} =~ /158/; + next if $tinfo->{'name'} =~ /Heinrichi part/; + if( $lookfor ) { + next unless $tinfo->{'id'} eq $lookfor + || $tinfo->{'name'} =~ /$lookfor/; + } + my $tradition = $dir->lookup( $tinfo->{'id'} ); + next unless $tradition->stemma_count; + say "Analyzing tradition " . $tradition->name; + ## HACK + my $MAXRANK; + if( $tradition->name =~ /Chronicle/ ) { + $MAXRANK = $tradition->collation->reading('L1545')->rank; + } + my %datahash; + # Initialize everything with zeroes + map { $datahash{$_} = 0 } @resultfields; + # Put in the real text ID and name + $datahash{text_id} = $tinfo->{'id'}; + $datahash{text_name} = $tradition->name; + + # Run the analysis for each row in @rows + my $vanilla; # Store the run with no extra exclusions + my $result; + foreach my $type ( keys %relation_types ) { + say "...calculating on exclusion of $type"; + if( $relation_types{$type} ) { + $result = run_exclude( $tradition, $type ); + } elsif( !$vanilla ) { + $result = run_exclude( $tradition ); + $vanilla = $result; + } else { + $result = $vanilla; + } + + # Get the totals by location and by variant as we go. + my $totalvariant = 0; + my $singleton = 0; + my $genvariant = 0; + my $conflictvariant = 0; + my $revertvariant = 0; + my $msgd; # for the HACK + my @unknown; + foreach my $loc ( @{$result->{variants}} ) { + # A transition is the relationship type between parent and child. + # Find each genealogical transition + # Find each non-genealogical transition + if( exists $loc->{unsolved} ) { + # Not solved; remove it from the total. + say "Skipping unsolved location at " . $loc->{id}; + next; + } elsif( $MAXRANK && $loc->{id} > $MAXRANK ) { + # HACK until Chronicle tagging is done + say "Skipping ranks above $MAXRANK" + unless $msgd; + $msgd = 1; + next; + } + foreach my $rdghash( @{$loc->{readings}} ) { + # Weed out singletons + $totalvariant++; + my @roots = @{$rdghash->{independent_occurrence}}; + if( @roots == 1 && !$rdghash->{'followed'} && !$rdghash->{'not_followed'} + && !$rdghash->{'follow_unknown'} ) { + $singleton++; + next; + } + my $type; + if( $rdghash->{'is_conflict'} ) { + $type = 'conflict'; + $conflictvariant++; + } elsif( $rdghash->{'is_reverted'} ) { + $type = 'reverted'; + $revertvariant++; + } elsif( @roots == 1 ) { + $type = 'genealogical'; + $genvariant++; + } else { + warn 'Reading ' . $rdghash->{readingid} . ' neither conflict, genealogical, nor reverted. What?'; + $type = 'ERROR'; + } + } + } + + # Add in the sums for the whole location + $datahash{"total_$type"} = $totalvariant - $singleton; + $datahash{"genealogical_ex_$type"} = $genvariant; + $datahash{"reverted_ex_$type"} = $revertvariant; + $datahash{"coincidental_ex_$type"} = $conflictvariant; + $datahash{"excoincidental_ex_type"} = $genvariant + $revertvariant; + $datahash{"exgenealogical_ex_type"} = $conflictvariant + $revertvariant; + } + + # Write them out to CSV. + my @csvalues = map { $datahash{$_} } @resultfields; + if( $csv->combine( @csvalues ) ) { + say $fh $csv->string; + } else { + say "combine() failed on argument: " . $csv->error_input; + } +} + +close $fh; + +sub run_exclude { + my( $tradition, $type ) = @_; + my $merge = [ qw/ orthographic spelling punctuation / ]; + if( $type && $relation_types{$type} ) { + push( @$merge, $type ); + } + + my $result; + try { + $result = run_analysis( $tradition, exclude_type1 => 1, + merge_types => $merge ); + } catch { + say "Analysis run failed on tradition " . $tradition->name . ": @_"; + return; + } + return $result; +}