From: tla Date: Sat, 27 Oct 2012 12:08:25 +0000 (+0200) Subject: ignore collation rels; make multiparent and source relations visible in counts; new... X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=3bce8f646dafaceb98d875eb6cf3274e88d0bce7;p=scpubgit%2Fstemmatology.git ignore collation rels; make multiparent and source relations visible in counts; new script for variation magnitude --- diff --git a/analysis/lib/Text/Tradition/Analysis.pm b/analysis/lib/Text/Tradition/Analysis.pm index aa02e85..2d15b8f 100644 --- a/analysis/lib/Text/Tradition/Analysis.pm +++ b/analysis/lib/Text/Tradition/Analysis.pm @@ -835,6 +835,9 @@ sub _resolve_parent_relationships { my $phash = { 'label' => $prep }; if( $pobj ) { my $rel = $c->get_relationship( $p, $rid ); + if( $rel && $rel->type eq 'collated' ) { + $rel = undef; + } if( $rel ) { _add_to_hash( $rel, $phash ); } elsif( $rdg ) { diff --git a/analysis/script/analyze.pl b/analysis/script/analyze.pl index 6770ff4..18aa4c3 100755 --- a/analysis/script/analyze.pl +++ b/analysis/script/analyze.pl @@ -14,10 +14,12 @@ binmode STDOUT, ':utf8'; binmode STDERR, ':utf8'; my( $dsn, $dbuser, $dbpass ); +my $filename = 'analysis.csv'; GetOptions( 'dsn=s' => \$dsn, 'u|user=s' => \$dbuser, 'p|pass=s' => \$dbpass, + 'f|file=s' => \$filename ); my %dbopts = ( dsn => $dsn ); @@ -36,15 +38,15 @@ if( @ARGV ) { my @relation_types = grep { !$collapse{$_} } qw/ orthographic spelling grammatical lexical transposition repetition - uncertain other addition deletion wordsimilar unknown /; + uncertain other addition deletion wordsimilar unknown source /; my @resultfields = qw/ - text_name loc_total loc_singleton loc_totalvariant loc_genealogical loc_genvariant + text_name loc_total loc_singleton multiparent loc_totalvariant loc_genealogical loc_genvariant loc_conflict loc_conflictvariant loc_reverted loc_revertvariant /; map { push( @resultfields, "gen_$_", "con_$_", "rev_$_" ) } @relation_types; my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } ); -open my $fh, ">:encoding(UTF-8)", "analysis.csv" or die "analysis.csv: $!"; +open my $fh, ">:encoding(UTF-8)", $filename or die "$filename: $!"; if( $csv->combine( @resultfields ) ) { say $fh $csv->string; } else { @@ -90,6 +92,7 @@ foreach my $tinfo( $dir->traditionlist ) { next; } $datahash{loc_total} = $result->{variant_count}; + $datahash{multiparent} = 0; #$datahash{loc_genealogical} = $result->{genealogical_count}; #$datahash{loc_conflictvariant} = $result->{conflict_count}; #$datahash{loc_revertvariant} = $result->{reversion_count}; @@ -151,17 +154,10 @@ foreach my $tinfo( $dir->traditionlist ) { # Add relation stats for reading parents. If the reading is reverted, # treat it as genealogical for the parent. + my $phash = $type eq 'reverted' + ? $rdghash->{'reversion_parents'} : $rdghash->{'source_parents'}; _add_reading_relations( $rdghash->{'readingid'}, $loc->{'id'}, $rdg, - ( $type eq 'reverted' ? 'genealogical' : $type ), - $rdghash->{'source_parents'}, \%datahash, \@unknown ); - # Add relation stats for reading reversions if they exist. - if( $type eq 'reverted' ) { - # Get relationship between reverted readings and their non-matching - # parents. - _add_reading_relations( $rdghash->{'readingid'}, $loc->{'id'}, $rdg, - $type, $rdghash->{'reversion_parents'}, \%datahash, \@unknown ); - } - + $type, $phash, \%datahash, \@unknown ); } if( $loc_conflict ) { $conflictloc++; @@ -199,11 +195,20 @@ close $fh; sub _add_reading_relations { my( $rid, $rank, $robj, $type, $parenthash, $datahash, $unknown ) = @_; - foreach my $p ( keys %$parenthash ) { + my @kp = keys ( %$parenthash ); + unless( @kp ) { + _increment_typekey( $datahash, $type, 'source' ); + return; + } + if( @kp > 1 ) { + $datahash->{multiparent} = @kp - 1; + } + foreach my $p ( @kp ) { my $pdata = $parenthash->{$p}; my $relation; if( $pdata->{relation} ) { - $relation = $pdata->{relation}->{type}; + $relation = $pdata->{relation}->{transposed} + ? 'transposition' : $pdata->{relation}->{type}; } else { $relation = 'unknown'; if( !$robj ) { @@ -214,13 +219,22 @@ sub _add_reading_relations { push( @$unknown, [ $pdata->{label}, $robj->id, $robj->text, $type ] ); } } - my $typekey = substr( $type, 0, 3 ) . "_$relation"; - $datahash->{$typekey}++; - ## TODO distinguish parent-bad vs. rdg-bad -# if( $robj && $robj->grammar_invalid ) { -# $datahash->{$typekey.'_ungramm'} = 1; -# } elsif( $robj && $robj->is_nonsense ) { -# $datahash->{$typekey.'_nonsense'} = 1; -# } + _increment_typekey( $datahash, $type, $relation ); + } +} + +sub _increment_typekey { + my( $datahash, $type, $relation ) = @_; + my $typekey = substr( $type, 0, 3 ) . "_$relation"; + unless( exists $datahash->{$typekey} ) { + $DB::single = 1; + warn "No field for $typekey"; } + $datahash->{$typekey}++; +# # TODO distinguish parent-bad vs. rdg-bad +# if( $robj && $robj->grammar_invalid ) { +# $datahash->{$typekey.'_ungramm'} = 1; +# } elsif( $robj && $robj->is_nonsense ) { +# $datahash->{$typekey.'_nonsense'} = 1; +# } } diff --git a/analysis/script/magnitude.pl b/analysis/script/magnitude.pl new file mode 100755 index 0000000..afcd917 --- /dev/null +++ b/analysis/script/magnitude.pl @@ -0,0 +1,82 @@ +#!/usr/bin/env perl + +use feature 'say'; +use lib 'lib'; +use strict; +use warnings; +use Getopt::Long; +use Set::Scalar; +use Text::CSV_XS; +use Text::Tradition::Analysis qw/ group_variants /; +use Text::Tradition::Directory; +use TryCatch; + +binmode STDOUT, ':utf8'; +binmode STDERR, ':utf8'; + +my( $dsn, $dbuser, $dbpass ); +my $filename = 'magnitude.csv'; +GetOptions( + 'dsn=s' => \$dsn, + 'u|user=s' => \$dbuser, + 'p|pass=s' => \$dbpass, + 'f|file=s' => \$filename +); + +my %dbopts = ( dsn => $dsn ); +if( $dbuser || $dbpass ) { + $dbopts{extra_args} = { user => $dbuser, password => $dbpass } +} + +my $dir = Text::Tradition::Directory->new( %dbopts ); +my $scope = $dir->new_scope(); +my $lookfor = shift @ARGV || ''; +my $collapse = Set::Scalar->new(); +if( @ARGV ) { + say "Merging relationship types @ARGV"; + map { $collapse->insert($_) } @ARGV; +} + +my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } ); +open my $fh, ">:encoding(UTF-8)", $filename or die "$filename: $!"; + +foreach my $tinfo( $dir->traditionlist ) { + next if $tinfo->{'name'} eq 'xxxxx'; + next if $tinfo->{'name'} =~ /158/; + next if $tinfo->{'name'} =~ /Heinrichi part/; + if( $lookfor ) { + next unless $tinfo->{'id'} eq $lookfor + || $tinfo->{'name'} =~ /$lookfor/; + } + my $tradition = $dir->lookup( $tinfo->{'id'} ); + say "Counting variation in tradition " . $tradition->name; + + # Group the variants for each rank, and count the number of + # reading groupings. + my $lcph = Set::Scalar->new(); # placeholder for lacunae + my $moved = {}; + my %magnitudes; + my $max = 1; + foreach my $rk ( 1 .. $tradition->collation->end->rank ) { + my $missing = $lcph->clone(); + my $rankgroup = group_variants( $tradition, $rk, $missing, $moved, $collapse ); + my $numr = scalar keys %$rankgroup; + $numr++ if $missing->size; + $max = $numr if $numr > $max; + if( exists $magnitudes{$numr} ) { + $magnitudes{$numr}++ + } else { + $magnitudes{$numr} = 1; + } + } + + # Write them out to CSV. + my @csvalues = map { $magnitudes{$_} || 0 } 2..$max; + if( $csv->combine( $tradition->name, @csvalues ) ) { + say $fh $csv->string; + } else { + say "combine() failed on argument: " . $csv->error_input; + } +} + +close $fh;