From: Tara L Andrews Date: Thu, 11 Oct 2012 05:55:26 +0000 (+0200) Subject: move analysis script; update Analysis.pm for new relationship regime X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=ee26c4d95146c9798effed3f75c676049a8a1a68;p=scpubgit%2Fstemmatology.git move analysis script; update Analysis.pm for new relationship regime --- diff --git a/analysis/lib/Text/Tradition/Analysis.pm b/analysis/lib/Text/Tradition/Analysis.pm index 7777f86..45810b5 100644 --- a/analysis/lib/Text/Tradition/Analysis.pm +++ b/analysis/lib/Text/Tradition/Analysis.pm @@ -341,7 +341,7 @@ sub group_variants { # Otherwise, record it... $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'}; # ...and grab any transpositions, and their relations. - my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings(); + my @transp = grep { $_->rank != $rank } _all_related( $rdg->{'t'} ); foreach my $trdg ( @transp ) { next if exists $readings_at_rank{$trdg->id}; $has_transposition = 1; @@ -402,6 +402,14 @@ sub group_variants { return $grouped_readings; } +sub _all_related { + my $rdg = shift; + my $c = $rdg->collation; + my @all = map { $c->reading( $_ ) } $c->relations->graph->all_neighbors( $rdg ); + return @all; +} + + # Helper function to query the alignment table for all witnesses (a.c. included) # that have a given reading at its rank. sub _table_witnesses { @@ -517,6 +525,7 @@ sub _graph_for_grouping { # needed to make up the groups. $graph = $stemma->situation_graph( $extant, $acwits, $aclabel ); } catch ( Text::Tradition::Error $e ) { + $DB::single = 1; throw( "Could not extend graph with given extant and a.c. witnesses: " . $e->message ); } catch { @@ -825,7 +834,14 @@ sub _resolve_parent_relationships { $phash->{'is_ungrammatical'} = $pobj->grammar_invalid; } } elsif( $p eq '(omitted)' ) { - $phash->{relation} = { type => 'addition' }; + # Check to see if the reading in question is a repetition. + my @reps = $rdg->related_readings( 'repetition' ); + if( @reps ) { + $phash->{relation} = { type => 'repetition', + annotation => "of reading @reps" }; + } else { + $phash->{relation} = { type => 'addition' }; + } } # Save it $rdgparents->{$p} = $phash; diff --git a/base/script/analyze.pl b/analysis/script/analyze.pl similarity index 73% rename from base/script/analyze.pl rename to analysis/script/analyze.pl index 21a124a..bc7997c 100755 --- a/base/script/analyze.pl +++ b/analysis/script/analyze.pl @@ -12,30 +12,32 @@ binmode STDOUT, ':utf8'; binmode STDERR, ':utf8'; my $dir = Text::Tradition::Directory->new( - 'dsn' => 'dbi:SQLite:dbname=stemmaweb/db/traditions.db', + 'dsn' => 'dbi:SQLite:dbname=../../stemmaweb/db/traditions.db', ); -my $calcdsn = 'dbi:SQLite:dbname=db/graphs.db'; - -my $scope = $dir->new_scope(); + my $scope = $dir->new_scope(); my $lookfor = shift @ARGV || ''; -my @collapse = @ARGV; +my %collapse; +map { $collapse{$_} = 1 } @ARGV; my @relation_types = grep { !$collapse{$_} } - qw/ orthographic spelling grammatical lexical transposition addition deletion - wordsimilar unknown /; + qw/ orthographic spelling grammatical lexical transposition repetition + uncertain other addition deletion wordsimilar unknown /; my @resultfields = qw/ text_name loc_total loc_totalvariant loc_genealogical loc_genvariant loc_conflict loc_conflictvariant loc_reverted loc_revertvariant percent_genealogical percent_genvariant percent_genorrevert /; -map { push( @resultfields, "gen_$_", "gen_${_}_nonsense", "gen_${_}_ungramm" ) } - @relation_types; -map { push( @resultfields, "con_$_", "con_${_}_nonsense", "con_${_}_ungramm" ) } - @relation_types; -map { push( @resultfields, "rev_$_", "rev_${_}_nonsense", "rev_${_}_ungramm" ) } - @relation_types; -map { push( @resultfields, "percent_gen_$_", "percent_${_}_gen" ) } @relation_types; -map { push( @resultfields, "percent_con_$_", "percent_rev_$_" ) } @relation_types; +map { push( @resultfields, "gen_$_", "con_$_", "rev_$_" ) } @relation_types; +# map { push( @resultfields, "gen_$_", "gen_${_}_nonsense", "gen_${_}_ungramm" ) } +# @relation_types; +# map { push( @resultfields, "con_$_", "con_${_}_nonsense", "con_${_}_ungramm" ) } +# @relation_types; +# map { push( @resultfields, "rev_$_", "rev_${_}_nonsense", "rev_${_}_ungramm" ) } +# @relation_types; +map { push( @resultfields, "percent_gen_$_", "percent_${_}_gen", + "percent_${_}_notcon" ) } @relation_types; +map { push( @resultfields, "percent_con_$_", "percent_rev_$_", + "percent_noncon_$_" ) } @relation_types; my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } ); open my $fh, ">:encoding(UTF-8)", "analysis.csv" or die "analysis.csv: $!"; @@ -61,10 +63,9 @@ foreach my $tinfo( $dir->traditionlist ) { # Run the analysis for each row in @rows my %opts = ( exclude_type1 => 1, - merge_types => [ 'punctuation' ], - calcdsn => $calcdsn ); - if( @collapse ) { - push( @{$opts{merge_types}}, @collapse ); + merge_types => [ 'punctuation' ] ); + if( keys %collapse ) { + push( @{$opts{merge_types}}, keys %collapse ); } my $result = run_analysis( $tradition, %opts ); @@ -84,6 +85,10 @@ foreach my $tinfo( $dir->traditionlist ) { # Find each genealogical transition and get the relationship type (if any) # Find each non-genealogical transition and get the relationship type (if any) my( $loc_conflict, $loc_reversion ); + if( exists $loc->{unsolved} ) { + say STDERR "Skipping unsolved location at " . $loc->{id}; + next; + } foreach my $rdghash( @{$loc->{readings}} ) { # Weed out singletons my @roots = @{$rdghash->{independent_occurrence}}; @@ -137,11 +142,19 @@ foreach my $tinfo( $dir->traditionlist ) { $datahash{'percent_genvariant'} = $genvariant / $totalvariant; $datahash{'percent_genorrevert'} = ( $genvariant + $datahash{loc_revertvariant} ) / $totalvariant; foreach my $type ( @relation_types ) { - $datahash{"percent_gen_$type"} = $datahash{"gen_$type"} / $totalvariant; - $datahash{"percent_con_$type"} = $datahash{"con_$type"} / $totalvariant; + my $pgtype = $datahash{"gen_$type"}; + my $pctype = $datahash{"con_$type"}; + my $prtype = $datahash{"rev_$type"}; + $datahash{"percent_gen_$type"} = $pgtype / $totalvariant; + $datahash{"percent_con_$type"} = $pctype / $totalvariant; + $datahash{"percent_rev_$type"} = $prtype / $totalvariant; + $datahash{"percent_notcon_$type"} = ( $pgtype + $prtype ) / $totalvariant; $datahash{"percent_${type}_gen"} = - $datahash{"gen_$type"} + $datahash{"con_$type"} == 0 ? 0 : - $datahash{"gen_$type"} / ( $datahash{"gen_$type"} + $datahash{"con_$type"} ); + $pgtype + $pctype + $prtype == 0 + ? 0 : $pgtype / ( $pgtype + $pctype + $prtype ); + $datahash{"percent_${type}_notcon"} = + $pgtype + $pctype + $prtype == 0 + ? 0 : ( $pgtype + $prtype ) / ( $pgtype + $pctype + $prtype ); } # Write them out to CSV. @@ -176,10 +189,10 @@ sub _add_reading_relations { my $typekey = substr( $type, 0, 3 ) . "_$relation"; $datahash->{$typekey}++; ## TODO distinguish parent-bad vs. rdg-bad - if( $robj && $robj->grammar_invalid ) { - $datahash->{$typekey.'_ungramm'} = 1; - } elsif( $robj && $robj->is_nonsense ) { - $datahash->{$typekey.'_nonsense'} = 1; - } +# if( $robj && $robj->grammar_invalid ) { +# $datahash->{$typekey.'_ungramm'} = 1; +# } elsif( $robj && $robj->is_nonsense ) { +# $datahash->{$typekey.'_nonsense'} = 1; +# } } } \ No newline at end of file