[scpubgit/stemmatology.git] / script / analyze.pl

#!/usr/bin/env perl

use feature 'say';
use lib 'lib';
use strict;
use warnings;
use Text::CSV_XS;
use Text::Tradition::Analysis qw/ run_analysis /;
use Text::Tradition::Directory;

binmode STDOUT, ':utf8';
binmode STDERR, ':utf8';

my $dir = Text::Tradition::Directory->new(
    'dsn' => 'dbi:SQLite:dbname=stemmaweb/db/traditions.db',
    );

my $scope = $dir->new_scope();
my $lookfor = shift @ARGV || '';
my $collapse = [ @ARGV ];

my @relation_types = qw/ orthographic spelling grammatical lexical
	transposition addition deletion wordsimilar unknown /;

my @resultfields = qw/
	text_name loc_total loc_totalvariant loc_genealogical loc_genvariant 
	loc_conflict loc_conflictvariant percent_genealogical percent_genvariant /;
map { push( @resultfields, "gen_$_", "gen_${_}_nonsense", "gen_${_}_ungramm" ) }
	@relation_types;
map { push( @resultfields, "con_$_", "con_${_}_nonsense", "con_${_}_ungramm" ) }
	@relation_types;
map { push( @resultfields, "percent_gen_$_", "percent_${_}_gen" ) } @relation_types;
map { push( @resultfields, "percent_con_$_" ) } @relation_types;
	
my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } );
open my $fh, ">:encoding(UTF-8)", "analysis.csv" or die "analysis.csv: $!";
if( $csv->combine( @resultfields ) ) {
	say $fh $csv->string;
} else {
	say "combine() failed on argument: " . $csv->error_input;
}

foreach my $tinfo( $dir->traditionlist ) {
	next if $tinfo->{'name'} eq 'xxxxx';
	next unless $tinfo->{'id'} eq $lookfor
		|| $tinfo->{'name'} =~ /$lookfor/;
	my $tradition = $dir->lookup( $tinfo->{'id'} );
	say "Analyzing tradition " . $tradition->name;
	my %datahash;
	# Initialize everything with zeroes
	map { $datahash{$_} = 0 } @resultfields;
	# Put in the real text ID and name
	$datahash{text_id} = $tinfo->{'id'};
	$datahash{text_name} = $tradition->name;
	
	# Run the analysis for each row in @rows
	my %opts = ( exclude_type1 => 1 );
	if( @$collapse ) {
		$opts{merge_types} = $collapse;
	}
	
	my $result = run_analysis( $tradition, %opts );
	$datahash{loc_total} = $result->{variant_count};
	$datahash{loc_genealogical} = $result->{genealogical_count};
	$datahash{loc_conflict} = $result->{variant_count} - $result->{genealogical_count};
	$datahash{loc_conflictvariant} = $result->{conflict_count};
	# Get the number of total and genealogical variants as we go below.
	my $totalvariant = 0;
	my $genvariant = 0;
	my @unknown;
	foreach my $loc ( @{$result->{variants}} ) {
		# A transition is the relationship type between parent and child.
		# Find each genealogical transition and get the relationship type (if any)
		# Find each non-genealogical transition and get the relationship type (if any)
		# Someday, look for reversions
		foreach my $rdghash( @{$loc->{readings}} ) {
			# Weed out singletons
			my @roots = @{$rdghash->{independent_occurrence}};
			next if @roots == 1 && !$rdghash->{'followed'} && !$rdghash->{'not_followed'}
				&& !$rdghash->{'follow_unknown'};
			# TODO Weed out punctuation
			my $rdg = $tradition->collation->reading( $rdghash->{readingid} );
			my $typekey = @roots == 1 ? 'gen_' : 'con_';
			foreach my $p ( keys %{$rdghash->{reading_parents}} ) {
				my $pdata = $rdghash->{reading_parents}->{$p};
				my $relation;
				if( $pdata->{relation} ) {
					$relation = $pdata->{relation}->{type};
				} else {
					$relation = 'unknown';
					if( !$rdg ) {
						say "Unknown relation on missing reading object " 
							. $rdghash->{readingid} . " at rank " . $loc->{id};
					} elsif( !$pdata ) {
						say "Unknown relation on missing parent object for " 
							. $rdghash->{readingid} . " at rank " . $loc->{id};
					
					} else {
						push( @unknown, [ $pdata->{label}, $rdg->id, $rdg->text, 
							( @roots == 1 ? 'genealogical' : 'conflicting' ) ] );
					}
				}
				$typekey .= $relation;
				$datahash{$typekey}++;
				## TODO distinguish parent-bad vs. rdg-bad
				if( $rdg && $rdg->grammar_invalid ) {
					$datahash{$typekey.'_ungramm'} = 1;
				} elsif( $rdg && $rdg->is_nonsense ) {
					$datahash{$typekey.'_nonsense'} = 1;
				}
			}
			$totalvariant++;
			$genvariant++ if @roots == 1;
		}
	}
	
	# Add in the sums for the whole location
	$datahash{'loc_genvariant'} = $genvariant;	
	$datahash{'loc_totalvariant'} = $totalvariant;
	$datahash{'percent_genealogical'} = $datahash{loc_genealogical} / $datahash{loc_total};
	$datahash{'percent_genvariant'} = $genvariant / $totalvariant;
	foreach my $type ( @relation_types ) {
		$datahash{"percent_gen_$type"} = $datahash{"gen_$type"} / $totalvariant;
		$datahash{"percent_con_$type"} = $datahash{"con_$type"} / $totalvariant;
		$datahash{"percent_${type}_gen"} = 
			$datahash{"gen_$type"} + $datahash{"con_$type"} == 0 ? 0 :
			$datahash{"gen_$type"} / ( $datahash{"gen_$type"} + $datahash{"con_$type"} );
	}
	
	# Write them out to CSV.
	my @csvalues = map { $datahash{$_} } @resultfields;
	if( $csv->combine( @csvalues ) ) {
		say $fh $csv->string;
	} else {
		say "combine() failed on argument: " . $csv->error_input;
	}
	map { say sprintf( "Unknown transition %s -> %s (%s) :%s", @$_ ) } @unknown;
}

close $fh;
Commit	Line	Data
a23b3715	1	#!/usr/bin/env perl
	2
	3	use feature 'say';
	4	use lib 'lib';
	5	use strict;
	6	use warnings;
	7	use Text::CSV_XS;
	8	use Text::Tradition::Analysis qw/ run_analysis /;
	9	use Text::Tradition::Directory;
	10
	11	binmode STDOUT, ':utf8';
	12	binmode STDERR, ':utf8';
	13
	14	my $dir = Text::Tradition::Directory->new(
	15	'dsn' => 'dbi:SQLite:dbname=stemmaweb/db/traditions.db',
	16	);
	17
	18	my $scope = $dir->new_scope();
	19	my $lookfor = shift @ARGV \|\| '';
	20	my $collapse = [ @ARGV ];
	21
	22	my @relation_types = qw/ orthographic spelling grammatical lexical
	23	transposition addition deletion wordsimilar unknown /;
	24
	25	my @resultfields = qw/
	26	text_name loc_total loc_totalvariant loc_genealogical loc_genvariant
	27	loc_conflict loc_conflictvariant percent_genealogical percent_genvariant /;
	28	map { push( @resultfields, "gen_$_", "gen_${_}_nonsense", "gen_${_}_ungramm" ) }
	29	@relation_types;
	30	map { push( @resultfields, "con_$_", "con_${_}_nonsense", "con_${_}_ungramm" ) }
	31	@relation_types;
	32	map { push( @resultfields, "percent_gen_$_", "percent_${_}_gen" ) } @relation_types;
	33	map { push( @resultfields, "percent_con_$_" ) } @relation_types;
	34
	35	my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } );
	36	open my $fh, ">:encoding(UTF-8)", "analysis.csv" or die "analysis.csv: $!";
	37	if( $csv->combine( @resultfields ) ) {
	38	say $fh $csv->string;
	39	} else {
	40	say "combine() failed on argument: " . $csv->error_input;
	41	}
	42
	43	foreach my $tinfo( $dir->traditionlist ) {
	44	next if $tinfo->{'name'} eq 'xxxxx';
	45	next unless $tinfo->{'id'} eq $lookfor
	46	\|\| $tinfo->{'name'} =~ /$lookfor/;
	47	my $tradition = $dir->lookup( $tinfo->{'id'} );
	48	say "Analyzing tradition " . $tradition->name;
	49	my %datahash;
	50	# Initialize everything with zeroes
	51	map { $datahash{$_} = 0 } @resultfields;
	52	# Put in the real text ID and name
	53	$datahash{text_id} = $tinfo->{'id'};
	54	$datahash{text_name} = $tradition->name;
	55
	56	# Run the analysis for each row in @rows
	57	my %opts = ( exclude_type1 => 1 );
	58	if( @$collapse ) {
	59	$opts{merge_types} = $collapse;
	60	}
	61
	62	my $result = run_analysis( $tradition, %opts );
	63	$datahash{loc_total} = $result->{variant_count};
	64	$datahash{loc_genealogical} = $result->{genealogical_count};
65	$datahash{loc_conflict} = $result->{variant_count} - $result->{genealogical_count};
66	$datahash{loc_conflictvariant} = $result->{conflict_count};
67	# Get the number of total and genealogical variants as we go below.
68	my $totalvariant = 0;
69	my $genvariant = 0;
70	my @unknown;
71	foreach my $loc ( @{$result->{variants}} ) {
72	# A transition is the relationship type between parent and child.
73	# Find each genealogical transition and get the relationship type (if any)
74	# Find each non-genealogical transition and get the relationship type (if any)
75	# Someday, look for reversions
76	foreach my $rdghash( @{$loc->{readings}} ) {
77	# Weed out singletons
78	my @roots = @{$rdghash->{independent_occurrence}};
79	next if @roots == 1 && !$rdghash->{'followed'} && !$rdghash->{'not_followed'}
80	&& !$rdghash->{'follow_unknown'};
81	# TODO Weed out punctuation
82	my $rdg = $tradition->collation->reading( $rdghash->{readingid} );
83	my $typekey = @roots == 1 ? 'gen_' : 'con_';
84	foreach my $p ( keys %{$rdghash->{reading_parents}} ) {
85	my $pdata = $rdghash->{reading_parents}->{$p};
86	my $relation;
87	if( $pdata->{relation} ) {
88	$relation = $pdata->{relation}->{type};
89	} else {
90	$relation = 'unknown';
91	if( !$rdg ) {
92	say "Unknown relation on missing reading object "
93	. $rdghash->{readingid} . " at rank " . $loc->{id};
94	} elsif( !$pdata ) {
95	say "Unknown relation on missing parent object for "
96	. $rdghash->{readingid} . " at rank " . $loc->{id};
97
98	} else {
99	push( @unknown, [ $pdata->{label}, $rdg->id, $rdg->text,
100	( @roots == 1 ? 'genealogical' : 'conflicting' ) ] );
101	}
102	}
103	$typekey .= $relation;
104	$datahash{$typekey}++;
105	## TODO distinguish parent-bad vs. rdg-bad
106	if( $rdg && $rdg->grammar_invalid ) {
107	$datahash{$typekey.'_ungramm'} = 1;
108	} elsif( $rdg && $rdg->is_nonsense ) {
109	$datahash{$typekey.'_nonsense'} = 1;
110	}
111	}
112	$totalvariant++;
113	$genvariant++ if @roots == 1;
114	}
115	}
116
117	# Add in the sums for the whole location
118	$datahash{'loc_genvariant'} = $genvariant;
119	$datahash{'loc_totalvariant'} = $totalvariant;
120	$datahash{'percent_genealogical'} = $datahash{loc_genealogical} / $datahash{loc_total};
121	$datahash{'percent_genvariant'} = $genvariant / $totalvariant;
122	foreach my $type ( @relation_types ) {
123	$datahash{"percent_gen_$type"} = $datahash{"gen_$type"} / $totalvariant;
124	$datahash{"percent_con_$type"} = $datahash{"con_$type"} / $totalvariant;
125	$datahash{"percent_${type}_gen"} =
126	$datahash{"gen_$type"} + $datahash{"con_$type"} == 0 ? 0 :
127	$datahash{"gen_$type"} / ( $datahash{"gen_$type"} + $datahash{"con_$type"} );
128	}
129
130	# Write them out to CSV.
131	my @csvalues = map { $datahash{$_} } @resultfields;
132	if( $csv->combine( @csvalues ) ) {
133	say $fh $csv->string;
134	} else {
135	say "combine() failed on argument: " . $csv->error_input;
136	}
137	map { say sprintf( "Unknown transition %s -> %s (%s) :%s", @$_ ) } @unknown;
138	}
139
140	close $fh;