first forays in variant analysis
[scpubgit/stemmatology.git] / group_vars.pl
CommitLineData
a5d53377 1#!/usr/bin/env perl
2
3use lib 'lib';
4use strict;
5use warnings;
6use Text::Tradition;
7use Text::Tradition::Stemma;
8
9binmode STDERR, ":utf8";
10binmode STDOUT, ":utf8";
11eval { no warnings; binmode $DB::OUT, ":utf8"; };
12
13my $informat = 'TEI';
14my $inbase;
15my $linear = 1;
16
17# Parse the tradition data
18
19my $input = $ARGV[0];
20my @lines;
21open( INFILE, "$input" ) or die "Could not read $input";
22binmode INFILE, ':utf8';
23@lines = <INFILE>;
24close INFILE;
25$input = join( '', @lines );
26
27my %args = ( $informat => $input,
28 'linear' => $linear );
29$args{'base'} = $inbase if $inbase;
30 my $tradition = Text::Tradition->new( %args );
31
32# Parse the stemma hypothesis
33my $stemma = Text::Tradition::Stemma->new(
34 'collation' => $tradition->collation,
35 'dot' => $ARGV[1],
36 );
37
38# We have the collation, so get the alignment table with witnesses in rows.
39
40my $all_wits_table = $tradition->collation->make_alignment_table( 1 );
41
42# For each column in the alignment table, we want to see if the existing
43# groupings of witnesses match our stemma hypothesis. First let's just go
44# through the groupings.
45
46# Strip the list of sigla and save it for correlation to the readings.
47my $col_wits = shift @$all_wits_table;
48
49# For each column in the table, group the readings by witness.
50
51my $useful_vars = 0;
52foreach my $i ( 0 .. $#$all_wits_table ) {
53 my $rdg_wits = {};
54 my $col_rdgs = shift @$all_wits_table;
55 foreach my $j ( 0 .. $#{$col_rdgs} ) {
56 my $rdg = $col_rdgs->[$j];
57 $rdg = '' unless $rdg; # We care about empty readings
58 $rdg = undef if $rdg eq '#LACUNA#'; # ... unless they're lacunas
59 if( $rdg ) {
60 $rdg_wits->{$rdg} = [] unless $rdg_wits->{$rdg};
61 add_variant_wit( $rdg_wits->{$rdg}, $col_wits->[$j] );
62 }
63 }
64
65 my( $groups, $readings ) = useful_variant( $rdg_wits );
66 next unless $groups && $readings;
67
68 # For all the groups with more than one member, make a group that contains
69 # all contiguous vertices to connect them.
70 # TODO Need to do pairwise comparison of groups - a variant location can
71 # have both coincidental and genealogical variants!
72 my %contig;
73 my $conflict;
74 foreach my $g ( @$groups ) {
75 my @members = split( /,/, $g );
76 next unless @members > 1;
77 map { $contig{$_} = $g } @members;
78 my $curr = pop @members;
79 foreach my $m ( @members ) {
80 foreach my $v ( $stemma->apsp->path_vertices( $curr, $m ) ) {
81 $contig{$v} = $g unless exists $contig{$v};
82 next if $contig{$v} eq $g;
83 # print STDERR "Conflict at $v between group $g and group "
84 # . $contig{$v} . "\n";
85 $conflict = 1;
86 }
87 }
88 }
89 print join( " / ", @$groups ) . ' - ' . join( " / ", @$readings ) . ' - ';
90 print $conflict ? "coincidental" : "genealogical";
91 print "\n";
92 $useful_vars++;
93
94}
95print "Found $useful_vars useful variants\n";
96
97# Add the variant, subject to a.c. representation logic.
98# This assumes that we will see the 'main' version before the a.c. version.
99sub add_variant_wit {
100 my( $arr, $wit ) = @_;
101 my $acstr = $tradition->collation->ac_label;
102 my $skip;
103 if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
104 my $real = $1;
105 $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
106 }
107 push( @$arr, $wit ) unless $skip;
108}
109
110# Return an answer if the variant is useful, i.e. if there are at least 2 variants
111# with at least 2 witnesses each.
112sub useful_variant {
113 my( $readings ) = @_;
114 my $total = keys %$readings;
115 foreach my $var ( keys %$readings ) {
116 $total-- if @{$readings->{$var}} == 1;
117 }
118 return( undef, undef ) if $total <= 1;
119 my( $groups, $text );
120 foreach my $var ( keys %$readings ) {
121 push( @$groups, join( ',', @{$readings->{$var}} ) );
122 push( @$text, $var );
123 }
124 return( $groups, $text );
125}