[scpubgit/stemmatology.git] / group_vars.pl

#!/usr/bin/env perl

use lib 'lib';
use strict;
use warnings;
use Text::Tradition;
use Text::Tradition::Stemma;

binmode STDERR, ":utf8";
binmode STDOUT, ":utf8";
eval { no warnings; binmode $DB::OUT, ":utf8"; };

my $informat = 'TEI';
my $inbase;
my $linear = 1;

# Parse the tradition data

my $input = $ARGV[0];
my @lines;
open( INFILE, "$input" ) or die "Could not read $input";
binmode INFILE, ':utf8';
@lines = <INFILE>;
close INFILE;
$input = join( '', @lines );

my %args = ( $informat => $input,
             'linear' => $linear );
$args{'base'} = $inbase if $inbase;
 my $tradition = Text::Tradition->new( %args );

# Parse the stemma hypothesis
my $stemma = Text::Tradition::Stemma->new( 
    'collation' => $tradition->collation,
    'dot' => $ARGV[1],
    );

# We have the collation, so get the alignment table with witnesses in rows.

my $all_wits_table = $tradition->collation->make_alignment_table( 1 );

# For each column in the alignment table, we want to see if the existing
# groupings of witnesses match our stemma hypothesis.  First let's just go 
# through the groupings.

# Strip the list of sigla and save it for correlation to the readings.
my $col_wits = shift @$all_wits_table;
    
# For each column in the table, group the readings by witness.

my $used_vars = 0;
foreach my $i ( 0 .. $#$all_wits_table ) {
    my $rdg_wits = {};
    my $col_rdgs = shift @$all_wits_table;
    foreach my $j ( 0 .. $#{$col_rdgs} ) {
        my $rdg = $col_rdgs->[$j];
        $rdg = '' unless $rdg;   # We care about empty readings
        $rdg = undef if $rdg eq '#LACUNA#'; # ... unless they're lacunas
        if( $rdg ) {
            $rdg_wits->{$rdg} = [] unless $rdg_wits->{$rdg};
            add_variant_wit( $rdg_wits->{$rdg}, $col_wits->[$j] );
        }
    }
    
    my( $groups, $readings ) = useful_variant( $rdg_wits );
    next unless $groups && $readings;        
    
    # We can look up witnesses for a reading; we also want to look up readings
    # for a given witness.
    my $group_readings = {};
    foreach my $x ( 0 .. $#$groups ) {
        $group_readings->{wit_stringify( $groups->[$x] )} = $readings->[$x];
    }
    
    # For all the groups with more than one member, collect the list of all
    # contiguous vertices needed to connect them.
    # TODO: deal with a.c. reading logic
    my $conflict = analyze_variant_location( $group_readings, $groups, $stemma->apsp );
    print wit_stringify( $groups ) . ' - ' . join( " / ", @$readings ) . "\n";
    foreach my $rdg ( keys %$conflict ) {
        my $var = $conflict->{$rdg};
        print "\tReadings '$rdg' and '$var' are not genealogical\n";
    }
    
    # Now run the same analysis given a distance tree.
    my $distance_apsp = $stemma->distance_trees->[0]->APSP_Floyd_Warshall();
    $conflict = analyze_variant_location( $group_readings, $groups, $distance_apsp );
    foreach my $rdg ( keys %$conflict ) {
        my $var = $conflict->{$rdg};
        print "\tReadings '$rdg' and '$var' disregarded by parsimony\n";
    }

    # Record that we used this variant in an analysis
    $used_vars++;
    
}
print "Found $used_vars useful variants in this analysis\n";

sub analyze_variant_location {
    my( $group_readings, $groups, $apsp ) = @_;
    my %contig;
    my $conflict = {};
    foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
        my @members = @$g;
        my $gst = wit_stringify( $g );
        map { $contig{$_} = $gst } @members; # The witnesses need themselves to be 
                                             # in their collection.
        next unless @members > 1;
        my $curr = pop @members;
        foreach my $m ( @members ) {
            foreach my $v ( $apsp->path_vertices( $curr, $m ) ) {
                $contig{$v} = $gst unless exists $contig{$v};
                next if $contig{$v} eq $gst;
                # print STDERR "Conflict at $v between group $gst and group " 
                #     . $contig{$v} . "\n";
                # Record what is conflicting.
                $conflict->{$group_readings->{$gst}} = $group_readings->{$contig{$v}};
            }
        }
    }
    return $conflict;
}

# Add the variant, subject to a.c. representation logic.
# This assumes that we will see the 'main' version before the a.c. version.
sub add_variant_wit {
    my( $arr, $wit ) = @_;
    my $acstr = $tradition->collation->ac_label;
    my $skip;
    if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
        my $real = $1;
        $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
    } 
    push( @$arr, $wit ) unless $skip;
}

# Return an answer if the variant is useful, i.e. if there are at least 2 variants
# with at least 2 witnesses each.
sub useful_variant {
    my( $readings ) = @_;
    my $total = keys %$readings;
    foreach my $var ( keys %$readings ) {
        $total-- if @{$readings->{$var}} == 1;
    }
    return( undef, undef ) if $total <= 1;
    my( $groups, $text );
    foreach my $var ( keys %$readings ) {
        push( @$groups, $readings->{$var} );
        push( @$text, $var );
    }
    return( $groups, $text );
}

# Take an array of witness groupings and produce a string like
# A,B / C,D,E / F

sub wit_stringify {
    my $groups = shift;
    my @gst;
    # If we were passed an array of witnesses instead of an array of 
    # groupings, then "group" the witnesses first.
    unless( ref( $groups->[0] ) ) {
        my $mkgrp = [ $groups ];
        $groups = $mkgrp;
    }
    foreach my $g ( @$groups ) {
        push( @gst, join( ',', @$g ) );
    }
    return join( ' / ', @gst );
}
Commit	Line	Data
a5d53377	1	#!/usr/bin/env perl
	2
	3	use lib 'lib';
	4	use strict;
	5	use warnings;
	6	use Text::Tradition;
	7	use Text::Tradition::Stemma;
	8
	9	binmode STDERR, ":utf8";
	10	binmode STDOUT, ":utf8";
	11	eval { no warnings; binmode $DB::OUT, ":utf8"; };
	12
	13	my $informat = 'TEI';
	14	my $inbase;
	15	my $linear = 1;
	16
	17	# Parse the tradition data
	18
	19	my $input = $ARGV[0];
	20	my @lines;
	21	open( INFILE, "$input" ) or die "Could not read $input";
	22	binmode INFILE, ':utf8';
	23	@lines = <INFILE>;
	24	close INFILE;
	25	$input = join( '', @lines );
	26
	27	my %args = ( $informat => $input,
	28	'linear' => $linear );
	29	$args{'base'} = $inbase if $inbase;
	30	my $tradition = Text::Tradition->new( %args );
	31
	32	# Parse the stemma hypothesis
	33	my $stemma = Text::Tradition::Stemma->new(
	34	'collation' => $tradition->collation,
	35	'dot' => $ARGV[1],
	36	);
	37
	38	# We have the collation, so get the alignment table with witnesses in rows.
	39
	40	my $all_wits_table = $tradition->collation->make_alignment_table( 1 );
	41
	42	# For each column in the alignment table, we want to see if the existing
	43	# groupings of witnesses match our stemma hypothesis. First let's just go
	44	# through the groupings.
	45
	46	# Strip the list of sigla and save it for correlation to the readings.
	47	my $col_wits = shift @$all_wits_table;
	48
	49	# For each column in the table, group the readings by witness.
	50
40f19742	51	my $used_vars = 0;
a5d53377	52	foreach my $i ( 0 .. $#$all_wits_table ) {
	53	my $rdg_wits = {};
	54	my $col_rdgs = shift @$all_wits_table;
	55	foreach my $j ( 0 .. $#{$col_rdgs} ) {
	56	my $rdg = $col_rdgs->[$j];
	57	$rdg = '' unless $rdg; # We care about empty readings
	58	$rdg = undef if $rdg eq '#LACUNA#'; # ... unless they're lacunas
	59	if( $rdg ) {
	60	$rdg_wits->{$rdg} = [] unless $rdg_wits->{$rdg};
	61	add_variant_wit( $rdg_wits->{$rdg}, $col_wits->[$j] );
	62	}
	63	}
	64
	65	my( $groups, $readings ) = useful_variant( $rdg_wits );
	66	next unless $groups && $readings;
	67
40f19742	68	# We can look up witnesses for a reading; we also want to look up readings
	69	# for a given witness.
	70	my $group_readings = {};
	71	foreach my $x ( 0 .. $#$groups ) {
	72	$group_readings->{wit_stringify( $groups->[$x] )} = $readings->[$x];
	73	}
	74
	75	# For all the groups with more than one member, collect the list of all
	76	# contiguous vertices needed to connect them.
	77	# TODO: deal with a.c. reading logic
	78	my $conflict = analyze_variant_location( $group_readings, $groups, $stemma->apsp );
	79	print wit_stringify( $groups ) . ' - ' . join( " / ", @$readings ) . "\n";
	80	foreach my $rdg ( keys %$conflict ) {
	81	my $var = $conflict->{$rdg};
	82	print "\tReadings '$rdg' and '$var' are not genealogical\n";
	83	}
	84
	85	# Now run the same analysis given a distance tree.
	86	my $distance_apsp = $stemma->distance_trees->[0]->APSP_Floyd_Warshall();
	87	$conflict = analyze_variant_location( $group_readings, $groups, $distance_apsp );
	88	foreach my $rdg ( keys %$conflict ) {
	89	my $var = $conflict->{$rdg};
	90	print "\tReadings '$rdg' and '$var' disregarded by parsimony\n";
	91	}
	92
	93	# Record that we used this variant in an analysis
	94	$used_vars++;
	95
	96	}
	97	print "Found $used_vars useful variants in this analysis\n";
	98
	99	sub analyze_variant_location {
	100	my( $group_readings, $groups, $apsp ) = @_;
a5d53377	101	my %contig;
40f19742	102	my $conflict = {};
	103	foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
	104	my @members = @$g;
	105	my $gst = wit_stringify( $g );
	106	map { $contig{$_} = $gst } @members; # The witnesses need themselves to be
	107	# in their collection.
a5d53377	108	next unless @members > 1;
a5d53377	109	my $curr = pop @members;
a5d53377	110	foreach my $m ( @members ) {
40f19742	111	foreach my $v ( $apsp->path_vertices( $curr, $m ) ) {
	112	$contig{$v} = $gst unless exists $contig{$v};
	113	next if $contig{$v} eq $gst;
	114	# print STDERR "Conflict at $v between group $gst and group "
a5d53377	115	# . $contig{$v} . "\n";
40f19742	116	# Record what is conflicting.
40f19742	117	$conflict->{$group_readings->{$gst}} = $group_readings->{$contig{$v}};
a5d53377	118	}
	119	}
	120	}
40f19742	121	return $conflict;
a5d53377	122	}
a5d53377	123
	124	# Add the variant, subject to a.c. representation logic.
	125	# This assumes that we will see the 'main' version before the a.c. version.
	126	sub add_variant_wit {
	127	my( $arr, $wit ) = @_;
	128	my $acstr = $tradition->collation->ac_label;
	129	my $skip;
	130	if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
	131	my $real = $1;
	132	$skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
	133	}
	134	push( @$arr, $wit ) unless $skip;
	135	}
	136
	137	# Return an answer if the variant is useful, i.e. if there are at least 2 variants
	138	# with at least 2 witnesses each.
	139	sub useful_variant {
	140	my( $readings ) = @_;
	141	my $total = keys %$readings;
	142	foreach my $var ( keys %$readings ) {
	143	$total-- if @{$readings->{$var}} == 1;
	144	}
	145	return( undef, undef ) if $total <= 1;
	146	my( $groups, $text );
	147	foreach my $var ( keys %$readings ) {
40f19742	148	push( @$groups, $readings->{$var} );
a5d53377	149	push( @$text, $var );
	150	}
	151	return( $groups, $text );
	152	}
40f19742	153
	154	# Take an array of witness groupings and produce a string like
	155	# A,B / C,D,E / F
	156
	157	sub wit_stringify {
	158	my $groups = shift;
	159	my @gst;
	160	# If we were passed an array of witnesses instead of an array of
	161	# groupings, then "group" the witnesses first.
	162	unless( ref( $groups->[0] ) ) {
	163	my $mkgrp = [ $groups ];
	164	$groups = $mkgrp;
	165	}
	166	foreach my $g ( @$groups ) {
	167	push( @gst, join( ',', @$g ) );
	168	}
	169	return join( ' / ', @gst );
	170	}
	171