[scpubgit/stemmatology.git] / lib / Text / Tradition / Analysis.pm

package Text::Tradition::Analysis;

use strict;
use warnings;
use Text::Tradition;
use Text::Tradition::Stemma;

# Constructor.  Takes a hashref with the keys 'file' and 'stemmadot', hands
# both to run_analysis(), and stores the two results on the new object under
# the keys 'svg' and 'variants'.
sub new {
	my( $class, $args ) = @_;
	# The analysis is done eagerly, at construction time.
	my( $svg, $variants ) = run_analysis( $args->{'file'}, $args->{'stemmadot'} );
	return bless( { 'svg' => $svg, 'variants' => $variants }, $class );
}

# Run the variant analysis for one tradition against one stemma hypothesis.
#
# Arguments (positional):
#   $file      - handed to Text::Tradition->new as 'file' (input type 'Self')
#   $stemmadot - handed to Text::Tradition::Stemma->new as 'dot'
#
# Returns the list ( $svg, $variants ):
#   $svg      - the output of $stemma->as_svg, with the first
#               'transform="scale(1 1)' rewritten to scale(0.7 0.7)
#   $variants - arrayref of hashrefs, one per variant location that passed
#               useful_variant(), each with the keys
#                 id           => rank taken from a reading at that location
#                 genealogical => 1 or undef (see the NOTE(review) below)
#                 readings     => [ { text => ..., group => ... }, ... ]
#                 empty        => widest readings count minus this row's count
sub run_analysis {
	my( $file, $stemmadot ) = @_;
	# What we will return
	my $svg;
	my $variants = [];
	
	# Read in the file and stemma
	my $tradition = Text::Tradition->new( 
		'input'  => 'Self',
		'file'   => $file,
		'linear' => 1,
		);
	my $stemma = Text::Tradition::Stemma->new(
		'collation' => $tradition->collation,
		'dot' => $stemmadot,
		);
	# We will return the stemma picture
	$svg = $stemma->as_svg;
	### DIRTY HACK
	# Shrink the rendered picture by rewriting the scale factor in the SVG
	# text.  Only the first occurrence is replaced (no /g).
	$svg =~ s/transform=\"scale\(1 1\)/transform=\"scale\(0.7 0.7\)/;
	
	# We have the collation, so get the alignment table with witnesses in rows.
	# Also return the reading objects in the table, rather than just the words.
	# NOTE(review): the code below treats the first entry of the table as the
	# list of sigla and every later entry as a list of readings indexed in
	# parallel with it -- confirm against make_alignment_table.
	
	my $all_wits_table = $tradition->collation->make_alignment_table( 'refs' );
	
	# For each column in the alignment table, we want to see if the existing
	# groupings of witnesses match our stemma hypothesis. We also want, at the
	# end, to produce an HTML table with all the variants.
	my $html_columns = 0;
	# NOTE(review): $html_data is never used in this sub.
	my $html_data = [];
	# NOTE(review): $total is incremented below but never read or returned.
	my $total = 0; # Keep track of the total number of variant locations
	
	# Strip the list of sigla and save it for correlation to the readings.
	my $col_wits = shift @$all_wits_table;
	
	# We will return a data structure, an array for each row that looks like:
	# { id = X, genealogical = Y, readings = [ text = X, group = Y], empty = N }
	# $i is only a counter: each pass consumes the next entry with shift, so
	# the table is empty once the loop finishes.
	foreach my $i ( 0 .. $#$all_wits_table ) {
		# For each column in the table, group the readings by witness.
		# $rdg_wits maps reading text => arrayref of witness sigla.
		my $rdg_wits = {};
		my $col_rdgs = shift @$all_wits_table;
		my $rank;
		foreach my $j ( 0 .. $#{$col_rdgs} ) {
			my $rdg = $col_rdgs->[$j];
			my $rdg_text = '(omitted)';  # Initialize in case of empty reading
			if( $rdg ) {
				$rdg_text = $rdg->is_lacuna ? undef : $rdg->text; # Don't count lacunae
				# Get the rank from any real reading; they should be identical.
				# NOTE(review): the test is on truthiness, so a rank of 0 would
				# be overwritten by the next non-lacuna reading's rank.
				$rank = $rdg->rank unless $rank || $rdg->is_lacuna;
			}
			if( defined $rdg_text ) {
				# Initialize the witness array if we haven't got one yet
				$rdg_wits->{$rdg_text} = [] unless $rdg_wits->{$rdg_text};
				# Add the relevant witness, subject to a.c. logic
				add_variant_wit( $rdg_wits->{$rdg_text}, $col_wits->[$j],
					$tradition->collation->ac_label );
			}
		}
		
		# See if this column has any potentially genealogical variants.
		# If not, skip to the next.
		$total++ unless scalar keys %$rdg_wits == 1;
		# useful_variant hands back ( undef, undef ) for locations it rejects.
		my( $groups, $readings ) = useful_variant( $rdg_wits );
		next unless $groups && $readings;  
		
		# Initialize the data structure for the row that we will return
		my $variant_row = {'id' => $rank, 'readings' => [] };
		# Keep track of our widest row
		$html_columns = scalar @$groups if scalar @$groups > $html_columns;
		
		# We can already look up witnesses for a reading; we also want to look
		# up readings for a given witness.
		# Keys are the wit_stringify() form of each group; $groups and
		# $readings are parallel arrays.
		my $group_readings = {};
		foreach my $x ( 0 .. $#$groups ) {
			$group_readings->{wit_stringify( $groups->[$x] )} = $readings->[$x];
		}
		
		# For all the groups with more than one member, collect the list of all
		# contiguous vertices needed to connect them.
		# TODO: deal with a.c. reading logic
		my $sc = analyze_variant_location( $group_readings, $groups, $stemma->apsp );
		# NOTE(review): $sc is the hash of *conflicts* returned by
		# analyze_variant_location, so 'genealogical' becomes 1 exactly when at
		# least one conflict was recorded.  That looks inverted -- confirm
		# against whatever consumes this flag.
		$variant_row->{'genealogical'} = keys %$sc ? 1 : undef;
		# Emit the readings in a stable order (sorted by group string).
		foreach my $grp ( sort keys %$group_readings ) {
			my $rdg = $group_readings->{$grp};
			push( @{$variant_row->{'readings'}}, { 'text' => $rdg, 'group' => $grp } );
		}
		
		# Now run the same analysis given the calculated distance tree(s).
		# (Disabled: kept for reference only.)
# 		foreach my $tree ( 0 .. $#{$stemma->distance_trees} ) {
# 			my $dc = analyze_variant_location( $group_readings, $groups,    
# 											   $stemma->distance_apsps->[$tree] );
# 			foreach my $rdg ( keys %$dc ) {
# 				my $var = $dc->{$rdg};
# 			}
# 		}
	
		# Record that we used this variant in an analysis
		push( @$variants, $variant_row );
	}
	
	# Go through our variant rows and add the number of empty columns we need.
	# This needs the final $html_columns, hence the second pass.
	foreach my $row ( @$variants ) {
		my $empty = $html_columns - scalar @{$row->{'readings'}};
		$row->{'empty'} = $empty;
	}
	
	return( $svg, $variants );
}

# Check whether the witness groups at one variant location can each be
# connected through the stemma without running into one another.
#
# Arguments (positional):
#   $group_readings - hashref: wit_stringify() label of a group => its reading
#   $groups         - arrayref of witness groups (each an arrayref of sigla)
#   $apsp           - object answering path_vertices( $from, $to ) with the
#                     list of vertices on the path between two witnesses
#
# Returns a hashref.  For every vertex that a group needs but that an earlier
# group already holds, an entry is stored mapping the needing group's reading
# to the holding group's reading.  An empty hashref means no such collision.
sub analyze_variant_location {
    my( $group_readings, $groups, $apsp ) = @_;
    my %owner;   # vertex => label of the group currently holding it
    my %clash;   # reading of the colliding group => reading already there
    # Larger groups are processed, and so claim their vertices, first.
    my @by_size = sort { scalar @$b <=> scalar @$a } @$groups;
    foreach my $grp ( @by_size ) {
        my $label = wit_stringify( $grp );
        my @wits = @$grp;
        # A group always holds its own witnesses, whatever was there before.
        @owner{@wits} = ( $label ) x @wits;
        # A lone witness needs no connecting path.
        next if @wits < 2;
        # Walk from one member to each of the others.
        my $anchor = pop @wits;
        foreach my $other ( @wits ) {
            foreach my $vertex ( $apsp->path_vertices( $anchor, $other ) ) {
                if( !exists $owner{$vertex} ) {
                    # Unclaimed so far: take it for this group.
                    $owner{$vertex} = $label;
                }
                elsif( $owner{$vertex} ne $label ) {
                    # Held by another group: record what is conflicting.
                    $clash{$group_readings->{$label}}
                        = $group_readings->{$owner{$vertex}};
                }
            }
        }
    }
    return \%clash;
}

# Append witness $wit to the list $arr, unless $wit is an a.c. siglum (one
# that ends in $acstr) whose base siglum is already on the list.
# This relies on the 'main' version of a witness being added before its
# a.c. version.
sub add_variant_wit {
    my( $arr, $wit, $acstr ) = @_;
    my $dupes;
    if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
        # Whatever precedes the a.c. marker is the siglum of the main witness.
        my $exact = qr/^\Q$1\E$/;
        $dupes = 0;
        foreach my $seen ( @$arr ) {
            $dupes++ if $seen =~ $exact;
        }
    }
    push( @$arr, $wit ) unless $dupes;
}

# Decide whether a variant location is worth analysing.
# $readings maps reading text => arrayref of witnesses.  Readings carried by
# exactly one witness are discounted; if at most one reading is left after
# that, the location is rejected and ( undef, undef ) is returned.
# Otherwise returns ( $groups, $text ): two parallel arrayrefs holding, for
# every reading (discounted ones included), its witness list and its text.
sub useful_variant {
    my( $readings ) = @_;
    my @texts = keys %$readings;
    my $singletons = grep { @{$readings->{$_}} == 1 } @texts;
    return( undef, undef ) if @texts - $singletons <= 1;
    my $groups = [ map { $readings->{$_} } @texts ];
    return( $groups, \@texts );
}

# Render witness groupings as a string such as
#   ['A','B'] / ['C','D','E'] / ['F']
# The argument is an arrayref of groupings (each an arrayref of sigla).  A
# flat arrayref of sigla is also accepted and is treated as one grouping.

sub wit_stringify {
    my $groups = shift;
    # A non-reference first element means we were given a single flat list.
    my @groupings = ref( $groups->[0] ) ? @$groups : ( $groups );
    my @parts = map {
        my $wits = $_;
        '[' . join( ',', map { "'$_'" } @$wits ) . ']';
    } @groupings;
    return join( ' / ', @parts );
}
    
1;
Commit	Line	Data
d71100ed	1	package Text::Tradition::Analysis;
	2
	3	use strict;
	4	use warnings;
	5	use Text::Tradition;
	6	use Text::Tradition::Stemma;
	7
	8	sub new {
	9	my( $class, $args ) = @_;
	10	my $self = {};
	11	# Our object needs to have a stemma graph and a variant table.
	12	my( $svg, $variants ) = run_analysis( $args->{'file'}, $args->{'stemmadot'} );
	13	$self->{'svg'} = $svg;
	14	$self->{'variants'} = $variants;
	15
	16	bless( $self, $class );
	17	return $self;
	18	}
	19
	20	sub run_analysis {
	21	my( $file, $stemmadot ) = @_;
	22	# What we will return
	23	my $svg;
	24	my $variants = [];
	25
3d79e248	26	# Read in the file and stemma
d71100ed	27	my $tradition = Text::Tradition->new(
3d79e248	28	'input' => 'Self',
3d79e248	29	'file' => $file,
d71100ed	30	'linear' => 1,
	31	);
	32	my $stemma = Text::Tradition::Stemma->new(
	33	'collation' => $tradition->collation,
	34	'dot' => $stemmadot,
	35	);
	36	# We will return the stemma picture
	37	$svg = $stemma->as_svg;
	38	### DIRTY HACK
	39	$svg =~ s/transform=\"scale\(1 1\)/transform=\"scale\(0.7 0.7\)/;
	40
	41	# We have the collation, so get the alignment table with witnesses in rows.
	42	# Also return the reading objects in the table, rather than just the words.
	43
	44	my $all_wits_table = $tradition->collation->make_alignment_table( 'refs' );
	45
	46	# For each column in the alignment table, we want to see if the existing
	47	# groupings of witnesses match our stemma hypothesis. We also want, at the
	48	# end, to produce an HTML table with all the variants.
	49	my $html_columns = 0;
	50	my $html_data = [];
	51	my $total = 0; # Keep track of the total number of variant locations
	52
	53	# Strip the list of sigla and save it for correlation to the readings.
	54	my $col_wits = shift @$all_wits_table;
	55
	56	# We will return a data structure, an array for each row that looks like:
	57	# { id = X, genealogical = Y, readings = [ text = X, group = Y], empty = N }
	58	foreach my $i ( 0 .. $#$all_wits_table ) {
	59	# For each column in the table, group the readings by witness.
	60	my $rdg_wits = {};
	61	my $col_rdgs = shift @$all_wits_table;
	62	my $rank;
	63	foreach my $j ( 0 .. $#{$col_rdgs} ) {
	64	my $rdg = $col_rdgs->[$j];
d71100ed	65	my $rdg_text = '(omitted)'; # Initialize in case of empty reading
	66	if( $rdg ) {
	67	$rdg_text = $rdg->is_lacuna ? undef : $rdg->text; # Don't count lacunae
c7d0f253	68	# Get the rank from any real reading; they should be identical.
c7d0f253	69	$rank = $rdg->rank unless $rank \|\| $rdg->is_lacuna;
d71100ed	70	}
	71	if( defined $rdg_text ) {
	72	# Initialize the witness array if we haven't got one yet
	73	$rdg_wits->{$rdg_text} = [] unless $rdg_wits->{$rdg_text};
	74	# Add the relevant witness, subject to a.c. logic
	75	add_variant_wit( $rdg_wits->{$rdg_text}, $col_wits->[$j],
	76	$tradition->collation->ac_label );
	77	}
	78	}
	79
	80	# See if this column has any potentially genealogical variants.
	81	# If not, skip to the next.
	82	$total++ unless scalar keys %$rdg_wits == 1;
	83	my( $groups, $readings ) = useful_variant( $rdg_wits );
	84	next unless $groups && $readings;
	85
	86	# Initialize the data structure for the row that we will return
	87	my $variant_row = {'id' => $rank, 'readings' => [] };
	88	# Keep track of our widest row
	89	$html_columns = scalar @$groups if scalar @$groups > $html_columns;
	90
	91	# We can already look up witnesses for a reading; we also want to look
	92	# up readings for a given witness.
	93	my $group_readings = {};
	94	foreach my $x ( 0 .. $#$groups ) {
	95	$group_readings->{wit_stringify( $groups->[$x] )} = $readings->[$x];
	96	}
	97
	98	# For all the groups with more than one member, collect the list of all
	99	# contiguous vertices needed to connect them.
	100	# TODO: deal with a.c. reading logic
	101	my $sc = analyze_variant_location( $group_readings, $groups, $stemma->apsp );
	102	$variant_row->{'genealogical'} = keys %$sc ? 1 : undef;
	103	foreach my $grp ( sort keys %$group_readings ) {
	104	my $rdg = $group_readings->{$grp};
	105	push( @{$variant_row->{'readings'}}, { 'text' => $rdg, 'group' => $grp } );
	106	}
	107
	108	# Now run the same analysis given the calculated distance tree(s).
	109	# foreach my $tree ( 0 .. $#{$stemma->distance_trees} ) {
	110	# my $dc = analyze_variant_location( $group_readings, $groups,
	111	# $stemma->distance_apsps->[$tree] );
	112	# foreach my $rdg ( keys %$dc ) {
	113	# my $var = $dc->{$rdg};
	114	# }
	115	# }
	116
	117	# Record that we used this variant in an analysis
	118	push( @$variants, $variant_row );
	119	}
	120
	121	# Go through our variant rows and add the number of empty columns we need.
	122	foreach my $row ( @$variants ) {
	123	my $empty = $html_columns - scalar @{$row->{'readings'}};
	124	$row->{'empty'} = $empty;
	125	}
	126
	127	return( $svg, $variants );
	128	}
	129
	130	sub analyze_variant_location {
	131	my( $group_readings, $groups, $apsp ) = @_;
	132	my %contig;
	133	my $conflict = {};
134	foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) {
135	my @members = @$g;
136	my $gst = wit_stringify( $g );
137	map { $contig{$_} = $gst } @members; # The witnesses need themselves to be
138	# in their collection.
139	next unless @members > 1;
140	my $curr = pop @members;
141	foreach my $m ( @members ) {
142	foreach my $v ( $apsp->path_vertices( $curr, $m ) ) {
143	$contig{$v} = $gst unless exists $contig{$v};
144	next if $contig{$v} eq $gst;
145	# print STDERR "Conflict at $v between group $gst and group "
146	# . $contig{$v} . "\n";
147	# Record what is conflicting.
148	$conflict->{$group_readings->{$gst}} = $group_readings->{$contig{$v}};
149	}
150	}
151	}
152	return $conflict;
153	}
154
155	# Add the variant, subject to a.c. representation logic.
156	# This assumes that we will see the 'main' version before the a.c. version.
157	sub add_variant_wit {
158	my( $arr, $wit, $acstr ) = @_;
159	my $skip;
160	if( $wit =~ /^(.*)\Q$acstr\E$/ ) {
161	my $real = $1;
162	$skip = grep { $_ =~ /^\Q$real\E$/ } @$arr;
163	}
164	push( @$arr, $wit ) unless $skip;
165	}
166
167	# Return an answer if the variant is useful, i.e. if there are at least 2 variants
168	# with at least 2 witnesses each.
169	sub useful_variant {
170	my( $readings ) = @_;
171	my $total = keys %$readings;
172	foreach my $var ( keys %$readings ) {
173	$total-- if @{$readings->{$var}} == 1;
174	}
175	return( undef, undef ) if $total <= 1;
176	my( $groups, $text );
177	foreach my $var ( keys %$readings ) {
178	push( @$groups, $readings->{$var} );
179	push( @$text, $var );
180	}
181	return( $groups, $text );
182	}
183
184	# Take an array of witness groupings and produce a string like
185	# ['A','B'] / ['C','D','E'] / ['F']
186
187	sub wit_stringify {
188	my $groups = shift;
189	my @gst;
190	# If we were passed an array of witnesses instead of an array of
191	# groupings, then "group" the witnesses first.
192	unless( ref( $groups->[0] ) ) {
193	my $mkgrp = [ $groups ];
194	$groups = $mkgrp;
195	}
196	foreach my $g ( @$groups ) {
197	push( @gst, '[' . join( ',', map { "'$_'" } @$g ) . ']' );
198	}
199	return join( ' / ', @gst );
200	}
201
202	1;