From: Tara L Andrews Date: Sat, 1 Oct 2011 21:13:52 +0000 (+0200) Subject: flatten identical nodes by rank; allow alignment table with objects X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=0e476982b28c53e1f1553369add8df566779d78c;p=scpubgit%2Fstemmatology.git flatten identical nodes by rank; allow alignment table with objects --- diff --git a/group_vars.pl b/group_vars.pl index 3302b1a..02650c6 100644 --- a/group_vars.pl +++ b/group_vars.pl @@ -36,37 +36,49 @@ my $stemma = Text::Tradition::Stemma->new( ); # We have the collation, so get the alignment table with witnesses in rows. +# Also return the reading objects in the table, rather than just the words. -my $all_wits_table = $tradition->collation->make_alignment_table( 1 ); +my $all_wits_table = $tradition->collation->make_alignment_table( 'refs' ); # For each column in the alignment table, we want to see if the existing -# groupings of witnesses match our stemma hypothesis. First let's just go -# through the groupings. +# groupings of witnesses match our stemma hypothesis. We also want, at the +# end, to produce an HTML table with all the variants. +my $html_columns = 0; +my $html_data = []; +my $total = 0; # Keep track of the total number of variant locations # Strip the list of sigla and save it for correlation to the readings. my $col_wits = shift @$all_wits_table; -# For each column in the table, group the readings by witness. - -my $used_vars = 0; foreach my $i ( 0 .. $#$all_wits_table ) { + # For each column in the table, group the readings by witness. my $rdg_wits = {}; my $col_rdgs = shift @$all_wits_table; + my $rank; foreach my $j ( 0 .. $#{$col_rdgs} ) { my $rdg = $col_rdgs->[$j]; - $rdg = '' unless $rdg; # We care about empty readings - $rdg = undef if $rdg eq '#LACUNA#'; # ... unless they're lacunas + $rank = $rdg->rank if $rdg; # Save the rank for later display + my $rdg_text = '(omitted)'; # Initialize in case of empty reading if( $rdg ) { - $rdg_wits->{$rdg} = [] unless $rdg_wits->{$rdg}; - add_variant_wit( $rdg_wits->{$rdg}, $col_wits->[$j] ); + $rdg_text = $rdg->is_lacuna ? undef : $rdg->text; # Don't count lacunae + } + if( defined $rdg_text ) { + # Initialize the witness array if we haven't got one yet + $rdg_wits->{$rdg_text} = [] unless $rdg_wits->{$rdg_text}; + # Add the relevant witness, subject to a.c. logic + add_variant_wit( $rdg_wits->{$rdg_text}, $col_wits->[$j] ); } } + # See if this column has any potentially genealogical variants. + # If not, skip to the next. + $total++ unless scalar keys %$rdg_wits == 1; my( $groups, $readings ) = useful_variant( $rdg_wits ); - next unless $groups && $readings; + next unless $groups && $readings; + $html_columns = scalar @$groups if scalar @$groups > $html_columns; - # We can look up witnesses for a reading; we also want to look up readings - # for a given witness. + # We can already look up witnesses for a reading; we also want to look + # up readings for a given witness. my $group_readings = {}; foreach my $x ( 0 .. $#$groups ) { $group_readings->{wit_stringify( $groups->[$x] )} = $readings->[$x]; @@ -75,32 +87,59 @@ foreach my $i ( 0 .. $#$all_wits_table ) { # For all the groups with more than one member, collect the list of all # contiguous vertices needed to connect them. # TODO: deal with a.c. reading logic - my $conflict = analyze_variant_location( $group_readings, $groups, $stemma->apsp ); + my $sc = analyze_variant_location( $group_readings, $groups, $stemma->apsp ); print wit_stringify( $groups ) . ' - ' . join( " / ", @$readings ) . "\n"; - foreach my $rdg ( keys %$conflict ) { - my $var = $conflict->{$rdg}; + foreach my $rdg ( keys %$sc ) { + my $var = $sc->{$rdg}; print "\tReadings '$rdg' and '$var' are not genealogical\n"; } - # Now run the same analysis given a distance tree. - my $distance_apsp = $stemma->distance_trees->[0]->APSP_Floyd_Warshall(); - $conflict = analyze_variant_location( $group_readings, $groups, $distance_apsp ); - foreach my $rdg ( keys %$conflict ) { - my $var = $conflict->{$rdg}; - print "\tReadings '$rdg' and '$var' disregarded by parsimony\n"; + # Now run the same analysis given the calculated distance tree(s). + foreach my $tree ( 0 .. $#{$stemma->distance_trees} ) { + my $dc = analyze_variant_location( $group_readings, $groups, + $stemma->distance_apsps->[$tree] ); + foreach my $rdg ( keys %$dc ) { + my $var = $dc->{$rdg}; + print "\tReadings '$rdg' and '$var' disregarded by parsimony on tree $tree\n"; + } } # Record that we used this variant in an analysis - $used_vars++; - + push( @$html_data, [ $rank, $readings, $sc ] ); } -print "Found $used_vars useful variants in this analysis\n"; + # Save the stemma picture open( STEMMA, ">stemma_graph.svg" ) or die "Could not open stemma graph to write"; binmode STEMMA, ":utf8"; print STEMMA $stemma->as_svg; close STEMMA; +# Save the used variants as an HTML table +open( TABLE, ">variant_table.html" ) or die "Could not save variant table"; +binmode TABLE, ":utf8"; +print TABLE "\n"; +foreach my $row ( @$html_data ) { + my( $rank, $readings, $sc ) = @$row; + # Do we have a stemma conflict or a distance-tree conflict? + my $class = scalar keys %$sc ? 'coincidental' : 'genealogical'; + print TABLE sprintf( "\t\n", "variant-$rank", $class ); + # Table row header should be the graph rank. + print TABLE "\t\t\n"; + my $ctr = 0; + foreach my $rdg ( @$readings ) { + print TABLE sprintf( "\t\t\n", "item-$rank-$ctr", $rdg ); + $ctr++; + } + # Pad out the table - is this necessary I wonder? + while( $ctr++ < $html_columns ) { + print TABLE "\t\t\n"; +} +print TABLE "
$rank%s\n"; + } + print TABLE "\t
\n"; + +printf( "Ran analysis on %d / %d variant locations\n", scalar @$html_data, $total ); + sub analyze_variant_location { my( $group_readings, $groups, $apsp ) = @_; my %contig; diff --git a/lib/Text/Tradition/Collation.pm b/lib/Text/Tradition/Collation.pm index bc88107..036624e 100644 --- a/lib/Text/Tradition/Collation.pm +++ b/lib/Text/Tradition/Collation.pm @@ -519,13 +519,11 @@ sub as_csv { return $self->csv; } -# TODO Make an alignment table at the end of initialization to check for -# duplicate nodes from mis-collation. - - +# Make an alignment table - $noderefs controls whether the objects +# in the table are the nodes or simply their readings. sub make_alignment_table { - my( $self, $in_rows ) = shift; + my( $self, $noderefs ) = @_; unless( $self->linear ) { warn "Need a linear graph in order to make an alignment table"; return; @@ -534,38 +532,39 @@ sub make_alignment_table { my @all_pos = sort { $a <=> $b } $self->possible_positions; foreach my $wit ( $self->tradition->witnesses ) { # print STDERR "Making witness row(s) for " . $wit->sigil . "\n"; - my @row = _make_witness_row( $wit->path, \@all_pos ); + my @row = _make_witness_row( $wit->path, \@all_pos, $noderefs ); unshift( @row, $wit->sigil ); push( @$table, \@row ); if( $wit->has_ante_corr ) { - my @ac_row = _make_witness_row( $wit->uncorrected_path, \@all_pos ); + my @ac_row = _make_witness_row( $wit->uncorrected_path, \@all_pos, $noderefs ); unshift( @ac_row, $wit->sigil . $self->ac_label ); push( @$table, \@ac_row ); } } - return $table if $in_rows; - + # Return a table where the witnesses read in columns rather than rows. my $turned = _turn_table( $table ); return $turned; } sub _make_witness_row { - my( $path, $positions ) = @_; + my( $path, $positions, $noderefs ) = @_; my %char_hash; map { $char_hash{$_} = undef } @$positions; foreach my $rdg ( @$path ) { my $rtext = $rdg->text; $rtext = '#LACUNA#' if $rdg->is_lacuna; - $char_hash{$rdg->rank} = $rtext; + $char_hash{$rdg->rank} = $noderefs ? $rdg : $rtext; } my @row = map { $char_hash{$_} } @$positions; # Fill in lacuna markers for undef spots in the row my $last_el = shift @row; my @filled_row = ( $last_el ); foreach my $el ( @row ) { - if( $last_el && $last_el eq '#LACUNA#' && !defined $el ) { - $el = '#LACUNA#'; + # If we are using node reference, make the lacuna node appear many times + # in the table. If not, use the lacuna tag. + if( $last_el && _el_is_lacuna( $last_el ) && !defined $el ) { + $el = $noderefs ? $last_el : '#LACUNA#'; } push( @filled_row, $el ); $last_el = $el; @@ -573,6 +572,15 @@ sub _make_witness_row { return @filled_row; } +# Tiny utility function to say if a table element is a lacuna +sub _el_is_lacuna { + my $el = shift; + return 1 if $el eq '#LACUNA#'; + return 1 if ref( $el ) eq 'Text::Tradition::Collation::Reading' + && $el->is_lacuna; + return 0; +} + # Helper to turn the witnesses along columns rather than rows. Assumes # equal-sized rows. sub _turn_table { @@ -999,6 +1007,25 @@ sub _assign_rank { return @next_nodes; } +# Another method to make up for rough collation methods. If the same reading +# appears multiple times at the same rank, collapse the nodes. +sub flatten_ranks { + my $self = shift; + my %unique_rank_rdg; + foreach my $rdg ( $self->readings ) { + next unless $rdg->has_rank; + my $key = $rdg->rank . "||" . $rdg->text; + if( exists $unique_rank_rdg{$key} ) { + # Combine! + print STDERR "Combining readings at same rank: $key\n"; + $self->merge_readings( $unique_rank_rdg{$key}, $rdg ); + } else { + $unique_rank_rdg{$key} = $rdg; + } + } +} + + sub possible_positions { my $self = shift; my %all_pos; diff --git a/lib/Text/Tradition/Parser/TEI.pm b/lib/Text/Tradition/Parser/TEI.pm index 5254ea0..544f3a2 100644 --- a/lib/Text/Tradition/Parser/TEI.pm +++ b/lib/Text/Tradition/Parser/TEI.pm @@ -141,6 +141,10 @@ sub parse { $tradition->collation->del_reading( $tradition->collation->reading( $_ ) ); } $tradition->collation->calculate_ranks(); + + # Now that we have ranks, see if we have distinct nodes with identical + # text and identical rank that can be merged. + $tradition->collation->flatten_ranks(); } sub _clean_sequence {