1 package Text::Tradition::StemmaUtil;
6 use vars qw/ @EXPORT_OK /;
7 @EXPORT_OK = qw/ phylip_pars_input /;
9 sub make_character_matrix {
11 # Push the names of the witnesses to initialize the rows of the matrix.
12 my @matrix = map { [ _normalize_witname( $_->{'witness'} ) ] }
13 @{$table->{'alignment'}};
14 foreach my $token_index ( 0 .. $table->{'length'} - 1) {
15 # First implementation: make dumb alignment table, caring about
16 # nothing except which reading is in which position.
17 my @pos_readings = map { $_->{'tokens'}->[$token_index] }
18 @{$table->{'alignment'}};
19 my @pos_text = map { $_ ? $_->{'t'} : $_ } @pos_readings;
20 my @chars = convert_characters( \@pos_text );
21 foreach my $idx ( 0 .. $#matrix ) {
22 push( @{$matrix[$idx]}, $chars[$idx] );
28 # Helper function to make the witness name something legal for pars
30 sub _normalize_witname {
32 $witname =~ s/\s+/ /g;
33 $witname =~ s/[\[\]\(\)\:;,]//g;
34 $witname = substr( $witname, 0, 10 );
35 return sprintf( "%-10s", $witname );
38 sub convert_characters {
40 # This is a simple algorithm that treats every reading as different.
41 # Eventually we will want to be able to specify how relationships
42 # affect the character matrix.
43 my %unique = ( '__UNDEF__' => 'X',
48 foreach my $word ( @$row ) {
49 if( $word && !exists $unique{$word} ) {
50 $unique{$word} = chr( 65 + $ctr );
53 $count{$word}++ if $word;
55 # Try to keep variants under 8 by lacunizing any singletons.
56 if( scalar( keys %unique ) > 8 ) {
57 foreach my $word ( keys %count ) {
58 if( $count{$word} == 1 ) {
63 my %u = reverse %unique;
64 if( scalar( keys %u ) > 8 ) {
65 warn "Have more than 8 variants on this location; phylip will break";
67 my @chars = map { $_ ? $unique{$_} : $unique{'__UNDEF__' } } @$row;
71 sub phylip_pars_input {
73 my $character_matrix = make_character_matrix( $table );
75 my $rows = scalar @{$character_matrix};
76 my $columns = scalar @{$character_matrix->[0]} - 1;
77 $input .= "\t$rows\t$columns\n";
78 foreach my $row ( @{$character_matrix} ) {
79 $input .= join( '', @$row ) . "\n";