Commit | Line | Data |
a5d53377 |
1 | #!/usr/bin/env perl |
2 | |
3 | use lib 'lib'; |
4 | use strict; |
5 | use warnings; |
6 | use Text::Tradition; |
7 | use Text::Tradition::Stemma; |
8 | |
9 | binmode STDERR, ":utf8"; |
10 | binmode STDOUT, ":utf8"; |
11 | eval { no warnings; binmode $DB::OUT, ":utf8"; }; |
12 | |
13 | my $informat = 'TEI'; |
14 | my $inbase; |
15 | my $linear = 1; |
16 | |
17 | # Parse the tradition data |
18 | |
19 | my $input = $ARGV[0]; |
20 | my @lines; |
21 | open( INFILE, "$input" ) or die "Could not read $input"; |
22 | binmode INFILE, ':utf8'; |
23 | @lines = <INFILE>; |
24 | close INFILE; |
25 | $input = join( '', @lines ); |
26 | |
27 | my %args = ( $informat => $input, |
28 | 'linear' => $linear ); |
29 | $args{'base'} = $inbase if $inbase; |
e79c23c7 |
30 | my $tradition = Text::Tradition->new( %args ); |
a5d53377 |
31 | |
32 | # Parse the stemma hypothesis |
33 | my $stemma = Text::Tradition::Stemma->new( |
34 | 'collation' => $tradition->collation, |
35 | 'dot' => $ARGV[1], |
36 | ); |
37 | |
38 | # We have the collation, so get the alignment table with witnesses in rows. |
0e476982 |
39 | # Also return the reading objects in the table, rather than just the words. |
a5d53377 |
40 | |
0e476982 |
41 | my $all_wits_table = $tradition->collation->make_alignment_table( 'refs' ); |
a5d53377 |
42 | |
43 | # For each column in the alignment table, we want to see if the existing |
0e476982 |
44 | # groupings of witnesses match our stemma hypothesis. We also want, at the |
45 | # end, to produce an HTML table with all the variants. |
46 | my $html_columns = 0; |
47 | my $html_data = []; |
48 | my $total = 0; # Keep track of the total number of variant locations |
a5d53377 |
49 | |
50 | # Strip the list of sigla and save it for correlation to the readings. |
51 | my $col_wits = shift @$all_wits_table; |
52 | |
a5d53377 |
53 | foreach my $i ( 0 .. $#$all_wits_table ) { |
0e476982 |
54 | # For each column in the table, group the readings by witness. |
a5d53377 |
55 | my $rdg_wits = {}; |
56 | my $col_rdgs = shift @$all_wits_table; |
0e476982 |
57 | my $rank; |
a5d53377 |
58 | foreach my $j ( 0 .. $#{$col_rdgs} ) { |
59 | my $rdg = $col_rdgs->[$j]; |
0e476982 |
60 | $rank = $rdg->rank if $rdg; # Save the rank for later display |
61 | my $rdg_text = '(omitted)'; # Initialize in case of empty reading |
a5d53377 |
62 | if( $rdg ) { |
0e476982 |
63 | $rdg_text = $rdg->is_lacuna ? undef : $rdg->text; # Don't count lacunae |
64 | } |
65 | if( defined $rdg_text ) { |
66 | # Initialize the witness array if we haven't got one yet |
67 | $rdg_wits->{$rdg_text} = [] unless $rdg_wits->{$rdg_text}; |
68 | # Add the relevant witness, subject to a.c. logic |
69 | add_variant_wit( $rdg_wits->{$rdg_text}, $col_wits->[$j] ); |
a5d53377 |
70 | } |
71 | } |
72 | |
0e476982 |
73 | # See if this column has any potentially genealogical variants. |
74 | # If not, skip to the next. |
75 | $total++ unless scalar keys %$rdg_wits == 1; |
a5d53377 |
76 | my( $groups, $readings ) = useful_variant( $rdg_wits ); |
0e476982 |
77 | next unless $groups && $readings; |
78 | $html_columns = scalar @$groups if scalar @$groups > $html_columns; |
a5d53377 |
79 | |
0e476982 |
80 | # We can already look up witnesses for a reading; we also want to look |
81 | # up readings for a given witness. |
40f19742 |
82 | my $group_readings = {}; |
83 | foreach my $x ( 0 .. $#$groups ) { |
84 | $group_readings->{wit_stringify( $groups->[$x] )} = $readings->[$x]; |
85 | } |
86 | |
87 | # For all the groups with more than one member, collect the list of all |
88 | # contiguous vertices needed to connect them. |
89 | # TODO: deal with a.c. reading logic |
0e476982 |
90 | my $sc = analyze_variant_location( $group_readings, $groups, $stemma->apsp ); |
40f19742 |
91 | print wit_stringify( $groups ) . ' - ' . join( " / ", @$readings ) . "\n"; |
0e476982 |
92 | foreach my $rdg ( keys %$sc ) { |
93 | my $var = $sc->{$rdg}; |
40f19742 |
94 | print "\tReadings '$rdg' and '$var' are not genealogical\n"; |
95 | } |
96 | |
0e476982 |
97 | # Now run the same analysis given the calculated distance tree(s). |
98 | foreach my $tree ( 0 .. $#{$stemma->distance_trees} ) { |
99 | my $dc = analyze_variant_location( $group_readings, $groups, |
100 | $stemma->distance_apsps->[$tree] ); |
101 | foreach my $rdg ( keys %$dc ) { |
102 | my $var = $dc->{$rdg}; |
103 | print "\tReadings '$rdg' and '$var' disregarded by parsimony on tree $tree\n"; |
104 | } |
40f19742 |
105 | } |
106 | |
107 | # Record that we used this variant in an analysis |
0e476982 |
108 | push( @$html_data, [ $rank, $readings, $sc ] ); |
40f19742 |
109 | } |
0e476982 |
110 | |
e79c23c7 |
111 | # Save the stemma picture |
112 | open( STEMMA, ">stemma_graph.svg" ) or die "Could not open stemma graph to write"; |
113 | binmode STEMMA, ":utf8"; |
114 | print STEMMA $stemma->as_svg; |
115 | close STEMMA; |
40f19742 |
116 | |
0e476982 |
117 | # Save the used variants as an HTML table |
118 | open( TABLE, ">variant_table.html" ) or die "Could not save variant table"; |
119 | binmode TABLE, ":utf8"; |
120 | print TABLE "<table>\n"; |
121 | foreach my $row ( @$html_data ) { |
122 | my( $rank, $readings, $sc ) = @$row; |
123 | # Do we have a stemma conflict or a distance-tree conflict? |
124 | my $class = scalar keys %$sc ? 'coincidental' : 'genealogical'; |
125 | print TABLE sprintf( "\t<tr id=\"%s\" class=\"%s\">\n", "variant-$rank", $class ); |
126 | # Table row header should be the graph rank. |
127 | print TABLE "\t\t<th>$rank</th>\n"; |
128 | my $ctr = 0; |
129 | foreach my $rdg ( @$readings ) { |
130 | print TABLE sprintf( "\t\t<td id=\"%s\">%s</td>\n", "item-$rank-$ctr", $rdg ); |
131 | $ctr++; |
132 | } |
133 | # Pad out the table - is this necessary I wonder? |
134 | while( $ctr++ < $html_columns ) { |
135 | print TABLE "\t\t<td/>\n"; |
136 | } |
137 | print TABLE "\t</tr>\n"; |
138 | } |
139 | print TABLE "</table>\n"; |
140 | |
141 | printf( "Ran analysis on %d / %d variant locations\n", scalar @$html_data, $total ); |
142 | |
40f19742 |
143 | sub analyze_variant_location { |
144 | my( $group_readings, $groups, $apsp ) = @_; |
a5d53377 |
145 | my %contig; |
40f19742 |
146 | my $conflict = {}; |
147 | foreach my $g ( sort { scalar @$b <=> scalar @$a } @$groups ) { |
148 | my @members = @$g; |
149 | my $gst = wit_stringify( $g ); |
150 | map { $contig{$_} = $gst } @members; # The witnesses need themselves to be |
151 | # in their collection. |
a5d53377 |
152 | next unless @members > 1; |
a5d53377 |
153 | my $curr = pop @members; |
154 | foreach my $m ( @members ) { |
40f19742 |
155 | foreach my $v ( $apsp->path_vertices( $curr, $m ) ) { |
156 | $contig{$v} = $gst unless exists $contig{$v}; |
157 | next if $contig{$v} eq $gst; |
158 | # print STDERR "Conflict at $v between group $gst and group " |
a5d53377 |
159 | # . $contig{$v} . "\n"; |
40f19742 |
160 | # Record what is conflicting. |
161 | $conflict->{$group_readings->{$gst}} = $group_readings->{$contig{$v}}; |
a5d53377 |
162 | } |
163 | } |
164 | } |
40f19742 |
165 | return $conflict; |
a5d53377 |
166 | } |
a5d53377 |
167 | |
168 | # Add the variant, subject to a.c. representation logic. |
169 | # This assumes that we will see the 'main' version before the a.c. version. |
170 | sub add_variant_wit { |
171 | my( $arr, $wit ) = @_; |
172 | my $acstr = $tradition->collation->ac_label; |
173 | my $skip; |
174 | if( $wit =~ /^(.*)\Q$acstr\E$/ ) { |
175 | my $real = $1; |
176 | $skip = grep { $_ =~ /^\Q$real\E$/ } @$arr; |
177 | } |
178 | push( @$arr, $wit ) unless $skip; |
179 | } |
180 | |
181 | # Return an answer if the variant is useful, i.e. if there are at least 2 variants |
182 | # with at least 2 witnesses each. |
183 | sub useful_variant { |
184 | my( $readings ) = @_; |
185 | my $total = keys %$readings; |
186 | foreach my $var ( keys %$readings ) { |
187 | $total-- if @{$readings->{$var}} == 1; |
188 | } |
189 | return( undef, undef ) if $total <= 1; |
190 | my( $groups, $text ); |
191 | foreach my $var ( keys %$readings ) { |
40f19742 |
192 | push( @$groups, $readings->{$var} ); |
a5d53377 |
193 | push( @$text, $var ); |
194 | } |
195 | return( $groups, $text ); |
196 | } |
40f19742 |
197 | |
198 | # Take an array of witness groupings and produce a string like |
199 | # A,B / C,D,E / F |
200 | |
201 | sub wit_stringify { |
202 | my $groups = shift; |
203 | my @gst; |
204 | # If we were passed an array of witnesses instead of an array of |
205 | # groupings, then "group" the witnesses first. |
206 | unless( ref( $groups->[0] ) ) { |
207 | my $mkgrp = [ $groups ]; |
208 | $groups = $mkgrp; |
209 | } |
210 | foreach my $g ( @$groups ) { |
211 | push( @gst, join( ',', @$g ) ); |
212 | } |
213 | return join( ' / ', @gst ); |
214 | } |
215 | |