Commit | Line | Data |
d9e873d0 |
1 | package Text::Tradition::Parser::Tabular; |
2 | |
3 | use strict; |
4 | use warnings; |
5 | use Text::CSV_XS; |
6 | |
7 | =head1 NAME |
8 | |
9 | Text::Tradition::Parser::Tabular |
10 | |
3b853983 |
11 | =head1 SYNOPSIS |
12 | |
13 | use Text::Tradition; |
14 | |
15 | my $t_from_file = Text::Tradition->new( |
16 | 'name' => 'my text', |
17 | 'input' => 'Tabular', |
18 | 'file' => '/path/to/collation.csv', |
19 | 'sep_char' => ',' |
20 | ); |
21 | |
22 | my $t_from_string = Text::Tradition->new( |
23 | 'name' => 'my text', |
24 | 'input' => 'Tabular', |
25 | 'string' => $tab_separated_collation, |
26 | 'sep_char' => "\t", |
27 | ); |
28 | |
d9e873d0 |
29 | =head1 DESCRIPTION |
30 | |
31 | Parser module for Text::Tradition to read an alignment table format, such as CSV. |
32 | |
33 | =head1 METHODS |
34 | |
e867486f |
35 | =head2 B<parse>( $tradition, $option_hash ) |
3b853983 |
36 | |
37 | Takes an initialized tradition and a set of options; creates the |
38 | appropriate nodes and edges on the graph, as well as the appropriate |
39 | witness objects. The $option_hash must contain either a 'file' or a |
40 | 'string' argument with the table to be parsed; it may also contain a |
41 | 'sep_char' argument to specify how the fields are separated. |
42 | |
43 | The table should have witnesses arranged in columns, with the witness sigla |
44 | in the first row. Empty cells are interpreted as omissions (and thus |
45 | stemmatologically relevant.) Longer lacunae in the text, to be disregarded |
46 | in cladistic analysis, may be represented by filling the appropriate cells |
47 | with the tag '#LACUNA#'. |
48 | |
49 | If a witness name ends in the collation's ac_label, it will be treated as |
50 | an 'ante-correction' version of the 'main' witness whose sigil it shares. |
51 | |
52 | =begin testing |
53 | |
54 | use Text::Tradition; |
55 | binmode STDOUT, ":utf8"; |
56 | binmode STDERR, ":utf8"; |
57 | eval { no warnings; binmode $DB::OUT, ":utf8"; }; |
58 | |
59 | my $csv = 't/data/florilegium.csv'; |
60 | my $t = Text::Tradition->new( |
61 | 'name' => 'inline', |
62 | 'input' => 'Tabular', |
63 | 'file' => $csv, |
64 | 'sep_char' => ',', |
65 | ); |
d9e873d0 |
66 | |
3b853983 |
67 | is( ref( $t ), 'Text::Tradition', "Parsed florilegium CSV file" ); |
d9e873d0 |
68 | |
3b853983 |
69 | ### TODO Check these figures |
70 | if( $t ) { |
0e47f4f6 |
71 | is( scalar $t->collation->readings, 311, "Collation has all readings" ); |
72 | is( scalar $t->collation->paths, 361, "Collation has all paths" ); |
3b853983 |
73 | is( scalar $t->witnesses, 13, "Collation has all witnesses" ); |
74 | } |
75 | |
b0b4421a |
76 | # Check that we have the right witnesses |
77 | my %seen_wits; |
78 | map { $seen_wits{$_} = 0 } qw/ A B C D E F G H K P Q S T /; |
79 | foreach my $wit ( $t->witnesses ) { |
80 | $seen_wits{$wit->sigil} = 1; |
81 | } |
82 | is( scalar keys %seen_wits, 13, "No extra witnesses were made" ); |
83 | foreach my $k ( keys %seen_wits ) { |
84 | ok( $seen_wits{$k}, "Witness $k still exists" ); |
85 | } |
86 | |
87 | # Check that the witnesses have the right texts |
88 | foreach my $wit ( $t->witnesses ) { |
89 | my $origtext = join( ' ', @{$wit->text} ); |
90 | my $graphtext = $t->collation->path_text( $wit->sigil ); |
91 | is( $graphtext, $origtext, "Collation matches original for witness " . $wit->sigil ); |
92 | } |
93 | |
94 | # Check that the a.c. witnesses have the right text |
95 | map { $seen_wits{$_} = 0 } qw/ A B C D F G H K S /; |
96 | foreach my $k ( keys %seen_wits ) { |
97 | my $wit = $t->witness( $k ); |
98 | if( $seen_wits{$k} ) { |
99 | ok( $wit->is_layered, "Witness $k got marked as layered" ); |
100 | ok( $wit->has_layertext, "Witness $k has an a.c. version" ); |
101 | my $origtext = join( ' ', @{$wit->layertext} ); |
102 | my $acsig = $wit->sigil . $t->collation->ac_label; |
861c3e27 |
103 | my $graphtext = $t->collation->path_text( $acsig ); |
b0b4421a |
104 | is( $graphtext, $origtext, "Collation matches original a.c. for witness $k" ); |
105 | } else { |
106 | ok( !$wit->is_layered, "Witness $k not marked as layered" ); |
107 | ok( !$wit->has_layertext, "Witness $k has no a.c. version" ); |
108 | } |
109 | } |
110 | |
3b853983 |
111 | =end testing |
d9e873d0 |
112 | |
113 | =cut |
114 | |
# parse( $tradition, $opts )
#
# Fill an initialized tradition from an alignment table.
#
#   $tradition - the tradition object to populate; its collation is used to
#                create readings and to join up the witness paths.
#   $opts      - hashref holding either 'string' (the table as text) or
#                'file' (path to the table), and optionally 'sep_char'
#                (defaults to a tab).
#
# Warns and returns nothing if neither 'string' nor 'file' is given, or if
# the file cannot be opened. Rows that cannot be parsed produce a warning.
# Cells are tested for truth, so an empty cell (and also the string '0') is
# treated as a gap in that witness.
sub parse {
    my( $tradition, $opts ) = @_;
    my $c = $tradition->collation; # shorthand

    my $csv_options = { 'binary' => 1 };
    $csv_options->{'sep_char'} = $opts->{'sep_char'} || "\t";
    if( $csv_options->{'sep_char'} eq "\t" ) {
        # If it is really tab separated, nothing is an escape char.
        $csv_options->{'quote_char'} = undef;
        $csv_options->{'escape_char'} = undef;
    }
    my $csv = Text::CSV_XS->new( $csv_options );

    # Read the table into an array of rows, each row an array of cells.
    my $alignment_table = [];
    if( exists $opts->{'string'} ) {
        foreach my $line ( split( "\n", $opts->{'string'} ) ) {
            if( $csv->parse( $line ) ) {
                push( @$alignment_table, [ $csv->fields ] );
            } else {
                warn "Could not parse line $line: " . $csv->error_input;
            }
        }
    } elsif( exists $opts->{'file'} ) {
        my $file = $opts->{'file'};
        # Three-argument open, so that the file name can never be read as
        # an open mode or a pipe.
        my $fh;
        unless( open( $fh, '<:utf8', $file ) ) {
            warn "Could not open input file $file: $!";
            return;
        }
        while( my $row = $csv->getline( $fh ) ) {
            push( @$alignment_table, $row );
        }
        # getline returns false both at end of file and on a parse error;
        # only the former is a normal way to finish.
        warn "Stopped reading $file before end of file: " . $csv->error_diag
            unless $csv->eof;
        close $fh;
    } else {
        warn "Could not find string or file option to parse";
        return;
    }

    # Set up the witnesses we find in the first line
    my @witnesses;
    my %ac_wits;  # Track layered witness -> main witness mapping
    my $aclabel = $c->ac_label;
    foreach my $sigil ( @{$alignment_table->[0]} ) {
        my $wit = $tradition->add_witness( 'sigil' => $sigil );
        $wit->path( [ $c->start ] );
        push( @witnesses, $wit );
        if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) {
            $ac_wits{$sigil} = $1;
        }
    }

    # A column that looks like an a.c. layer but whose main witness is not
    # in the table cannot be folded into anything. Treat it as an ordinary
    # witness instead of dereferencing a missing main witness below.
    # NOTE(review): relies on $tradition->witness returning false for an
    # unknown sigil, as the fold loop further down already assumes.
    foreach my $ac_sigil ( keys %ac_wits ) {
        next if $tradition->witness( $ac_wits{$ac_sigil} );
        warn "No main witness found for layered witness $ac_sigil;"
            . " treating it as an independent witness";
        delete $ac_wits{$ac_sigil};
    }

    # Save the original witness text sequences. Have to loop back through
    # the witness columns after we have identified all the a.c. witnesses.
    foreach my $idx ( 0 .. $#{$alignment_table->[0]} ) {
        my @sequence = map { $_->[$idx] } @{$alignment_table};
        my $sigil = shift @sequence;
        my $is_layer = exists( $ac_wits{$sigil} );
        my $wit = $tradition->witness( $is_layer ? $ac_wits{$sigil} : $sigil );
        # Now get rid of gaps and meta-readings like #LACUNA#
        my @words = grep { $_ && $_ !~ /^\#.*\#$/ } @sequence;
        if( $is_layer ) {
            $wit->layertext( \@words );
        } else {
            $wit->text( \@words );
        }
    }

    # Now for the next rows, make nodes as necessary, assign their ranks, and
    # add them to the witness paths.
    foreach my $idx ( 1 .. $#{$alignment_table} ) {
        my $row = $alignment_table->[$idx];
        my $nodes = _make_nodes( $c, $row, $idx );
        foreach my $w ( 0 .. $#{$row} ) {
            # push the appropriate node onto the appropriate witness path
            my $word = $row->[$w];
            next unless $word;  # skip it for empty readings.
            push( @{ $witnesses[$w]->path }, $nodes->{$word} );
        }
    }

    # Collapse our lacunae into a single node and
    # push the end node onto all paths.
    $c->end->rank( scalar @$alignment_table );
    foreach my $wit ( @witnesses ) {
        my $p = $wit->path;
        my $last_rdg = shift @$p;
        my $new_p = [ $last_rdg ];
        foreach my $rdg ( @$p ) {
            # Omit the reading if we are in a lacuna already.
            next if $rdg->is_lacuna && $last_rdg->is_lacuna;
            # Save the reading otherwise.
            push( @$new_p, $rdg );
            $last_rdg = $rdg;
        }
        push( @$new_p, $c->end );
        $wit->path( $new_p );
    }

    # Fold any a.c. witnesses into their main witness objects, and
    # delete the independent a.c. versions.
    foreach my $ac_sigil ( keys %ac_wits ) {
        my $ac_wit = $tradition->witness( $ac_sigil );
        my $main_wit = $tradition->witness( $ac_wits{$ac_sigil} );
        next unless $main_wit;
        $main_wit->is_layered(1);
        $main_wit->uncorrected_path( $ac_wit->path );
        $tradition->del_witness( $ac_wit );
    }

    # Join up the paths.
    $c->make_witness_paths;
    # Delete our unused lacuna nodes.
    foreach my $rdg ( grep { $_->is_lacuna } $c->readings ) {
        $c->del_reading( $rdg ) unless $c->reading_witnesses( $rdg );
    }

    # Do a consistency check: the text read back along each witness path
    # should equal the word sequence saved from that witness's column.
    foreach my $wit ( $tradition->witnesses ) {
        my $pathtext = $c->path_text( $wit->sigil );
        my $origtext = join( ' ', @{$wit->text} );
        warn "Text differs for witness " . $wit->sigil
            unless $pathtext eq $origtext;
        if( $wit->is_layered ) {
            $pathtext = $c->path_text( $wit->sigil.$c->ac_label );
            $origtext = join( ' ', @{$wit->layertext} );
            warn "Ante-corr text differs for witness " . $wit->sigil
                unless $pathtext eq $origtext;
        } else {
            warn "Text " . $wit->sigil . " has a layered text but is not marked as layered"
                if $wit->has_layertext;
        }
    }

    # Note that our ranks and common readings are set.
    $c->_graphcalc_done(1);
}
248 | |
027d819c |
# _make_nodes( $collation, $row, $index )
#
# Create one reading per distinct non-empty cell in a table row.
#
#   $collation - object on which add_reading is called for each new reading
#   $row       - arrayref of cell texts, one per witness column
#   $index     - row number; used as the reading's rank and in its ID
#
# Returns a hashref mapping each distinct cell text to whatever add_reading
# returned for it. The cell '#LACUNA#' becomes a reading flagged is_lacuna.
# A reading is flagged is_common when it is the only distinct reading in the
# row and no cell is empty; lacuna cells do not count against that.
# Cells are tested for truth, so the string '0' is treated as a gap.
sub _make_nodes {
    my( $collation, $row, $index ) = @_;
    my %unique;       # distinct cell text -> reading
    my @ordered;      # distinct cell texts in column order, for stable IDs
    my $has_gap = 0;
    foreach my $w ( @$row ) {
        if( $w ) {
            push( @ordered, $w ) unless exists $unique{$w};
            $unique{$w} = 1;
        } else {
            $has_gap = 1;
        }
    }

    # Holds the number of unique readings + gaps, ex. lacunae. Count each
    # distinct reading once, however many witnesses carry it.
    my $commonctr = scalar( grep { $_ ne '#LACUNA#' } @ordered );
    $commonctr++ if $has_gap;

    my $ctr = 1;
    foreach my $w ( @ordered ) {
        my $rargs = {
            'id' => "$index,$ctr",
            'rank' => $index,
            'text' => $w,
        };
        if( $w eq '#LACUNA#' ) {
            $rargs->{'is_lacuna'} = 1;
        } elsif( $commonctr == 1 ) {
            $rargs->{'is_common'} = 1;
        }
        $unique{$w} = $collation->add_reading( $rargs );
        $ctr++;
    }
    return \%unique;
}
275 | |
3b853983 |
276 | 1; |
277 | |
278 | =head1 LICENSE |
279 | |
280 | This package is free software and is provided "as is" without express |
281 | or implied warranty. You can redistribute it and/or modify it under |
282 | the same terms as Perl itself. |
283 | |
284 | =head1 AUTHOR |
285 | |
286 | Tara L Andrews E<lt>aurum@cpan.orgE<gt> |