Commit | Line | Data |
d9e873d0 |
1 | package Text::Tradition::Parser::Tabular; |
2 | |
3 | use strict; |
4 | use warnings; |
3a3b8213 |
5 | use Encode qw/ decode_utf8 /; |
82fa4d57 |
6 | use Text::CSV; |
701ad2ba |
7 | use Text::Tradition::Error; |
8 | use TryCatch; |
d9e873d0 |
9 | |
10 | =head1 NAME |
11 | |
12 | Text::Tradition::Parser::Tabular |
13 | |
3b853983 |
14 | =head1 SYNOPSIS |
15 | |
16 | use Text::Tradition; |
17 | |
18 | my $t_from_file = Text::Tradition->new( |
19 | 'name' => 'my text', |
20 | 'input' => 'Tabular', |
21 | 'file' => '/path/to/collation.csv', |
22 | 'sep_char' => ',' |
23 | ); |
24 | |
25 | my $t_from_string = Text::Tradition->new( |
26 | 'name' => 'my text', |
27 | 'input' => 'Tabular', |
28 | 'string' => $tab_separated_collation, |
29 | 'sep_char' => "\t", |
30 | ); |
31 | |
d9e873d0 |
32 | =head1 DESCRIPTION |
33 | |
34 | Parser module for Text::Tradition to read an alignment table format, such as CSV. |
35 | |
36 | =head1 METHODS |
37 | |
e867486f |
38 | =head2 B<parse>( $tradition, $option_hash ) |
3b853983 |
39 | |
40 | Takes an initialized tradition and a set of options; creates the |
41 | appropriate nodes and edges on the graph, as well as the appropriate |
42 | witness objects. The $option_hash must contain either a 'file' or a |
43 | 'string' argument with the table to be parsed; it may also contain a |
44 | 'sep_char' argument to specify how the fields are separated. |
45 | |
46 | The table should have witnesses arranged in columns, with the witness sigla |
47 | in the first row. Empty cells are interpreted as omissions (and thus |
48 | stemmatologically relevant.) Longer lacunae in the text, to be disregarded |
49 | in cladistic analysis, may be represented by filling the appropriate cells |
50 | with the tag '#LACUNA#'. |
51 | |
52 | If a witness name ends in the collation's ac_label, it will be treated as |
53 | an 'ante-correction' version of the 'main' witness whose sigil it shares. |
54 | |
55 | =begin testing |
56 | |
57 | use Text::Tradition; |
58 | binmode STDOUT, ":utf8"; |
59 | binmode STDERR, ":utf8"; |
60 | eval { no warnings; binmode $DB::OUT, ":utf8"; }; |
61 | |
62 | my $csv = 't/data/florilegium.csv'; |
63 | my $t = Text::Tradition->new( |
64 | 'name' => 'inline', |
65 | 'input' => 'Tabular', |
66 | 'file' => $csv, |
67 | 'sep_char' => ',', |
68 | ); |
d9e873d0 |
69 | |
3b853983 |
70 | is( ref( $t ), 'Text::Tradition', "Parsed florilegium CSV file" ); |
d9e873d0 |
71 | |
3b853983 |
72 | ### TODO Check these figures |
73 | if( $t ) { |
0e47f4f6 |
74 | is( scalar $t->collation->readings, 311, "Collation has all readings" ); |
75 | is( scalar $t->collation->paths, 361, "Collation has all paths" ); |
3b853983 |
76 | is( scalar $t->witnesses, 13, "Collation has all witnesses" ); |
77 | } |
78 | |
b0b4421a |
79 | # Check that we have the right witnesses |
80 | my %seen_wits; |
81 | map { $seen_wits{$_} = 0 } qw/ A B C D E F G H K P Q S T /; |
82 | foreach my $wit ( $t->witnesses ) { |
83 | $seen_wits{$wit->sigil} = 1; |
84 | } |
85 | is( scalar keys %seen_wits, 13, "No extra witnesses were made" ); |
86 | foreach my $k ( keys %seen_wits ) { |
87 | ok( $seen_wits{$k}, "Witness $k still exists" ); |
88 | } |
89 | |
90 | # Check that the witnesses have the right texts |
91 | foreach my $wit ( $t->witnesses ) { |
92 | my $origtext = join( ' ', @{$wit->text} ); |
93 | my $graphtext = $t->collation->path_text( $wit->sigil ); |
94 | is( $graphtext, $origtext, "Collation matches original for witness " . $wit->sigil ); |
95 | } |
96 | |
97 | # Check that the a.c. witnesses have the right text |
98 | map { $seen_wits{$_} = 0 } qw/ A B C D F G H K S /; |
99 | foreach my $k ( keys %seen_wits ) { |
100 | my $wit = $t->witness( $k ); |
101 | if( $seen_wits{$k} ) { |
102 | ok( $wit->is_layered, "Witness $k got marked as layered" ); |
103 | ok( $wit->has_layertext, "Witness $k has an a.c. version" ); |
104 | my $origtext = join( ' ', @{$wit->layertext} ); |
105 | my $acsig = $wit->sigil . $t->collation->ac_label; |
861c3e27 |
106 | my $graphtext = $t->collation->path_text( $acsig ); |
b0b4421a |
107 | is( $graphtext, $origtext, "Collation matches original a.c. for witness $k" ); |
108 | } else { |
109 | ok( !$wit->is_layered, "Witness $k not marked as layered" ); |
110 | ok( !$wit->has_layertext, "Witness $k has no a.c. version" ); |
111 | } |
112 | } |
113 | |
cc31ebaa |
114 | # Check that we only have collation relationships where we need them |
115 | is( scalar $t->collation->relationships, 3, "Redundant collations were removed" ); |
cc31ebaa |
116 | |
701ad2ba |
117 | ## Check excel parsing |
118 | |
119 | my $xls = 't/data/armexample.xls'; |
120 | my $xt = Text::Tradition->new( |
121 | 'name' => 'excel test', |
122 | 'input' => 'Tabular', |
123 | 'file' => $xls, |
3a3b8213 |
124 | 'excel' => 'xls' |
701ad2ba |
125 | ); |
126 | |
127 | is( ref( $xt ), 'Text::Tradition', "Parsed test Excel 97-2004 file" ); |
128 | my %xls_wits; |
129 | map { $xls_wits{$_} = 0 } qw/ Wit1 Wit2 Wit3 /; |
130 | foreach my $wit ( $xt->witnesses ) { |
131 | $xls_wits{$wit->sigil} = 1; |
132 | } |
133 | is( scalar keys %xls_wits, 3, "No extra witnesses were made" ); |
134 | foreach my $k ( keys %xls_wits ) { |
135 | ok( $xls_wits{$k}, "Witness $k still exists" ); |
136 | } |
137 | is( scalar $xt->collation->readings, 11, "Got correct number of test readings" ); |
138 | is( scalar $xt->collation->paths, 13, "Got correct number of reading paths" ); |
139 | is( $xt->collation->reading('r5.1')->text, "\x{587}", |
140 | "Correct decoding of at least one reading" ); |
141 | |
3a3b8213 |
142 | my $xlsx = 't/data/armexample.xlsx'; |
143 | my $xtx = Text::Tradition->new( |
144 | 'name' => 'excel test', |
145 | 'input' => 'Tabular', |
146 | 'file' => $xlsx, |
147 | 'excel' => 'xlsx' |
148 | ); |
149 | |
150 | is( ref( $xtx ), 'Text::Tradition', "Parsed test Excel 2007+ file" ); |
151 | my %xlsx_wits; |
152 | map { $xlsx_wits{$_} = 0 } qw/ Wit1 Wit2 Wit3 /; |
153 | foreach my $wit ( $xtx->witnesses ) { |
154 | $xlsx_wits{$wit->sigil} = 1; |
155 | } |
156 | is( scalar keys %xlsx_wits, 3, "No extra witnesses were made" ); |
157 | foreach my $k ( keys %xlsx_wits ) { |
158 | ok( $xlsx_wits{$k}, "Witness $k still exists" ); |
159 | } |
160 | is( scalar $xtx->collation->readings, 12, "Got correct number of test readings" ); |
161 | is( scalar $xtx->collation->paths, 14, "Got correct number of reading paths" ); |
162 | is( $xtx->collation->reading('r5.1')->text, "\x{587}", |
163 | "Correct decoding of at least one reading" ); |
164 | |
3b853983 |
165 | =end testing |
d9e873d0 |
166 | |
167 | =cut |
168 | |
169 | sub parse { |
dfc37e38 |
170 | my( $tradition, $opts ) = @_; |
3a3b8213 |
171 | my $alignment_table = _table_from_input( $opts ); |
d9e873d0 |
172 | # Set up the witnesses we find in the first line |
173 | my @witnesses; |
b0b4421a |
174 | my %ac_wits; # Track layered witness -> main witness mapping |
3a3b8213 |
175 | my $c = $tradition->collation; # shorthand |
82fa4d57 |
176 | my $aclabel = $c->ac_label; |
d9e873d0 |
177 | foreach my $sigil ( @{$alignment_table->[0]} ) { |
3b853983 |
178 | if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) { |
82fa4d57 |
179 | # Sanitize the sigil name to an XML name |
180 | $sigil = $1 . '_layered'; |
b0b4421a |
181 | $ac_wits{$sigil} = $1; |
3b853983 |
182 | } |
82fa4d57 |
183 | my $wit = $tradition->add_witness( |
184 | 'sigil' => $sigil, 'sourcetype' => 'collation' ); |
185 | $wit->path( [ $c->start ] ); |
186 | push( @witnesses, $wit ); |
187 | my $aclabel = $c->ac_label; |
d9e873d0 |
188 | } |
189 | |
b0b4421a |
190 | # Save the original witness text sequences. Have to loop back through |
191 | # the witness columns after we have identified all the a.c. witnesses. |
192 | foreach my $idx ( 0 .. $#{$alignment_table->[0]} ) { |
193 | my @sequence = map { $_->[$idx] } @{$alignment_table}; |
194 | my $sigil = shift @sequence; |
195 | my $is_layer = exists( $ac_wits{$sigil} ); |
196 | my $wit = $tradition->witness( $is_layer ? $ac_wits{$sigil} : $sigil ); |
197 | # Now get rid of gaps and meta-readings like #LACUNA# |
198 | my @words = grep { $_ && $_ !~ /^\#.*\#$/ } @sequence; |
199 | $is_layer ? $wit->layertext( \@words ) : $wit->text( \@words ); |
200 | } |
201 | |
9bdf9d67 |
202 | my $nocollate = ( scalar( @witnesses ) * scalar @$alignment_table ) > 150000; |
203 | print STDERR "Tradition too big for row collation\n" if $nocollate; |
204 | |
d9e873d0 |
205 | # Now for the next rows, make nodes as necessary, assign their ranks, and |
206 | # add them to the witness paths. |
d9e873d0 |
207 | foreach my $idx ( 1 .. $#{$alignment_table} ) { |
208 | my $row = $alignment_table->[$idx]; |
9bdf9d67 |
209 | my $nodes = _make_nodes( $c, $row, $idx, $nocollate ); |
d9e873d0 |
210 | foreach my $w ( 0 .. $#{$row} ) { |
211 | # push the appropriate node onto the appropriate witness path |
212 | my $word = $row->[$w]; |
213 | if( $word ) { |
214 | my $reading = $nodes->{$word}; |
215 | my $wit = $witnesses[$w]; |
216 | push( @{$wit->path}, $reading ); |
217 | } # else skip it for empty readings. |
218 | } |
219 | } |
220 | |
eca16057 |
221 | # Collapse our lacunae into a single node and |
222 | # push the end node onto all paths. |
d9e873d0 |
223 | $c->end->rank( scalar @$alignment_table ); |
224 | foreach my $wit ( @witnesses ) { |
eca16057 |
225 | my $p = $wit->path; |
226 | my $last_rdg = shift @$p; |
227 | my $new_p = [ $last_rdg ]; |
228 | foreach my $rdg ( @$p ) { |
83d5ac3a |
229 | # Omit the reading if we are in a lacuna already. |
230 | next if $rdg->is_lacuna && $last_rdg->is_lacuna; |
231 | # Save the reading otherwise. |
232 | push( @$new_p, $rdg ); |
233 | $last_rdg = $rdg; |
eca16057 |
234 | } |
235 | push( @$new_p, $c->end ); |
236 | $wit->path( $new_p ); |
d9e873d0 |
237 | } |
238 | |
3b853983 |
239 | # Fold any a.c. witnesses into their main witness objects, and |
240 | # delete the independent a.c. versions. |
241 | foreach my $a ( keys %ac_wits ) { |
b0b4421a |
242 | my $ac_wit = $tradition->witness( $a ); |
243 | my $main_wit = $tradition->witness( $ac_wits{$a} ); |
3b853983 |
244 | next unless $main_wit; |
861c3e27 |
245 | $main_wit->is_layered(1); |
3b853983 |
246 | $main_wit->uncorrected_path( $ac_wit->path ); |
247 | $tradition->del_witness( $ac_wit ); |
248 | } |
83d5ac3a |
249 | |
d9e873d0 |
250 | # Join up the paths. |
251 | $c->make_witness_paths; |
83d5ac3a |
252 | # Delete our unused lacuna nodes. |
253 | foreach my $rdg ( grep { $_->is_lacuna } $c->readings ) { |
254 | $c->del_reading( $rdg ) unless $c->reading_witnesses( $rdg ); |
255 | } |
861c3e27 |
256 | |
257 | # Do a consistency check. |
258 | foreach my $wit ( $tradition->witnesses ) { |
259 | my $pathtext = $c->path_text( $wit->sigil ); |
260 | my $origtext = join( ' ', @{$wit->text} ); |
261 | warn "Text differs for witness " . $wit->sigil |
262 | unless $pathtext eq $origtext; |
263 | if( $wit->is_layered ) { |
264 | $pathtext = $c->path_text( $wit->sigil.$c->ac_label ); |
265 | $origtext = join( ' ', @{$wit->layertext} ); |
266 | warn "Ante-corr text differs for witness " . $wit->sigil |
267 | unless $pathtext eq $origtext; |
268 | } else { |
269 | warn "Text " . $wit->sigil . " has a layered text but is not marked as layered" |
270 | if $wit->has_layertext; |
271 | } |
272 | } |
202ccb18 |
273 | |
274 | # Note that our ranks and common readings are set. |
275 | $c->_graphcalc_done(1); |
cc31ebaa |
276 | # Remove redundant collation relationships. |
9bdf9d67 |
277 | $c->relations->filter_collations() unless $nocollate; |
d9e873d0 |
278 | } |
279 | |
3a3b8213 |
280 | sub _table_from_input { |
281 | my $opts = shift; |
282 | my $alignment_table = []; |
283 | if( $opts->{'excel'} ) { |
284 | my $sheet; |
285 | my $need_decode; |
286 | unless( exists $opts->{'file'} ) { |
287 | throw( "Must pass the filename for Excel parsing" ); |
288 | } |
289 | if( $opts->{'excel'} eq 'xls' ) { |
290 | try { |
291 | require Spreadsheet::ParseExcel; |
292 | } catch { |
293 | throw( "Need module Spreadsheet::ParseExcel to parse .xls files" ); |
294 | } |
295 | my $parser = Spreadsheet::ParseExcel->new(); |
296 | my $workbook = $parser->parse( $opts->{'file'} ); |
297 | unless( defined $workbook && defined $workbook->worksheet(0) ) { |
298 | throw( "Failed to parse file " . $opts->{'file'} . ": " . $parser->error() ); |
299 | } |
300 | $sheet = $workbook->worksheet(0); |
301 | } elsif( $opts->{'excel'} eq 'xlsx' ) { |
302 | try { |
303 | require Spreadsheet::XLSX; |
304 | } catch { |
305 | throw( "Need module Spreadsheet::XLSX to parse .xlsx files" ); |
306 | } |
307 | $need_decode = 1; |
308 | my $workbook; |
309 | try { |
310 | $workbook = Spreadsheet::XLSX->new( $opts->{'file'} ); |
311 | } catch { |
312 | throw( "Failed to parse file " . $opts->{'file'} ); |
313 | } |
314 | $sheet = $workbook->worksheet(0); |
315 | } else { |
316 | throw( "Unrecognized Excel variant" . $opts->{'excel'} ); |
317 | } |
318 | $alignment_table = _alignment_from_worksheet( $sheet, $need_decode ); |
319 | } else { |
320 | # Assume it is a comma-, tab-, or whatever-separated format. |
321 | my $csv_options = { 'binary' => 1 }; |
322 | $csv_options->{'sep_char'} = $opts->{'sep_char'} || "\t"; |
323 | if( $csv_options->{'sep_char'} eq "\t" ) { |
324 | # If it is really tab separated, nothing is an escape char. |
325 | $csv_options->{'quote_char'} = undef; |
326 | $csv_options->{'escape_char'} = undef; |
327 | } |
328 | my $csv = Text::CSV->new( $csv_options ); |
329 | |
330 | if( exists $opts->{'string' } ) { |
331 | my @lines = split( "\n", $opts->{'string'} ); |
332 | foreach my $l ( @lines ) { |
333 | my $status = $csv->parse( $l ); |
334 | if( $status ) { |
335 | push( @$alignment_table, [ $csv->fields ] ); |
336 | } else { |
337 | throw( "Could not parse line $l: " . $csv->error_input ); |
338 | } |
339 | } |
340 | } elsif( exists $opts->{'file'} ) { |
341 | open( my $fh, $opts->{'file'} ) |
342 | or warn "Could not open input file " . $opts->{'file'}; |
343 | binmode( $fh, ':utf8' ); |
344 | while( my $row = $csv->getline( $fh ) ) { |
345 | push( @$alignment_table, $row ); |
346 | } |
347 | close $fh; |
348 | } else { |
349 | throw( "Could not find string or file option to parse" ); |
350 | } |
351 | } |
352 | return $alignment_table; |
353 | } |
354 | sub _alignment_from_worksheet { |
355 | my( $sheet, $decode ) = @_; |
356 | my $alignment_table = []; |
357 | |
358 | my( $rmin, $rmax ) = $sheet->row_range(); |
359 | my( $cmin, $cmax ) = $sheet->col_range(); |
360 | unless( $cmax && $rmax ) { |
361 | throw( "Found no rows or no columns in first worksheet" ); |
362 | } |
363 | # Populate the alignment table. We only want columns that have |
364 | # a sigil in row zero. |
365 | my %sigcols = (); |
366 | push( @$alignment_table, [] ); |
367 | foreach my $col ( $cmin .. $cmax ) { |
368 | my $cell = $sheet->get_cell( $rmin, $col ); |
369 | my $cellval = $cell ? $cell->value() : undef; |
370 | if( $cellval ) { |
371 | $sigcols{$col} = 1; |
372 | push( @{$alignment_table->[0]}, $cellval ); |
373 | } |
374 | } |
375 | # Now go through the rest of the rows and pick up the columns |
376 | # that were headed by a sigil. |
377 | foreach my $row ( $rmin+1 .. $rmax ) { |
378 | my @tablerow; |
379 | foreach my $col ( $cmin .. $cmax ) { |
380 | next unless $sigcols{$col}; |
381 | my $cell = $sheet->get_cell( $row, $col ); |
382 | my $cellval; |
383 | if( $cell ) { |
384 | $cellval = $decode ? decode_utf8( $cell->value ) : $cell->value; |
385 | } |
386 | push( @tablerow, $cellval ); |
387 | } |
388 | push( @$alignment_table, \@tablerow ); |
389 | } |
390 | return $alignment_table; |
391 | } |
392 | |
027d819c |
393 | sub _make_nodes { |
9bdf9d67 |
394 | my( $collation, $row, $index, $nocollate ) = @_; |
d9e873d0 |
395 | my %unique; |
15db7774 |
396 | my $commonctr = 0; # Holds the number of unique readings + gaps, ex. lacunae. |
d9e873d0 |
397 | foreach my $w ( @$row ) { |
398 | $unique{$w} = 1 if $w; |
15db7774 |
399 | $commonctr +=1 unless ( $w && $w eq '#LACUNA#' ); |
d9e873d0 |
400 | } |
401 | my $ctr = 1; |
402 | foreach my $w ( keys %unique ) { |
a753cc84 |
403 | my $rargs = { |
10e4b1ac |
404 | 'id' => "r$index.$ctr", |
a753cc84 |
405 | 'rank' => $index, |
406 | 'text' => $w, |
407 | }; |
15db7774 |
408 | if( $w eq '#LACUNA#' ) { |
409 | $rargs->{'is_lacuna'} = 1; |
410 | } elsif( $commonctr == 1 ) { |
411 | $rargs->{'is_common'} = 1; |
412 | } |
a753cc84 |
413 | my $r = $collation->add_reading( $rargs ); |
d9e873d0 |
414 | $unique{$w} = $r; |
a753cc84 |
415 | $ctr++; |
d9e873d0 |
416 | } |
bf6e338d |
417 | # Collate this sequence of readings via a single 'collation' relationship. |
9bdf9d67 |
418 | unless( $nocollate ) { |
419 | my @rankrdgs = values %unique; |
420 | my $collation_rel; |
421 | while( @rankrdgs ) { |
422 | my $r = shift @rankrdgs; |
423 | next if $r->is_meta; |
424 | foreach my $nr ( @rankrdgs ) { |
425 | next if $nr->is_meta; |
426 | if( $collation_rel ) { |
427 | $collation->add_relationship( $r, $nr, $collation_rel ); |
428 | } else { |
429 | $collation->add_relationship( $r, $nr, |
430 | { 'type' => 'collated', |
431 | 'annotation' => "Parsed together for rank $index" } ); |
432 | $collation_rel = $collation->get_relationship( $r, $nr ); |
433 | } |
434 | } |
435 | } |
436 | } |
d9e873d0 |
437 | return \%unique; |
438 | } |
439 | |
3a3b8213 |
440 | sub throw { |
441 | Text::Tradition::Error->throw( |
442 | 'ident' => 'Parser::Tabular error', |
443 | 'message' => $_[0], |
444 | ); |
445 | } |
446 | |
3b853983 |
447 | 1; |
448 | |
449 | =head1 LICENSE |
450 | |
451 | This package is free software and is provided "as is" without express |
452 | or implied warranty. You can redistribute it and/or modify it under |
453 | the same terms as Perl itself. |
454 | |
455 | =head1 AUTHOR |
456 | |
457 | Tara L Andrews E<lt>aurum@cpan.orgE<gt> |