Commit | Line | Data |
d9e873d0 |
1 | package Text::Tradition::Parser::Tabular; |
2 | |
3 | use strict; |
4 | use warnings; |
3a3b8213 |
5 | use Encode qw/ decode_utf8 /; |
82fa4d57 |
6 | use Text::CSV; |
701ad2ba |
7 | use Text::Tradition::Error; |
8 | use TryCatch; |
d9e873d0 |
9 | |
10 | =head1 NAME |
11 | |
12 | Text::Tradition::Parser::Tabular |
13 | |
3b853983 |
14 | =head1 SYNOPSIS |
15 | |
16 | use Text::Tradition; |
17 | |
18 | my $t_from_file = Text::Tradition->new( |
19 | 'name' => 'my text', |
20 | 'input' => 'Tabular', |
21 | 'file' => '/path/to/collation.csv', |
22 | 'sep_char' => ',' |
23 | ); |
24 | |
25 | my $t_from_string = Text::Tradition->new( |
26 | 'name' => 'my text', |
27 | 'input' => 'Tabular', |
28 | 'string' => $tab_separated_collation, |
29 | 'sep_char' => "\t", |
30 | ); |
31 | |
d9e873d0 |
32 | =head1 DESCRIPTION |
33 | |
a445ce40 |
34 | Parser module for Text::Tradition to read an alignment table format, such as |
35 | CSV or Excel. |
d9e873d0 |
36 | |
37 | =head1 METHODS |
38 | |
e867486f |
39 | =head2 B<parse>( $tradition, $option_hash ) |
3b853983 |
40 | |
41 | Takes an initialized tradition and a set of options; creates the |
42 | appropriate nodes and edges on the graph, as well as the appropriate |
a445ce40 |
43 | witness objects. The $option_hash can contain the following: |
44 | |
45 | =over |
46 | |
47 | =item * file - Name of file which contains the data |
48 | |
49 | =item * string - A string that itself contains the data. One of 'file' or |
50 | 'string' is required. |
51 | |
52 | =item * sep_char - For plaintext formats, the field separation character. |
53 | Defaults to "\t" (tab-separated); should be "," for comma-separated format. |
54 | |
55 | =item * excel - If the data is in an Excel file, this option should be set |
56 | to 'xls' (for pre-2007 Excel format) or 'xlsx' (for Excel XML format.) |
57 | |
58 | =back |
59 | |
60 | The data should comprise a table with witnesses arranged in columns, with |
61 | the witness sigla in the first row. Empty cells are interpreted as |
62 | omissions (and thus stemmatologically relevant.) Longer lacunae in the |
63 | text, to be disregarded in cladistic analysis, may be represented by |
64 | filling the appropriate cells with the tag '#LACUNA#'. |
3b853983 |
65 | |
66 | If a witness name ends in the collation's ac_label, it will be treated as |
67 | an 'ante-correction' version of the 'main' witness whose sigil it shares. |
68 | |
69 | =begin testing |
70 | |
71 | use Text::Tradition; |
72 | binmode STDOUT, ":utf8"; |
73 | binmode STDERR, ":utf8"; |
74 | eval { no warnings; binmode $DB::OUT, ":utf8"; }; |
75 | |
76 | my $csv = 't/data/florilegium.csv'; |
77 | my $t = Text::Tradition->new( |
78 | 'name' => 'inline', |
79 | 'input' => 'Tabular', |
80 | 'file' => $csv, |
81 | 'sep_char' => ',', |
82 | ); |
d9e873d0 |
83 | |
3b853983 |
84 | is( ref( $t ), 'Text::Tradition', "Parsed florilegium CSV file" ); |
d9e873d0 |
85 | |
3b853983 |
86 | ### TODO Check these figures |
87 | if( $t ) { |
0e47f4f6 |
88 | is( scalar $t->collation->readings, 311, "Collation has all readings" ); |
89 | is( scalar $t->collation->paths, 361, "Collation has all paths" ); |
3b853983 |
90 | is( scalar $t->witnesses, 13, "Collation has all witnesses" ); |
91 | } |
92 | |
b0b4421a |
93 | # Check that we have the right witnesses |
94 | my %seen_wits; |
95 | map { $seen_wits{$_} = 0 } qw/ A B C D E F G H K P Q S T /; |
96 | foreach my $wit ( $t->witnesses ) { |
97 | $seen_wits{$wit->sigil} = 1; |
98 | } |
99 | is( scalar keys %seen_wits, 13, "No extra witnesses were made" ); |
100 | foreach my $k ( keys %seen_wits ) { |
101 | ok( $seen_wits{$k}, "Witness $k still exists" ); |
102 | } |
103 | |
104 | # Check that the witnesses have the right texts |
105 | foreach my $wit ( $t->witnesses ) { |
106 | my $origtext = join( ' ', @{$wit->text} ); |
107 | my $graphtext = $t->collation->path_text( $wit->sigil ); |
108 | is( $graphtext, $origtext, "Collation matches original for witness " . $wit->sigil ); |
109 | } |
110 | |
111 | # Check that the a.c. witnesses have the right text |
112 | map { $seen_wits{$_} = 0 } qw/ A B C D F G H K S /; |
113 | foreach my $k ( keys %seen_wits ) { |
114 | my $wit = $t->witness( $k ); |
115 | if( $seen_wits{$k} ) { |
116 | ok( $wit->is_layered, "Witness $k got marked as layered" ); |
117 | ok( $wit->has_layertext, "Witness $k has an a.c. version" ); |
118 | my $origtext = join( ' ', @{$wit->layertext} ); |
119 | my $acsig = $wit->sigil . $t->collation->ac_label; |
861c3e27 |
120 | my $graphtext = $t->collation->path_text( $acsig ); |
b0b4421a |
121 | is( $graphtext, $origtext, "Collation matches original a.c. for witness $k" ); |
122 | } else { |
123 | ok( !$wit->is_layered, "Witness $k not marked as layered" ); |
124 | ok( !$wit->has_layertext, "Witness $k has no a.c. version" ); |
125 | } |
126 | } |
127 | |
cc31ebaa |
128 | # Check that we only have collation relationships where we need them |
129 | is( scalar $t->collation->relationships, 3, "Redundant collations were removed" ); |
cc31ebaa |
130 | |
701ad2ba |
131 | ## Check excel parsing |
132 | |
133 | my $xls = 't/data/armexample.xls'; |
134 | my $xt = Text::Tradition->new( |
135 | 'name' => 'excel test', |
136 | 'input' => 'Tabular', |
137 | 'file' => $xls, |
3a3b8213 |
138 | 'excel' => 'xls' |
701ad2ba |
139 | ); |
140 | |
141 | is( ref( $xt ), 'Text::Tradition', "Parsed test Excel 97-2004 file" ); |
142 | my %xls_wits; |
143 | map { $xls_wits{$_} = 0 } qw/ Wit1 Wit2 Wit3 /; |
144 | foreach my $wit ( $xt->witnesses ) { |
145 | $xls_wits{$wit->sigil} = 1; |
146 | } |
147 | is( scalar keys %xls_wits, 3, "No extra witnesses were made" ); |
148 | foreach my $k ( keys %xls_wits ) { |
149 | ok( $xls_wits{$k}, "Witness $k still exists" ); |
150 | } |
151 | is( scalar $xt->collation->readings, 11, "Got correct number of test readings" ); |
152 | is( scalar $xt->collation->paths, 13, "Got correct number of reading paths" ); |
153 | is( $xt->collation->reading('r5.1')->text, "\x{587}", |
154 | "Correct decoding of at least one reading" ); |
155 | |
3a3b8213 |
156 | my $xlsx = 't/data/armexample.xlsx'; |
157 | my $xtx = Text::Tradition->new( |
158 | 'name' => 'excel test', |
159 | 'input' => 'Tabular', |
160 | 'file' => $xlsx, |
161 | 'excel' => 'xlsx' |
162 | ); |
163 | |
164 | is( ref( $xtx ), 'Text::Tradition', "Parsed test Excel 2007+ file" ); |
165 | my %xlsx_wits; |
166 | map { $xlsx_wits{$_} = 0 } qw/ Wit1 Wit2 Wit3 /; |
167 | foreach my $wit ( $xtx->witnesses ) { |
168 | $xlsx_wits{$wit->sigil} = 1; |
169 | } |
170 | is( scalar keys %xlsx_wits, 3, "No extra witnesses were made" ); |
171 | foreach my $k ( keys %xlsx_wits ) { |
172 | ok( $xlsx_wits{$k}, "Witness $k still exists" ); |
173 | } |
174 | is( scalar $xtx->collation->readings, 12, "Got correct number of test readings" ); |
175 | is( scalar $xtx->collation->paths, 14, "Got correct number of reading paths" ); |
176 | is( $xtx->collation->reading('r5.1')->text, "\x{587}", |
177 | "Correct decoding of at least one reading" ); |
178 | |
3b853983 |
179 | =end testing |
d9e873d0 |
180 | |
181 | =cut |
182 | |
183 | sub parse { |
dfc37e38 |
184 | my( $tradition, $opts ) = @_; |
3a3b8213 |
185 | my $alignment_table = _table_from_input( $opts ); |
d9e873d0 |
186 | # Set up the witnesses we find in the first line |
187 | my @witnesses; |
b0b4421a |
188 | my %ac_wits; # Track layered witness -> main witness mapping |
3a3b8213 |
189 | my $c = $tradition->collation; # shorthand |
82fa4d57 |
190 | my $aclabel = $c->ac_label; |
d9e873d0 |
191 | foreach my $sigil ( @{$alignment_table->[0]} ) { |
3b853983 |
192 | if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) { |
82fa4d57 |
193 | # Sanitize the sigil name to an XML name |
194 | $sigil = $1 . '_layered'; |
b0b4421a |
195 | $ac_wits{$sigil} = $1; |
3b853983 |
196 | } |
82fa4d57 |
197 | my $wit = $tradition->add_witness( |
198 | 'sigil' => $sigil, 'sourcetype' => 'collation' ); |
199 | $wit->path( [ $c->start ] ); |
200 | push( @witnesses, $wit ); |
201 | my $aclabel = $c->ac_label; |
d9e873d0 |
202 | } |
203 | |
b0b4421a |
204 | # Save the original witness text sequences. Have to loop back through |
205 | # the witness columns after we have identified all the a.c. witnesses. |
206 | foreach my $idx ( 0 .. $#{$alignment_table->[0]} ) { |
207 | my @sequence = map { $_->[$idx] } @{$alignment_table}; |
208 | my $sigil = shift @sequence; |
209 | my $is_layer = exists( $ac_wits{$sigil} ); |
210 | my $wit = $tradition->witness( $is_layer ? $ac_wits{$sigil} : $sigil ); |
211 | # Now get rid of gaps and meta-readings like #LACUNA# |
212 | my @words = grep { $_ && $_ !~ /^\#.*\#$/ } @sequence; |
213 | $is_layer ? $wit->layertext( \@words ) : $wit->text( \@words ); |
214 | } |
215 | |
9bdf9d67 |
216 | my $nocollate = ( scalar( @witnesses ) * scalar @$alignment_table ) > 150000; |
217 | print STDERR "Tradition too big for row collation\n" if $nocollate; |
218 | |
d9e873d0 |
219 | # Now for the next rows, make nodes as necessary, assign their ranks, and |
220 | # add them to the witness paths. |
d9e873d0 |
221 | foreach my $idx ( 1 .. $#{$alignment_table} ) { |
222 | my $row = $alignment_table->[$idx]; |
9bdf9d67 |
223 | my $nodes = _make_nodes( $c, $row, $idx, $nocollate ); |
d9e873d0 |
224 | foreach my $w ( 0 .. $#{$row} ) { |
225 | # push the appropriate node onto the appropriate witness path |
226 | my $word = $row->[$w]; |
227 | if( $word ) { |
228 | my $reading = $nodes->{$word}; |
229 | my $wit = $witnesses[$w]; |
230 | push( @{$wit->path}, $reading ); |
231 | } # else skip it for empty readings. |
232 | } |
233 | } |
234 | |
eca16057 |
235 | # Collapse our lacunae into a single node and |
236 | # push the end node onto all paths. |
d9e873d0 |
237 | $c->end->rank( scalar @$alignment_table ); |
238 | foreach my $wit ( @witnesses ) { |
eca16057 |
239 | my $p = $wit->path; |
240 | my $last_rdg = shift @$p; |
241 | my $new_p = [ $last_rdg ]; |
242 | foreach my $rdg ( @$p ) { |
83d5ac3a |
243 | # Omit the reading if we are in a lacuna already. |
244 | next if $rdg->is_lacuna && $last_rdg->is_lacuna; |
245 | # Save the reading otherwise. |
246 | push( @$new_p, $rdg ); |
247 | $last_rdg = $rdg; |
eca16057 |
248 | } |
249 | push( @$new_p, $c->end ); |
250 | $wit->path( $new_p ); |
d9e873d0 |
251 | } |
252 | |
3b853983 |
253 | # Fold any a.c. witnesses into their main witness objects, and |
254 | # delete the independent a.c. versions. |
255 | foreach my $a ( keys %ac_wits ) { |
b0b4421a |
256 | my $ac_wit = $tradition->witness( $a ); |
257 | my $main_wit = $tradition->witness( $ac_wits{$a} ); |
3b853983 |
258 | next unless $main_wit; |
861c3e27 |
259 | $main_wit->is_layered(1); |
3b853983 |
260 | $main_wit->uncorrected_path( $ac_wit->path ); |
261 | $tradition->del_witness( $ac_wit ); |
262 | } |
83d5ac3a |
263 | |
d9e873d0 |
264 | # Join up the paths. |
265 | $c->make_witness_paths; |
83d5ac3a |
266 | # Delete our unused lacuna nodes. |
267 | foreach my $rdg ( grep { $_->is_lacuna } $c->readings ) { |
268 | $c->del_reading( $rdg ) unless $c->reading_witnesses( $rdg ); |
269 | } |
861c3e27 |
270 | |
271 | # Do a consistency check. |
272 | foreach my $wit ( $tradition->witnesses ) { |
273 | my $pathtext = $c->path_text( $wit->sigil ); |
274 | my $origtext = join( ' ', @{$wit->text} ); |
275 | warn "Text differs for witness " . $wit->sigil |
276 | unless $pathtext eq $origtext; |
277 | if( $wit->is_layered ) { |
278 | $pathtext = $c->path_text( $wit->sigil.$c->ac_label ); |
279 | $origtext = join( ' ', @{$wit->layertext} ); |
280 | warn "Ante-corr text differs for witness " . $wit->sigil |
281 | unless $pathtext eq $origtext; |
282 | } else { |
283 | warn "Text " . $wit->sigil . " has a layered text but is not marked as layered" |
284 | if $wit->has_layertext; |
285 | } |
286 | } |
202ccb18 |
287 | |
288 | # Note that our ranks and common readings are set. |
289 | $c->_graphcalc_done(1); |
cc31ebaa |
290 | # Remove redundant collation relationships. |
9bdf9d67 |
291 | $c->relations->filter_collations() unless $nocollate; |
d9e873d0 |
292 | } |
293 | |
3a3b8213 |
294 | sub _table_from_input { |
295 | my $opts = shift; |
296 | my $alignment_table = []; |
297 | if( $opts->{'excel'} ) { |
298 | my $sheet; |
299 | my $need_decode; |
300 | unless( exists $opts->{'file'} ) { |
301 | throw( "Must pass the filename for Excel parsing" ); |
302 | } |
303 | if( $opts->{'excel'} eq 'xls' ) { |
304 | try { |
305 | require Spreadsheet::ParseExcel; |
306 | } catch { |
307 | throw( "Need module Spreadsheet::ParseExcel to parse .xls files" ); |
308 | } |
309 | my $parser = Spreadsheet::ParseExcel->new(); |
310 | my $workbook = $parser->parse( $opts->{'file'} ); |
311 | unless( defined $workbook && defined $workbook->worksheet(0) ) { |
312 | throw( "Failed to parse file " . $opts->{'file'} . ": " . $parser->error() ); |
313 | } |
314 | $sheet = $workbook->worksheet(0); |
315 | } elsif( $opts->{'excel'} eq 'xlsx' ) { |
316 | try { |
317 | require Spreadsheet::XLSX; |
318 | } catch { |
319 | throw( "Need module Spreadsheet::XLSX to parse .xlsx files" ); |
320 | } |
321 | $need_decode = 1; |
322 | my $workbook; |
323 | try { |
324 | $workbook = Spreadsheet::XLSX->new( $opts->{'file'} ); |
325 | } catch { |
326 | throw( "Failed to parse file " . $opts->{'file'} ); |
327 | } |
328 | $sheet = $workbook->worksheet(0); |
329 | } else { |
330 | throw( "Unrecognized Excel variant" . $opts->{'excel'} ); |
331 | } |
332 | $alignment_table = _alignment_from_worksheet( $sheet, $need_decode ); |
333 | } else { |
334 | # Assume it is a comma-, tab-, or whatever-separated format. |
335 | my $csv_options = { 'binary' => 1 }; |
336 | $csv_options->{'sep_char'} = $opts->{'sep_char'} || "\t"; |
337 | if( $csv_options->{'sep_char'} eq "\t" ) { |
338 | # If it is really tab separated, nothing is an escape char. |
339 | $csv_options->{'quote_char'} = undef; |
340 | $csv_options->{'escape_char'} = undef; |
341 | } |
342 | my $csv = Text::CSV->new( $csv_options ); |
343 | |
344 | if( exists $opts->{'string' } ) { |
345 | my @lines = split( "\n", $opts->{'string'} ); |
346 | foreach my $l ( @lines ) { |
347 | my $status = $csv->parse( $l ); |
348 | if( $status ) { |
349 | push( @$alignment_table, [ $csv->fields ] ); |
350 | } else { |
351 | throw( "Could not parse line $l: " . $csv->error_input ); |
352 | } |
353 | } |
354 | } elsif( exists $opts->{'file'} ) { |
355 | open( my $fh, $opts->{'file'} ) |
356 | or warn "Could not open input file " . $opts->{'file'}; |
357 | binmode( $fh, ':utf8' ); |
358 | while( my $row = $csv->getline( $fh ) ) { |
359 | push( @$alignment_table, $row ); |
360 | } |
361 | close $fh; |
362 | } else { |
363 | throw( "Could not find string or file option to parse" ); |
364 | } |
365 | } |
366 | return $alignment_table; |
367 | } |
368 | sub _alignment_from_worksheet { |
369 | my( $sheet, $decode ) = @_; |
370 | my $alignment_table = []; |
371 | |
372 | my( $rmin, $rmax ) = $sheet->row_range(); |
373 | my( $cmin, $cmax ) = $sheet->col_range(); |
374 | unless( $cmax && $rmax ) { |
375 | throw( "Found no rows or no columns in first worksheet" ); |
376 | } |
377 | # Populate the alignment table. We only want columns that have |
378 | # a sigil in row zero. |
379 | my %sigcols = (); |
380 | push( @$alignment_table, [] ); |
381 | foreach my $col ( $cmin .. $cmax ) { |
382 | my $cell = $sheet->get_cell( $rmin, $col ); |
383 | my $cellval = $cell ? $cell->value() : undef; |
384 | if( $cellval ) { |
385 | $sigcols{$col} = 1; |
386 | push( @{$alignment_table->[0]}, $cellval ); |
387 | } |
388 | } |
389 | # Now go through the rest of the rows and pick up the columns |
390 | # that were headed by a sigil. |
391 | foreach my $row ( $rmin+1 .. $rmax ) { |
392 | my @tablerow; |
393 | foreach my $col ( $cmin .. $cmax ) { |
394 | next unless $sigcols{$col}; |
395 | my $cell = $sheet->get_cell( $row, $col ); |
396 | my $cellval; |
397 | if( $cell ) { |
398 | $cellval = $decode ? decode_utf8( $cell->value ) : $cell->value; |
399 | } |
400 | push( @tablerow, $cellval ); |
401 | } |
402 | push( @$alignment_table, \@tablerow ); |
403 | } |
404 | return $alignment_table; |
405 | } |
406 | |
027d819c |
407 | sub _make_nodes { |
9bdf9d67 |
408 | my( $collation, $row, $index, $nocollate ) = @_; |
d9e873d0 |
409 | my %unique; |
15db7774 |
410 | my $commonctr = 0; # Holds the number of unique readings + gaps, ex. lacunae. |
d9e873d0 |
411 | foreach my $w ( @$row ) { |
412 | $unique{$w} = 1 if $w; |
15db7774 |
413 | $commonctr +=1 unless ( $w && $w eq '#LACUNA#' ); |
d9e873d0 |
414 | } |
415 | my $ctr = 1; |
416 | foreach my $w ( keys %unique ) { |
a753cc84 |
417 | my $rargs = { |
10e4b1ac |
418 | 'id' => "r$index.$ctr", |
a753cc84 |
419 | 'rank' => $index, |
420 | 'text' => $w, |
421 | }; |
15db7774 |
422 | if( $w eq '#LACUNA#' ) { |
423 | $rargs->{'is_lacuna'} = 1; |
424 | } elsif( $commonctr == 1 ) { |
425 | $rargs->{'is_common'} = 1; |
426 | } |
a753cc84 |
427 | my $r = $collation->add_reading( $rargs ); |
d9e873d0 |
428 | $unique{$w} = $r; |
a753cc84 |
429 | $ctr++; |
d9e873d0 |
430 | } |
bf6e338d |
431 | # Collate this sequence of readings via a single 'collation' relationship. |
9bdf9d67 |
432 | unless( $nocollate ) { |
433 | my @rankrdgs = values %unique; |
434 | my $collation_rel; |
435 | while( @rankrdgs ) { |
436 | my $r = shift @rankrdgs; |
437 | next if $r->is_meta; |
438 | foreach my $nr ( @rankrdgs ) { |
439 | next if $nr->is_meta; |
440 | if( $collation_rel ) { |
441 | $collation->add_relationship( $r, $nr, $collation_rel ); |
442 | } else { |
443 | $collation->add_relationship( $r, $nr, |
444 | { 'type' => 'collated', |
445 | 'annotation' => "Parsed together for rank $index" } ); |
446 | $collation_rel = $collation->get_relationship( $r, $nr ); |
447 | } |
448 | } |
449 | } |
450 | } |
d9e873d0 |
451 | return \%unique; |
452 | } |
453 | |
3a3b8213 |
454 | sub throw { |
455 | Text::Tradition::Error->throw( |
456 | 'ident' => 'Parser::Tabular error', |
457 | 'message' => $_[0], |
458 | ); |
459 | } |
460 | |
3b853983 |
461 | 1; |
462 | |
463 | =head1 LICENSE |
464 | |
465 | This package is free software and is provided "as is" without express |
466 | or implied warranty. You can redistribute it and/or modify it under |
467 | the same terms as Perl itself. |
468 | |
469 | =head1 AUTHOR |
470 | |
471 | Tara L Andrews E<lt>aurum@cpan.orgE<gt> |