Commit | Line | Data |
d9e873d0 |
1 | package Text::Tradition::Parser::Tabular; |
2 | |
3 | use strict; |
4 | use warnings; |
3a3b8213 |
5 | use Encode qw/ decode_utf8 /; |
82fa4d57 |
6 | use Text::CSV; |
701ad2ba |
7 | use Text::Tradition::Error; |
8 | use TryCatch; |
d9e873d0 |
9 | |
10 | =head1 NAME |
11 | |
12 | Text::Tradition::Parser::Tabular |
13 | |
3b853983 |
14 | =head1 SYNOPSIS |
15 | |
16 | use Text::Tradition; |
17 | |
18 | my $t_from_file = Text::Tradition->new( |
19 | 'name' => 'my text', |
20 | 'input' => 'Tabular', |
21 | 'file' => '/path/to/collation.csv', |
22 | 'sep_char' => ',' |
23 | ); |
24 | |
25 | my $t_from_string = Text::Tradition->new( |
26 | 'name' => 'my text', |
27 | 'input' => 'Tabular', |
28 | 'string' => $tab_separated_collation, |
29 | 'sep_char' => "\t", |
30 | ); |
31 | |
d9e873d0 |
32 | =head1 DESCRIPTION |
33 | |
a445ce40 |
34 | Parser module for Text::Tradition to read an alignment table format, such as |
35 | CSV or Excel. |
d9e873d0 |
36 | |
37 | =head1 METHODS |
38 | |
e867486f |
39 | =head2 B<parse>( $tradition, $option_hash ) |
3b853983 |
40 | |
41 | Takes an initialized tradition and a set of options; creates the |
42 | appropriate nodes and edges on the graph, as well as the appropriate |
a445ce40 |
43 | witness objects. The $option_hash can contain the following: |
44 | |
45 | =over |
46 | |
47 | =item * file - Name of file which contains the data |
48 | |
49 | =item * string - A string that itself contains the data. One of 'file' or |
50 | 'string' is required. |
51 | |
52 | =item * sep_char - For plaintext formats, the field separation character. |
53 | Defaults to "\t" (tab-separated); should be "," for comma-separated format. |
54 | |
55 | =item * excel - If the data is in an Excel file, this option should be set |
56 | to 'xls' (for pre-2007 Excel format) or 'xlsx' (for Excel XML format.) |
57 | |
58 | =back |
59 | |
60 | The data should comprise a table with witnesses arranged in columns, with |
61 | the witness sigla in the first row. Empty cells are interpreted as |
62 | omissions (and thus stemmatologically relevant.) Longer lacunae in the |
63 | text, to be disregarded in cladistic analysis, may be represented by |
64 | filling the appropriate cells with the tag '#LACUNA#'. |
3b853983 |
65 | |
66 | If a witness name ends in the collation's ac_label, it will be treated as |
67 | an 'ante-correction' version of the 'main' witness whose sigil it shares. |
68 | |
69 | =begin testing |
70 | |
fa6bc75d |
71 | use Test::More::UTF8; |
3b853983 |
72 | use Text::Tradition; |
73 | binmode STDOUT, ":utf8"; |
74 | binmode STDERR, ":utf8"; |
75 | eval { no warnings; binmode $DB::OUT, ":utf8"; }; |
76 | |
77 | my $csv = 't/data/florilegium.csv'; |
78 | my $t = Text::Tradition->new( |
79 | 'name' => 'inline', |
80 | 'input' => 'Tabular', |
81 | 'file' => $csv, |
82 | 'sep_char' => ',', |
83 | ); |
d9e873d0 |
84 | |
3b853983 |
85 | is( ref( $t ), 'Text::Tradition', "Parsed florilegium CSV file" ); |
d9e873d0 |
86 | |
3b853983 |
87 | ### TODO Check these figures |
88 | if( $t ) { |
0e47f4f6 |
89 | is( scalar $t->collation->readings, 311, "Collation has all readings" ); |
90 | is( scalar $t->collation->paths, 361, "Collation has all paths" ); |
3b853983 |
91 | is( scalar $t->witnesses, 13, "Collation has all witnesses" ); |
92 | } |
93 | |
b0b4421a |
94 | # Check that we have the right witnesses |
95 | my %seen_wits; |
96 | map { $seen_wits{$_} = 0 } qw/ A B C D E F G H K P Q S T /; |
97 | foreach my $wit ( $t->witnesses ) { |
98 | $seen_wits{$wit->sigil} = 1; |
99 | } |
100 | is( scalar keys %seen_wits, 13, "No extra witnesses were made" ); |
101 | foreach my $k ( keys %seen_wits ) { |
102 | ok( $seen_wits{$k}, "Witness $k still exists" ); |
103 | } |
104 | |
105 | # Check that the witnesses have the right texts |
106 | foreach my $wit ( $t->witnesses ) { |
107 | my $origtext = join( ' ', @{$wit->text} ); |
108 | my $graphtext = $t->collation->path_text( $wit->sigil ); |
109 | is( $graphtext, $origtext, "Collation matches original for witness " . $wit->sigil ); |
110 | } |
111 | |
112 | # Check that the a.c. witnesses have the right text |
113 | map { $seen_wits{$_} = 0 } qw/ A B C D F G H K S /; |
114 | foreach my $k ( keys %seen_wits ) { |
115 | my $wit = $t->witness( $k ); |
116 | if( $seen_wits{$k} ) { |
117 | ok( $wit->is_layered, "Witness $k got marked as layered" ); |
118 | ok( $wit->has_layertext, "Witness $k has an a.c. version" ); |
119 | my $origtext = join( ' ', @{$wit->layertext} ); |
120 | my $acsig = $wit->sigil . $t->collation->ac_label; |
861c3e27 |
121 | my $graphtext = $t->collation->path_text( $acsig ); |
b0b4421a |
122 | is( $graphtext, $origtext, "Collation matches original a.c. for witness $k" ); |
123 | } else { |
124 | ok( !$wit->is_layered, "Witness $k not marked as layered" ); |
125 | ok( !$wit->has_layertext, "Witness $k has no a.c. version" ); |
126 | } |
127 | } |
128 | |
cc31ebaa |
129 | # Check that we only have collation relationships where we need them |
130 | is( scalar $t->collation->relationships, 3, "Redundant collations were removed" ); |
cc31ebaa |
131 | |
701ad2ba |
132 | ## Check excel parsing |
133 | |
134 | my $xls = 't/data/armexample.xls'; |
135 | my $xt = Text::Tradition->new( |
136 | 'name' => 'excel test', |
137 | 'input' => 'Tabular', |
138 | 'file' => $xls, |
3a3b8213 |
139 | 'excel' => 'xls' |
701ad2ba |
140 | ); |
141 | |
142 | is( ref( $xt ), 'Text::Tradition', "Parsed test Excel 97-2004 file" ); |
143 | my %xls_wits; |
144 | map { $xls_wits{$_} = 0 } qw/ Wit1 Wit2 Wit3 /; |
145 | foreach my $wit ( $xt->witnesses ) { |
146 | $xls_wits{$wit->sigil} = 1; |
147 | } |
148 | is( scalar keys %xls_wits, 3, "No extra witnesses were made" ); |
149 | foreach my $k ( keys %xls_wits ) { |
150 | ok( $xls_wits{$k}, "Witness $k still exists" ); |
151 | } |
152 | is( scalar $xt->collation->readings, 11, "Got correct number of test readings" ); |
153 | is( scalar $xt->collation->paths, 13, "Got correct number of reading paths" ); |
154 | is( $xt->collation->reading('r5.1')->text, "\x{587}", |
155 | "Correct decoding of at least one reading" ); |
156 | |
3a3b8213 |
157 | my $xlsx = 't/data/armexample.xlsx'; |
158 | my $xtx = Text::Tradition->new( |
159 | 'name' => 'excel test', |
160 | 'input' => 'Tabular', |
161 | 'file' => $xlsx, |
162 | 'excel' => 'xlsx' |
163 | ); |
164 | |
165 | is( ref( $xtx ), 'Text::Tradition', "Parsed test Excel 2007+ file" ); |
166 | my %xlsx_wits; |
fa6bc75d |
167 | map { $xlsx_wits{$_} = 0 } qw/ Wit1 Wit3 /; |
168 | $xlsx_wits{"\x{531}\x{562}2"} = 0; |
3a3b8213 |
169 | foreach my $wit ( $xtx->witnesses ) { |
170 | $xlsx_wits{$wit->sigil} = 1; |
171 | } |
172 | is( scalar keys %xlsx_wits, 3, "No extra witnesses were made" ); |
173 | foreach my $k ( keys %xlsx_wits ) { |
174 | ok( $xlsx_wits{$k}, "Witness $k still exists" ); |
175 | } |
176 | is( scalar $xtx->collation->readings, 12, "Got correct number of test readings" ); |
177 | is( scalar $xtx->collation->paths, 14, "Got correct number of reading paths" ); |
178 | is( $xtx->collation->reading('r5.1')->text, "\x{587}", |
179 | "Correct decoding of at least one reading" ); |
180 | |
3b853983 |
181 | =end testing |
d9e873d0 |
182 | |
183 | =cut |
184 | |
185 | sub parse { |
dfc37e38 |
186 | my( $tradition, $opts ) = @_; |
3a3b8213 |
187 | my $alignment_table = _table_from_input( $opts ); |
d9e873d0 |
188 | # Set up the witnesses we find in the first line |
189 | my @witnesses; |
b0b4421a |
190 | my %ac_wits; # Track layered witness -> main witness mapping |
3a3b8213 |
191 | my $c = $tradition->collation; # shorthand |
82fa4d57 |
192 | my $aclabel = $c->ac_label; |
d9e873d0 |
193 | foreach my $sigil ( @{$alignment_table->[0]} ) { |
3b853983 |
194 | if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) { |
82fa4d57 |
195 | # Sanitize the sigil name to an XML name |
196 | $sigil = $1 . '_layered'; |
b0b4421a |
197 | $ac_wits{$sigil} = $1; |
3b853983 |
198 | } |
82fa4d57 |
199 | my $wit = $tradition->add_witness( |
200 | 'sigil' => $sigil, 'sourcetype' => 'collation' ); |
201 | $wit->path( [ $c->start ] ); |
202 | push( @witnesses, $wit ); |
203 | my $aclabel = $c->ac_label; |
d9e873d0 |
204 | } |
205 | |
b0b4421a |
206 | # Save the original witness text sequences. Have to loop back through |
207 | # the witness columns after we have identified all the a.c. witnesses. |
208 | foreach my $idx ( 0 .. $#{$alignment_table->[0]} ) { |
209 | my @sequence = map { $_->[$idx] } @{$alignment_table}; |
210 | my $sigil = shift @sequence; |
211 | my $is_layer = exists( $ac_wits{$sigil} ); |
212 | my $wit = $tradition->witness( $is_layer ? $ac_wits{$sigil} : $sigil ); |
213 | # Now get rid of gaps and meta-readings like #LACUNA# |
214 | my @words = grep { $_ && $_ !~ /^\#.*\#$/ } @sequence; |
215 | $is_layer ? $wit->layertext( \@words ) : $wit->text( \@words ); |
216 | } |
217 | |
9bdf9d67 |
218 | my $nocollate = ( scalar( @witnesses ) * scalar @$alignment_table ) > 150000; |
219 | print STDERR "Tradition too big for row collation\n" if $nocollate; |
220 | |
d9e873d0 |
221 | # Now for the next rows, make nodes as necessary, assign their ranks, and |
222 | # add them to the witness paths. |
d9e873d0 |
223 | foreach my $idx ( 1 .. $#{$alignment_table} ) { |
224 | my $row = $alignment_table->[$idx]; |
9bdf9d67 |
225 | my $nodes = _make_nodes( $c, $row, $idx, $nocollate ); |
d9e873d0 |
226 | foreach my $w ( 0 .. $#{$row} ) { |
227 | # push the appropriate node onto the appropriate witness path |
228 | my $word = $row->[$w]; |
229 | if( $word ) { |
230 | my $reading = $nodes->{$word}; |
231 | my $wit = $witnesses[$w]; |
232 | push( @{$wit->path}, $reading ); |
233 | } # else skip it for empty readings. |
234 | } |
235 | } |
236 | |
eca16057 |
237 | # Collapse our lacunae into a single node and |
238 | # push the end node onto all paths. |
d9e873d0 |
239 | $c->end->rank( scalar @$alignment_table ); |
240 | foreach my $wit ( @witnesses ) { |
eca16057 |
241 | my $p = $wit->path; |
242 | my $last_rdg = shift @$p; |
243 | my $new_p = [ $last_rdg ]; |
244 | foreach my $rdg ( @$p ) { |
83d5ac3a |
245 | # Omit the reading if we are in a lacuna already. |
246 | next if $rdg->is_lacuna && $last_rdg->is_lacuna; |
247 | # Save the reading otherwise. |
248 | push( @$new_p, $rdg ); |
249 | $last_rdg = $rdg; |
eca16057 |
250 | } |
251 | push( @$new_p, $c->end ); |
252 | $wit->path( $new_p ); |
d9e873d0 |
253 | } |
254 | |
3b853983 |
255 | # Fold any a.c. witnesses into their main witness objects, and |
256 | # delete the independent a.c. versions. |
257 | foreach my $a ( keys %ac_wits ) { |
b0b4421a |
258 | my $ac_wit = $tradition->witness( $a ); |
259 | my $main_wit = $tradition->witness( $ac_wits{$a} ); |
3b853983 |
260 | next unless $main_wit; |
861c3e27 |
261 | $main_wit->is_layered(1); |
3b853983 |
262 | $main_wit->uncorrected_path( $ac_wit->path ); |
263 | $tradition->del_witness( $ac_wit ); |
264 | } |
83d5ac3a |
265 | |
d9e873d0 |
266 | # Join up the paths. |
267 | $c->make_witness_paths; |
83d5ac3a |
268 | # Delete our unused lacuna nodes. |
269 | foreach my $rdg ( grep { $_->is_lacuna } $c->readings ) { |
270 | $c->del_reading( $rdg ) unless $c->reading_witnesses( $rdg ); |
271 | } |
861c3e27 |
272 | |
273 | # Do a consistency check. |
274 | foreach my $wit ( $tradition->witnesses ) { |
275 | my $pathtext = $c->path_text( $wit->sigil ); |
276 | my $origtext = join( ' ', @{$wit->text} ); |
277 | warn "Text differs for witness " . $wit->sigil |
278 | unless $pathtext eq $origtext; |
279 | if( $wit->is_layered ) { |
280 | $pathtext = $c->path_text( $wit->sigil.$c->ac_label ); |
281 | $origtext = join( ' ', @{$wit->layertext} ); |
282 | warn "Ante-corr text differs for witness " . $wit->sigil |
283 | unless $pathtext eq $origtext; |
284 | } else { |
285 | warn "Text " . $wit->sigil . " has a layered text but is not marked as layered" |
286 | if $wit->has_layertext; |
287 | } |
288 | } |
202ccb18 |
289 | |
290 | # Note that our ranks and common readings are set. |
291 | $c->_graphcalc_done(1); |
98a66507 |
292 | _add_collations( $c ) unless $nocollate; |
d9e873d0 |
293 | } |
294 | |
3a3b8213 |
295 | sub _table_from_input { |
296 | my $opts = shift; |
297 | my $alignment_table = []; |
298 | if( $opts->{'excel'} ) { |
299 | my $sheet; |
300 | my $need_decode; |
301 | unless( exists $opts->{'file'} ) { |
302 | throw( "Must pass the filename for Excel parsing" ); |
303 | } |
304 | if( $opts->{'excel'} eq 'xls' ) { |
305 | try { |
306 | require Spreadsheet::ParseExcel; |
307 | } catch { |
308 | throw( "Need module Spreadsheet::ParseExcel to parse .xls files" ); |
309 | } |
310 | my $parser = Spreadsheet::ParseExcel->new(); |
311 | my $workbook = $parser->parse( $opts->{'file'} ); |
312 | unless( defined $workbook && defined $workbook->worksheet(0) ) { |
313 | throw( "Failed to parse file " . $opts->{'file'} . ": " . $parser->error() ); |
314 | } |
315 | $sheet = $workbook->worksheet(0); |
316 | } elsif( $opts->{'excel'} eq 'xlsx' ) { |
317 | try { |
318 | require Spreadsheet::XLSX; |
319 | } catch { |
320 | throw( "Need module Spreadsheet::XLSX to parse .xlsx files" ); |
321 | } |
322 | $need_decode = 1; |
323 | my $workbook; |
324 | try { |
325 | $workbook = Spreadsheet::XLSX->new( $opts->{'file'} ); |
326 | } catch { |
327 | throw( "Failed to parse file " . $opts->{'file'} ); |
328 | } |
329 | $sheet = $workbook->worksheet(0); |
330 | } else { |
331 | throw( "Unrecognized Excel variant" . $opts->{'excel'} ); |
332 | } |
333 | $alignment_table = _alignment_from_worksheet( $sheet, $need_decode ); |
334 | } else { |
335 | # Assume it is a comma-, tab-, or whatever-separated format. |
336 | my $csv_options = { 'binary' => 1 }; |
337 | $csv_options->{'sep_char'} = $opts->{'sep_char'} || "\t"; |
338 | if( $csv_options->{'sep_char'} eq "\t" ) { |
339 | # If it is really tab separated, nothing is an escape char. |
340 | $csv_options->{'quote_char'} = undef; |
341 | $csv_options->{'escape_char'} = undef; |
342 | } |
343 | my $csv = Text::CSV->new( $csv_options ); |
344 | |
345 | if( exists $opts->{'string' } ) { |
346 | my @lines = split( "\n", $opts->{'string'} ); |
347 | foreach my $l ( @lines ) { |
348 | my $status = $csv->parse( $l ); |
349 | if( $status ) { |
350 | push( @$alignment_table, [ $csv->fields ] ); |
351 | } else { |
352 | throw( "Could not parse line $l: " . $csv->error_input ); |
353 | } |
354 | } |
355 | } elsif( exists $opts->{'file'} ) { |
356 | open( my $fh, $opts->{'file'} ) |
357 | or warn "Could not open input file " . $opts->{'file'}; |
358 | binmode( $fh, ':utf8' ); |
359 | while( my $row = $csv->getline( $fh ) ) { |
360 | push( @$alignment_table, $row ); |
361 | } |
362 | close $fh; |
363 | } else { |
364 | throw( "Could not find string or file option to parse" ); |
365 | } |
366 | } |
367 | return $alignment_table; |
368 | } |
369 | sub _alignment_from_worksheet { |
370 | my( $sheet, $decode ) = @_; |
371 | my $alignment_table = []; |
372 | |
373 | my( $rmin, $rmax ) = $sheet->row_range(); |
374 | my( $cmin, $cmax ) = $sheet->col_range(); |
375 | unless( $cmax && $rmax ) { |
376 | throw( "Found no rows or no columns in first worksheet" ); |
377 | } |
378 | # Populate the alignment table. We only want columns that have |
379 | # a sigil in row zero. |
380 | my %sigcols = (); |
381 | push( @$alignment_table, [] ); |
382 | foreach my $col ( $cmin .. $cmax ) { |
383 | my $cell = $sheet->get_cell( $rmin, $col ); |
fa6bc75d |
384 | my $cellval; |
385 | if( $cell ) { |
386 | $cellval = $decode ? decode_utf8( $cell->value ) : $cell->value; |
387 | } |
3a3b8213 |
388 | if( $cellval ) { |
389 | $sigcols{$col} = 1; |
390 | push( @{$alignment_table->[0]}, $cellval ); |
391 | } |
392 | } |
393 | # Now go through the rest of the rows and pick up the columns |
394 | # that were headed by a sigil. |
395 | foreach my $row ( $rmin+1 .. $rmax ) { |
396 | my @tablerow; |
397 | foreach my $col ( $cmin .. $cmax ) { |
398 | next unless $sigcols{$col}; |
399 | my $cell = $sheet->get_cell( $row, $col ); |
400 | my $cellval; |
401 | if( $cell ) { |
402 | $cellval = $decode ? decode_utf8( $cell->value ) : $cell->value; |
403 | } |
404 | push( @tablerow, $cellval ); |
405 | } |
406 | push( @$alignment_table, \@tablerow ); |
407 | } |
408 | return $alignment_table; |
409 | } |
410 | |
027d819c |
411 | sub _make_nodes { |
9bdf9d67 |
412 | my( $collation, $row, $index, $nocollate ) = @_; |
d9e873d0 |
413 | my %unique; |
15db7774 |
414 | my $commonctr = 0; # Holds the number of unique readings + gaps, ex. lacunae. |
d9e873d0 |
415 | foreach my $w ( @$row ) { |
416 | $unique{$w} = 1 if $w; |
15db7774 |
417 | $commonctr +=1 unless ( $w && $w eq '#LACUNA#' ); |
d9e873d0 |
418 | } |
419 | my $ctr = 1; |
420 | foreach my $w ( keys %unique ) { |
a753cc84 |
421 | my $rargs = { |
10e4b1ac |
422 | 'id' => "r$index.$ctr", |
a753cc84 |
423 | 'rank' => $index, |
424 | 'text' => $w, |
425 | }; |
15db7774 |
426 | if( $w eq '#LACUNA#' ) { |
427 | $rargs->{'is_lacuna'} = 1; |
428 | } elsif( $commonctr == 1 ) { |
429 | $rargs->{'is_common'} = 1; |
430 | } |
a753cc84 |
431 | my $r = $collation->add_reading( $rargs ); |
d9e873d0 |
432 | $unique{$w} = $r; |
a753cc84 |
433 | $ctr++; |
d9e873d0 |
434 | } |
98a66507 |
435 | return \%unique; |
436 | } |
437 | |
438 | sub _add_collations { |
439 | my( $collation ) = shift; |
440 | # For each reading that needs to be held in place, add a 'collated' |
441 | # relationship to whatever anchor we can find. An anchor is a reading |
442 | # that would occupy its rank by virtue of being subsequent to a |
443 | # reading at $rank-1. |
444 | my @collate_pairs; |
445 | foreach my $r ( 1 .. $collation->end->rank - 1 ) { |
98a66507 |
446 | my $anchor; |
447 | my @need_weak; |
448 | my @here = grep { !$_->is_meta } $collation->readings_at_rank( $r ); |
449 | next unless @here > 1; |
450 | foreach my $rdg ( @here ) { |
451 | my $ip = 0; |
452 | foreach my $pred ( $rdg->predecessors ) { |
453 | if( $pred->rank == $r - 1 ) { |
454 | $ip = 1; |
455 | $anchor = $rdg unless( $anchor ); |
456 | last; |
9bdf9d67 |
457 | } |
458 | } |
98a66507 |
459 | push( @need_weak, $rdg ) unless $ip; |
9bdf9d67 |
460 | } |
98a66507 |
461 | $anchor |
462 | ? map { push( @collate_pairs, [ $r, $anchor, $_ ] ) } @need_weak |
463 | : print STDERR "No anchor found at $r\n"; |
464 | } |
465 | foreach my $p ( @collate_pairs ) { |
466 | my $r = shift @$p; |
467 | $collation->add_relationship( @$p, |
468 | { 'type' => 'collated', |
469 | 'annotation' => "Collated together for rank $r" } ) |
470 | unless $collation->get_relationship( @$p ) |
471 | } |
d9e873d0 |
472 | } |
473 | |
3a3b8213 |
474 | sub throw { |
475 | Text::Tradition::Error->throw( |
476 | 'ident' => 'Parser::Tabular error', |
477 | 'message' => $_[0], |
478 | ); |
479 | } |
480 | |
3b853983 |
481 | 1; |
482 | |
483 | =head1 LICENSE |
484 | |
485 | This package is free software and is provided "as is" without express |
486 | or implied warranty. You can redistribute it and/or modify it under |
487 | the same terms as Perl itself. |
488 | |
489 | =head1 AUTHOR |
490 | |
491 | Tara L Andrews E<lt>aurum@cpan.orgE<gt> |