load extensions statically to avoid bad object wrapping interactions
[scpubgit/stemmatology.git] / base / lib / Text / Tradition / Parser / Tabular.pm
CommitLineData
d9e873d0 1package Text::Tradition::Parser::Tabular;
2
3use strict;
4use warnings;
3a3b8213 5use Encode qw/ decode_utf8 /;
82fa4d57 6use Text::CSV;
701ad2ba 7use Text::Tradition::Error;
8use TryCatch;
d9e873d0 9
10=head1 NAME
11
12Text::Tradition::Parser::Tabular
13
3b853983 14=head1 SYNOPSIS
15
16 use Text::Tradition;
17
18 my $t_from_file = Text::Tradition->new(
19 'name' => 'my text',
20 'input' => 'Tabular',
21 'file' => '/path/to/collation.csv',
22 'sep_char' => ','
23 );
24
25 my $t_from_string = Text::Tradition->new(
26 'name' => 'my text',
27 'input' => 'Tabular',
28 'string' => $tab_separated_collation,
29 'sep_char' => "\t",
30 );
31
d9e873d0 32=head1 DESCRIPTION
33
a445ce40 34Parser module for Text::Tradition to read an alignment table format, such as
35CSV or Excel.
d9e873d0 36
37=head1 METHODS
38
e867486f 39=head2 B<parse>( $tradition, $option_hash )
3b853983 40
41Takes an initialized tradition and a set of options; creates the
42appropriate nodes and edges on the graph, as well as the appropriate
a445ce40 43witness objects. The $option_hash can contain the following:
44
45=over
46
47=item * file - Name of file which contains the data
48
49=item * string - A string that itself contains the data. One of 'file' or
50'string' is required.
51
52=item * sep_char - For plaintext formats, the field separation character.
53Defaults to "\t" (tab-separated); should be "," for comma-separated format.
54
55=item * excel - If the data is in an Excel file, this option should be set
56to 'xls' (for pre-2007 Excel format) or 'xlsx' (for Excel XML format.)
57
58=back
59
60The data should comprise a table with witnesses arranged in columns, with
61the witness sigla in the first row. Empty cells are interpreted as
62omissions (and thus stemmatologically relevant.) Longer lacunae in the
63text, to be disregarded in cladistic analysis, may be represented by
64filling the appropriate cells with the tag '#LACUNA#'.
3b853983 65
66If a witness name ends in the collation's ac_label, it will be treated as
67an 'ante-correction' version of the 'main' witness whose sigil it shares.
68
69=begin testing
70
71use Text::Tradition;
72binmode STDOUT, ":utf8";
73binmode STDERR, ":utf8";
74eval { no warnings; binmode $DB::OUT, ":utf8"; };
75
76my $csv = 't/data/florilegium.csv';
77my $t = Text::Tradition->new(
78 'name' => 'inline',
79 'input' => 'Tabular',
80 'file' => $csv,
81 'sep_char' => ',',
82 );
d9e873d0 83
3b853983 84is( ref( $t ), 'Text::Tradition', "Parsed florilegium CSV file" );
d9e873d0 85
3b853983 86### TODO Check these figures
87if( $t ) {
0e47f4f6 88 is( scalar $t->collation->readings, 311, "Collation has all readings" );
89 is( scalar $t->collation->paths, 361, "Collation has all paths" );
3b853983 90 is( scalar $t->witnesses, 13, "Collation has all witnesses" );
91}
92
b0b4421a 93# Check that we have the right witnesses
94my %seen_wits;
95map { $seen_wits{$_} = 0 } qw/ A B C D E F G H K P Q S T /;
96foreach my $wit ( $t->witnesses ) {
97 $seen_wits{$wit->sigil} = 1;
98}
99is( scalar keys %seen_wits, 13, "No extra witnesses were made" );
100foreach my $k ( keys %seen_wits ) {
101 ok( $seen_wits{$k}, "Witness $k still exists" );
102}
103
104# Check that the witnesses have the right texts
105foreach my $wit ( $t->witnesses ) {
106 my $origtext = join( ' ', @{$wit->text} );
107 my $graphtext = $t->collation->path_text( $wit->sigil );
108 is( $graphtext, $origtext, "Collation matches original for witness " . $wit->sigil );
109}
110
111# Check that the a.c. witnesses have the right text
112map { $seen_wits{$_} = 0 } qw/ A B C D F G H K S /;
113foreach my $k ( keys %seen_wits ) {
114 my $wit = $t->witness( $k );
115 if( $seen_wits{$k} ) {
116 ok( $wit->is_layered, "Witness $k got marked as layered" );
117 ok( $wit->has_layertext, "Witness $k has an a.c. version" );
118 my $origtext = join( ' ', @{$wit->layertext} );
119 my $acsig = $wit->sigil . $t->collation->ac_label;
861c3e27 120 my $graphtext = $t->collation->path_text( $acsig );
b0b4421a 121 is( $graphtext, $origtext, "Collation matches original a.c. for witness $k" );
122 } else {
123 ok( !$wit->is_layered, "Witness $k not marked as layered" );
124 ok( !$wit->has_layertext, "Witness $k has no a.c. version" );
125 }
126}
127
cc31ebaa 128# Check that we only have collation relationships where we need them
129is( scalar $t->collation->relationships, 3, "Redundant collations were removed" );
cc31ebaa 130
701ad2ba 131## Check excel parsing
132
133my $xls = 't/data/armexample.xls';
134my $xt = Text::Tradition->new(
135 'name' => 'excel test',
136 'input' => 'Tabular',
137 'file' => $xls,
3a3b8213 138 'excel' => 'xls'
701ad2ba 139 );
140
141is( ref( $xt ), 'Text::Tradition', "Parsed test Excel 97-2004 file" );
142my %xls_wits;
143map { $xls_wits{$_} = 0 } qw/ Wit1 Wit2 Wit3 /;
144foreach my $wit ( $xt->witnesses ) {
145 $xls_wits{$wit->sigil} = 1;
146}
147is( scalar keys %xls_wits, 3, "No extra witnesses were made" );
148foreach my $k ( keys %xls_wits ) {
149 ok( $xls_wits{$k}, "Witness $k still exists" );
150}
151is( scalar $xt->collation->readings, 11, "Got correct number of test readings" );
152is( scalar $xt->collation->paths, 13, "Got correct number of reading paths" );
153is( $xt->collation->reading('r5.1')->text, "\x{587}",
154 "Correct decoding of at least one reading" );
155
3a3b8213 156my $xlsx = 't/data/armexample.xlsx';
157my $xtx = Text::Tradition->new(
158 'name' => 'excel test',
159 'input' => 'Tabular',
160 'file' => $xlsx,
161 'excel' => 'xlsx'
162 );
163
164is( ref( $xtx ), 'Text::Tradition', "Parsed test Excel 2007+ file" );
165my %xlsx_wits;
166map { $xlsx_wits{$_} = 0 } qw/ Wit1 Wit2 Wit3 /;
167foreach my $wit ( $xtx->witnesses ) {
168 $xlsx_wits{$wit->sigil} = 1;
169}
170is( scalar keys %xlsx_wits, 3, "No extra witnesses were made" );
171foreach my $k ( keys %xlsx_wits ) {
172 ok( $xlsx_wits{$k}, "Witness $k still exists" );
173}
174is( scalar $xtx->collation->readings, 12, "Got correct number of test readings" );
175is( scalar $xtx->collation->paths, 14, "Got correct number of reading paths" );
176is( $xtx->collation->reading('r5.1')->text, "\x{587}",
177 "Correct decoding of at least one reading" );
178
3b853983 179=end testing
d9e873d0 180
181=cut
182
183sub parse {
dfc37e38 184 my( $tradition, $opts ) = @_;
3a3b8213 185 my $alignment_table = _table_from_input( $opts );
d9e873d0 186 # Set up the witnesses we find in the first line
187 my @witnesses;
b0b4421a 188 my %ac_wits; # Track layered witness -> main witness mapping
3a3b8213 189 my $c = $tradition->collation; # shorthand
82fa4d57 190 my $aclabel = $c->ac_label;
d9e873d0 191 foreach my $sigil ( @{$alignment_table->[0]} ) {
3b853983 192 if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) {
82fa4d57 193 # Sanitize the sigil name to an XML name
194 $sigil = $1 . '_layered';
b0b4421a 195 $ac_wits{$sigil} = $1;
3b853983 196 }
82fa4d57 197 my $wit = $tradition->add_witness(
198 'sigil' => $sigil, 'sourcetype' => 'collation' );
199 $wit->path( [ $c->start ] );
200 push( @witnesses, $wit );
201 my $aclabel = $c->ac_label;
d9e873d0 202 }
203
b0b4421a 204 # Save the original witness text sequences. Have to loop back through
205 # the witness columns after we have identified all the a.c. witnesses.
206 foreach my $idx ( 0 .. $#{$alignment_table->[0]} ) {
207 my @sequence = map { $_->[$idx] } @{$alignment_table};
208 my $sigil = shift @sequence;
209 my $is_layer = exists( $ac_wits{$sigil} );
210 my $wit = $tradition->witness( $is_layer ? $ac_wits{$sigil} : $sigil );
211 # Now get rid of gaps and meta-readings like #LACUNA#
212 my @words = grep { $_ && $_ !~ /^\#.*\#$/ } @sequence;
213 $is_layer ? $wit->layertext( \@words ) : $wit->text( \@words );
214 }
215
9bdf9d67 216 my $nocollate = ( scalar( @witnesses ) * scalar @$alignment_table ) > 150000;
217 print STDERR "Tradition too big for row collation\n" if $nocollate;
218
d9e873d0 219 # Now for the next rows, make nodes as necessary, assign their ranks, and
220 # add them to the witness paths.
d9e873d0 221 foreach my $idx ( 1 .. $#{$alignment_table} ) {
222 my $row = $alignment_table->[$idx];
9bdf9d67 223 my $nodes = _make_nodes( $c, $row, $idx, $nocollate );
d9e873d0 224 foreach my $w ( 0 .. $#{$row} ) {
225 # push the appropriate node onto the appropriate witness path
226 my $word = $row->[$w];
227 if( $word ) {
228 my $reading = $nodes->{$word};
229 my $wit = $witnesses[$w];
230 push( @{$wit->path}, $reading );
231 } # else skip it for empty readings.
232 }
233 }
234
eca16057 235 # Collapse our lacunae into a single node and
236 # push the end node onto all paths.
d9e873d0 237 $c->end->rank( scalar @$alignment_table );
238 foreach my $wit ( @witnesses ) {
eca16057 239 my $p = $wit->path;
240 my $last_rdg = shift @$p;
241 my $new_p = [ $last_rdg ];
242 foreach my $rdg ( @$p ) {
83d5ac3a 243 # Omit the reading if we are in a lacuna already.
244 next if $rdg->is_lacuna && $last_rdg->is_lacuna;
245 # Save the reading otherwise.
246 push( @$new_p, $rdg );
247 $last_rdg = $rdg;
eca16057 248 }
249 push( @$new_p, $c->end );
250 $wit->path( $new_p );
d9e873d0 251 }
252
3b853983 253 # Fold any a.c. witnesses into their main witness objects, and
254 # delete the independent a.c. versions.
255 foreach my $a ( keys %ac_wits ) {
b0b4421a 256 my $ac_wit = $tradition->witness( $a );
257 my $main_wit = $tradition->witness( $ac_wits{$a} );
3b853983 258 next unless $main_wit;
861c3e27 259 $main_wit->is_layered(1);
3b853983 260 $main_wit->uncorrected_path( $ac_wit->path );
261 $tradition->del_witness( $ac_wit );
262 }
83d5ac3a 263
d9e873d0 264 # Join up the paths.
265 $c->make_witness_paths;
83d5ac3a 266 # Delete our unused lacuna nodes.
267 foreach my $rdg ( grep { $_->is_lacuna } $c->readings ) {
268 $c->del_reading( $rdg ) unless $c->reading_witnesses( $rdg );
269 }
861c3e27 270
271 # Do a consistency check.
272 foreach my $wit ( $tradition->witnesses ) {
273 my $pathtext = $c->path_text( $wit->sigil );
274 my $origtext = join( ' ', @{$wit->text} );
275 warn "Text differs for witness " . $wit->sigil
276 unless $pathtext eq $origtext;
277 if( $wit->is_layered ) {
278 $pathtext = $c->path_text( $wit->sigil.$c->ac_label );
279 $origtext = join( ' ', @{$wit->layertext} );
280 warn "Ante-corr text differs for witness " . $wit->sigil
281 unless $pathtext eq $origtext;
282 } else {
283 warn "Text " . $wit->sigil . " has a layered text but is not marked as layered"
284 if $wit->has_layertext;
285 }
286 }
202ccb18 287
288 # Note that our ranks and common readings are set.
289 $c->_graphcalc_done(1);
cc31ebaa 290 # Remove redundant collation relationships.
9bdf9d67 291 $c->relations->filter_collations() unless $nocollate;
d9e873d0 292}
293
3a3b8213 294sub _table_from_input {
295 my $opts = shift;
296 my $alignment_table = [];
297 if( $opts->{'excel'} ) {
298 my $sheet;
299 my $need_decode;
300 unless( exists $opts->{'file'} ) {
301 throw( "Must pass the filename for Excel parsing" );
302 }
303 if( $opts->{'excel'} eq 'xls' ) {
304 try {
305 require Spreadsheet::ParseExcel;
306 } catch {
307 throw( "Need module Spreadsheet::ParseExcel to parse .xls files" );
308 }
309 my $parser = Spreadsheet::ParseExcel->new();
310 my $workbook = $parser->parse( $opts->{'file'} );
311 unless( defined $workbook && defined $workbook->worksheet(0) ) {
312 throw( "Failed to parse file " . $opts->{'file'} . ": " . $parser->error() );
313 }
314 $sheet = $workbook->worksheet(0);
315 } elsif( $opts->{'excel'} eq 'xlsx' ) {
316 try {
317 require Spreadsheet::XLSX;
318 } catch {
319 throw( "Need module Spreadsheet::XLSX to parse .xlsx files" );
320 }
321 $need_decode = 1;
322 my $workbook;
323 try {
324 $workbook = Spreadsheet::XLSX->new( $opts->{'file'} );
325 } catch {
326 throw( "Failed to parse file " . $opts->{'file'} );
327 }
328 $sheet = $workbook->worksheet(0);
329 } else {
330 throw( "Unrecognized Excel variant" . $opts->{'excel'} );
331 }
332 $alignment_table = _alignment_from_worksheet( $sheet, $need_decode );
333 } else {
334 # Assume it is a comma-, tab-, or whatever-separated format.
335 my $csv_options = { 'binary' => 1 };
336 $csv_options->{'sep_char'} = $opts->{'sep_char'} || "\t";
337 if( $csv_options->{'sep_char'} eq "\t" ) {
338 # If it is really tab separated, nothing is an escape char.
339 $csv_options->{'quote_char'} = undef;
340 $csv_options->{'escape_char'} = undef;
341 }
342 my $csv = Text::CSV->new( $csv_options );
343
344 if( exists $opts->{'string' } ) {
345 my @lines = split( "\n", $opts->{'string'} );
346 foreach my $l ( @lines ) {
347 my $status = $csv->parse( $l );
348 if( $status ) {
349 push( @$alignment_table, [ $csv->fields ] );
350 } else {
351 throw( "Could not parse line $l: " . $csv->error_input );
352 }
353 }
354 } elsif( exists $opts->{'file'} ) {
355 open( my $fh, $opts->{'file'} )
356 or warn "Could not open input file " . $opts->{'file'};
357 binmode( $fh, ':utf8' );
358 while( my $row = $csv->getline( $fh ) ) {
359 push( @$alignment_table, $row );
360 }
361 close $fh;
362 } else {
363 throw( "Could not find string or file option to parse" );
364 }
365 }
366 return $alignment_table;
367}
368sub _alignment_from_worksheet {
369 my( $sheet, $decode ) = @_;
370 my $alignment_table = [];
371
372 my( $rmin, $rmax ) = $sheet->row_range();
373 my( $cmin, $cmax ) = $sheet->col_range();
374 unless( $cmax && $rmax ) {
375 throw( "Found no rows or no columns in first worksheet" );
376 }
377 # Populate the alignment table. We only want columns that have
378 # a sigil in row zero.
379 my %sigcols = ();
380 push( @$alignment_table, [] );
381 foreach my $col ( $cmin .. $cmax ) {
382 my $cell = $sheet->get_cell( $rmin, $col );
383 my $cellval = $cell ? $cell->value() : undef;
384 if( $cellval ) {
385 $sigcols{$col} = 1;
386 push( @{$alignment_table->[0]}, $cellval );
387 }
388 }
389 # Now go through the rest of the rows and pick up the columns
390 # that were headed by a sigil.
391 foreach my $row ( $rmin+1 .. $rmax ) {
392 my @tablerow;
393 foreach my $col ( $cmin .. $cmax ) {
394 next unless $sigcols{$col};
395 my $cell = $sheet->get_cell( $row, $col );
396 my $cellval;
397 if( $cell ) {
398 $cellval = $decode ? decode_utf8( $cell->value ) : $cell->value;
399 }
400 push( @tablerow, $cellval );
401 }
402 push( @$alignment_table, \@tablerow );
403 }
404 return $alignment_table;
405}
406
027d819c 407sub _make_nodes {
9bdf9d67 408 my( $collation, $row, $index, $nocollate ) = @_;
d9e873d0 409 my %unique;
15db7774 410 my $commonctr = 0; # Holds the number of unique readings + gaps, ex. lacunae.
d9e873d0 411 foreach my $w ( @$row ) {
412 $unique{$w} = 1 if $w;
15db7774 413 $commonctr +=1 unless ( $w && $w eq '#LACUNA#' );
d9e873d0 414 }
415 my $ctr = 1;
416 foreach my $w ( keys %unique ) {
a753cc84 417 my $rargs = {
10e4b1ac 418 'id' => "r$index.$ctr",
a753cc84 419 'rank' => $index,
420 'text' => $w,
421 };
15db7774 422 if( $w eq '#LACUNA#' ) {
423 $rargs->{'is_lacuna'} = 1;
424 } elsif( $commonctr == 1 ) {
425 $rargs->{'is_common'} = 1;
426 }
a753cc84 427 my $r = $collation->add_reading( $rargs );
d9e873d0 428 $unique{$w} = $r;
a753cc84 429 $ctr++;
d9e873d0 430 }
bf6e338d 431 # Collate this sequence of readings via a single 'collation' relationship.
9bdf9d67 432 unless( $nocollate ) {
433 my @rankrdgs = values %unique;
434 my $collation_rel;
435 while( @rankrdgs ) {
436 my $r = shift @rankrdgs;
437 next if $r->is_meta;
438 foreach my $nr ( @rankrdgs ) {
439 next if $nr->is_meta;
440 if( $collation_rel ) {
441 $collation->add_relationship( $r, $nr, $collation_rel );
442 } else {
443 $collation->add_relationship( $r, $nr,
444 { 'type' => 'collated',
445 'annotation' => "Parsed together for rank $index" } );
446 $collation_rel = $collation->get_relationship( $r, $nr );
447 }
448 }
449 }
450 }
d9e873d0 451 return \%unique;
452}
453
3a3b8213 454sub throw {
455 Text::Tradition::Error->throw(
456 'ident' => 'Parser::Tabular error',
457 'message' => $_[0],
458 );
459}
460
3b853983 4611;
462
463=head1 LICENSE
464
465This package is free software and is provided "as is" without express
466or implied warranty. You can redistribute it and/or modify it under
467the same terms as Perl itself.
468
469=head1 AUTHOR
470
471Tara L Andrews E<lt>aurum@cpan.orgE<gt>