XML parsers should accept already-parsed XML object too
[scpubgit/stemmatology.git] / base / lib / Text / Tradition / Parser / Tabular.pm
CommitLineData
d9e873d0 1package Text::Tradition::Parser::Tabular;
2
3use strict;
4use warnings;
3a3b8213 5use Encode qw/ decode_utf8 /;
82fa4d57 6use Text::CSV;
701ad2ba 7use Text::Tradition::Error;
8use TryCatch;
d9e873d0 9
10=head1 NAME
11
12Text::Tradition::Parser::Tabular
13
3b853983 14=head1 SYNOPSIS
15
16 use Text::Tradition;
17
18 my $t_from_file = Text::Tradition->new(
19 'name' => 'my text',
20 'input' => 'Tabular',
21 'file' => '/path/to/collation.csv',
22 'sep_char' => ','
23 );
24
25 my $t_from_string = Text::Tradition->new(
26 'name' => 'my text',
27 'input' => 'Tabular',
28 'string' => $tab_separated_collation,
29 'sep_char' => "\t",
30 );
31
d9e873d0 32=head1 DESCRIPTION
33
a445ce40 34Parser module for Text::Tradition to read an alignment table format, such as
35CSV or Excel.
d9e873d0 36
37=head1 METHODS
38
e867486f 39=head2 B<parse>( $tradition, $option_hash )
3b853983 40
41Takes an initialized tradition and a set of options; creates the
42appropriate nodes and edges on the graph, as well as the appropriate
a445ce40 43witness objects. The $option_hash can contain the following:
44
45=over
46
47=item * file - Name of file which contains the data
48
49=item * string - A string that itself contains the data. One of 'file' or
50'string' is required.
51
52=item * sep_char - For plaintext formats, the field separation character.
53Defaults to "\t" (tab-separated); should be "," for comma-separated format.
54
55=item * excel - If the data is in an Excel file, this option should be set
56to 'xls' (for pre-2007 Excel format) or 'xlsx' (for Excel XML format.)
57
58=back
59
60The data should comprise a table with witnesses arranged in columns, with
61the witness sigla in the first row. Empty cells are interpreted as
62omissions (and thus stemmatologically relevant.) Longer lacunae in the
63text, to be disregarded in cladistic analysis, may be represented by
64filling the appropriate cells with the tag '#LACUNA#'.
3b853983 65
66If a witness name ends in the collation's ac_label, it will be treated as
67an 'ante-correction' version of the 'main' witness whose sigil it shares.
68
69=begin testing
70
fa6bc75d 71use Test::More::UTF8;
3b853983 72use Text::Tradition;
73binmode STDOUT, ":utf8";
74binmode STDERR, ":utf8";
75eval { no warnings; binmode $DB::OUT, ":utf8"; };
76
77my $csv = 't/data/florilegium.csv';
78my $t = Text::Tradition->new(
79 'name' => 'inline',
80 'input' => 'Tabular',
81 'file' => $csv,
82 'sep_char' => ',',
83 );
d9e873d0 84
3b853983 85is( ref( $t ), 'Text::Tradition', "Parsed florilegium CSV file" );
d9e873d0 86
3b853983 87### TODO Check these figures
88if( $t ) {
0e47f4f6 89 is( scalar $t->collation->readings, 311, "Collation has all readings" );
90 is( scalar $t->collation->paths, 361, "Collation has all paths" );
3b853983 91 is( scalar $t->witnesses, 13, "Collation has all witnesses" );
92}
93
b0b4421a 94# Check that we have the right witnesses
95my %seen_wits;
96map { $seen_wits{$_} = 0 } qw/ A B C D E F G H K P Q S T /;
97foreach my $wit ( $t->witnesses ) {
98 $seen_wits{$wit->sigil} = 1;
99}
100is( scalar keys %seen_wits, 13, "No extra witnesses were made" );
101foreach my $k ( keys %seen_wits ) {
102 ok( $seen_wits{$k}, "Witness $k still exists" );
103}
104
105# Check that the witnesses have the right texts
106foreach my $wit ( $t->witnesses ) {
107 my $origtext = join( ' ', @{$wit->text} );
108 my $graphtext = $t->collation->path_text( $wit->sigil );
109 is( $graphtext, $origtext, "Collation matches original for witness " . $wit->sigil );
110}
111
112# Check that the a.c. witnesses have the right text
113map { $seen_wits{$_} = 0 } qw/ A B C D F G H K S /;
114foreach my $k ( keys %seen_wits ) {
115 my $wit = $t->witness( $k );
116 if( $seen_wits{$k} ) {
117 ok( $wit->is_layered, "Witness $k got marked as layered" );
118 ok( $wit->has_layertext, "Witness $k has an a.c. version" );
119 my $origtext = join( ' ', @{$wit->layertext} );
120 my $acsig = $wit->sigil . $t->collation->ac_label;
861c3e27 121 my $graphtext = $t->collation->path_text( $acsig );
b0b4421a 122 is( $graphtext, $origtext, "Collation matches original a.c. for witness $k" );
123 } else {
124 ok( !$wit->is_layered, "Witness $k not marked as layered" );
125 ok( !$wit->has_layertext, "Witness $k has no a.c. version" );
126 }
127}
128
cc31ebaa 129# Check that we only have collation relationships where we need them
130is( scalar $t->collation->relationships, 3, "Redundant collations were removed" );
cc31ebaa 131
701ad2ba 132## Check excel parsing
133
134my $xls = 't/data/armexample.xls';
135my $xt = Text::Tradition->new(
136 'name' => 'excel test',
137 'input' => 'Tabular',
138 'file' => $xls,
3a3b8213 139 'excel' => 'xls'
701ad2ba 140 );
141
142is( ref( $xt ), 'Text::Tradition', "Parsed test Excel 97-2004 file" );
143my %xls_wits;
144map { $xls_wits{$_} = 0 } qw/ Wit1 Wit2 Wit3 /;
145foreach my $wit ( $xt->witnesses ) {
146 $xls_wits{$wit->sigil} = 1;
147}
148is( scalar keys %xls_wits, 3, "No extra witnesses were made" );
149foreach my $k ( keys %xls_wits ) {
150 ok( $xls_wits{$k}, "Witness $k still exists" );
151}
152is( scalar $xt->collation->readings, 11, "Got correct number of test readings" );
153is( scalar $xt->collation->paths, 13, "Got correct number of reading paths" );
154is( $xt->collation->reading('r5.1')->text, "\x{587}",
155 "Correct decoding of at least one reading" );
156
3a3b8213 157my $xlsx = 't/data/armexample.xlsx';
158my $xtx = Text::Tradition->new(
159 'name' => 'excel test',
160 'input' => 'Tabular',
161 'file' => $xlsx,
162 'excel' => 'xlsx'
163 );
164
165is( ref( $xtx ), 'Text::Tradition', "Parsed test Excel 2007+ file" );
166my %xlsx_wits;
fa6bc75d 167map { $xlsx_wits{$_} = 0 } qw/ Wit1 Wit3 /;
168$xlsx_wits{"\x{531}\x{562}2"} = 0;
3a3b8213 169foreach my $wit ( $xtx->witnesses ) {
170 $xlsx_wits{$wit->sigil} = 1;
171}
172is( scalar keys %xlsx_wits, 3, "No extra witnesses were made" );
173foreach my $k ( keys %xlsx_wits ) {
174 ok( $xlsx_wits{$k}, "Witness $k still exists" );
175}
176is( scalar $xtx->collation->readings, 12, "Got correct number of test readings" );
177is( scalar $xtx->collation->paths, 14, "Got correct number of reading paths" );
178is( $xtx->collation->reading('r5.1')->text, "\x{587}",
179 "Correct decoding of at least one reading" );
180
3b853983 181=end testing
d9e873d0 182
183=cut
184
185sub parse {
dfc37e38 186 my( $tradition, $opts ) = @_;
3a3b8213 187 my $alignment_table = _table_from_input( $opts );
d9e873d0 188 # Set up the witnesses we find in the first line
189 my @witnesses;
b0b4421a 190 my %ac_wits; # Track layered witness -> main witness mapping
3a3b8213 191 my $c = $tradition->collation; # shorthand
82fa4d57 192 my $aclabel = $c->ac_label;
d9e873d0 193 foreach my $sigil ( @{$alignment_table->[0]} ) {
3b853983 194 if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) {
82fa4d57 195 # Sanitize the sigil name to an XML name
196 $sigil = $1 . '_layered';
b0b4421a 197 $ac_wits{$sigil} = $1;
3b853983 198 }
82fa4d57 199 my $wit = $tradition->add_witness(
200 'sigil' => $sigil, 'sourcetype' => 'collation' );
201 $wit->path( [ $c->start ] );
202 push( @witnesses, $wit );
203 my $aclabel = $c->ac_label;
d9e873d0 204 }
205
b0b4421a 206 # Save the original witness text sequences. Have to loop back through
207 # the witness columns after we have identified all the a.c. witnesses.
208 foreach my $idx ( 0 .. $#{$alignment_table->[0]} ) {
209 my @sequence = map { $_->[$idx] } @{$alignment_table};
210 my $sigil = shift @sequence;
211 my $is_layer = exists( $ac_wits{$sigil} );
212 my $wit = $tradition->witness( $is_layer ? $ac_wits{$sigil} : $sigil );
213 # Now get rid of gaps and meta-readings like #LACUNA#
214 my @words = grep { $_ && $_ !~ /^\#.*\#$/ } @sequence;
215 $is_layer ? $wit->layertext( \@words ) : $wit->text( \@words );
216 }
217
9bdf9d67 218 my $nocollate = ( scalar( @witnesses ) * scalar @$alignment_table ) > 150000;
219 print STDERR "Tradition too big for row collation\n" if $nocollate;
220
d9e873d0 221 # Now for the next rows, make nodes as necessary, assign their ranks, and
222 # add them to the witness paths.
d9e873d0 223 foreach my $idx ( 1 .. $#{$alignment_table} ) {
224 my $row = $alignment_table->[$idx];
9bdf9d67 225 my $nodes = _make_nodes( $c, $row, $idx, $nocollate );
d9e873d0 226 foreach my $w ( 0 .. $#{$row} ) {
227 # push the appropriate node onto the appropriate witness path
228 my $word = $row->[$w];
229 if( $word ) {
230 my $reading = $nodes->{$word};
231 my $wit = $witnesses[$w];
232 push( @{$wit->path}, $reading );
233 } # else skip it for empty readings.
234 }
235 }
236
eca16057 237 # Collapse our lacunae into a single node and
238 # push the end node onto all paths.
d9e873d0 239 $c->end->rank( scalar @$alignment_table );
240 foreach my $wit ( @witnesses ) {
eca16057 241 my $p = $wit->path;
242 my $last_rdg = shift @$p;
243 my $new_p = [ $last_rdg ];
244 foreach my $rdg ( @$p ) {
83d5ac3a 245 # Omit the reading if we are in a lacuna already.
246 next if $rdg->is_lacuna && $last_rdg->is_lacuna;
247 # Save the reading otherwise.
248 push( @$new_p, $rdg );
249 $last_rdg = $rdg;
eca16057 250 }
251 push( @$new_p, $c->end );
252 $wit->path( $new_p );
d9e873d0 253 }
254
3b853983 255 # Fold any a.c. witnesses into their main witness objects, and
256 # delete the independent a.c. versions.
257 foreach my $a ( keys %ac_wits ) {
b0b4421a 258 my $ac_wit = $tradition->witness( $a );
259 my $main_wit = $tradition->witness( $ac_wits{$a} );
3b853983 260 next unless $main_wit;
861c3e27 261 $main_wit->is_layered(1);
3b853983 262 $main_wit->uncorrected_path( $ac_wit->path );
263 $tradition->del_witness( $ac_wit );
264 }
83d5ac3a 265
d9e873d0 266 # Join up the paths.
267 $c->make_witness_paths;
83d5ac3a 268 # Delete our unused lacuna nodes.
269 foreach my $rdg ( grep { $_->is_lacuna } $c->readings ) {
270 $c->del_reading( $rdg ) unless $c->reading_witnesses( $rdg );
271 }
861c3e27 272
273 # Do a consistency check.
274 foreach my $wit ( $tradition->witnesses ) {
275 my $pathtext = $c->path_text( $wit->sigil );
276 my $origtext = join( ' ', @{$wit->text} );
277 warn "Text differs for witness " . $wit->sigil
278 unless $pathtext eq $origtext;
279 if( $wit->is_layered ) {
280 $pathtext = $c->path_text( $wit->sigil.$c->ac_label );
281 $origtext = join( ' ', @{$wit->layertext} );
282 warn "Ante-corr text differs for witness " . $wit->sigil
283 unless $pathtext eq $origtext;
284 } else {
285 warn "Text " . $wit->sigil . " has a layered text but is not marked as layered"
286 if $wit->has_layertext;
287 }
288 }
202ccb18 289
290 # Note that our ranks and common readings are set.
291 $c->_graphcalc_done(1);
98a66507 292 _add_collations( $c ) unless $nocollate;
d9e873d0 293}
294
3a3b8213 295sub _table_from_input {
296 my $opts = shift;
297 my $alignment_table = [];
298 if( $opts->{'excel'} ) {
299 my $sheet;
300 my $need_decode;
301 unless( exists $opts->{'file'} ) {
302 throw( "Must pass the filename for Excel parsing" );
303 }
304 if( $opts->{'excel'} eq 'xls' ) {
305 try {
306 require Spreadsheet::ParseExcel;
307 } catch {
308 throw( "Need module Spreadsheet::ParseExcel to parse .xls files" );
309 }
310 my $parser = Spreadsheet::ParseExcel->new();
311 my $workbook = $parser->parse( $opts->{'file'} );
312 unless( defined $workbook && defined $workbook->worksheet(0) ) {
313 throw( "Failed to parse file " . $opts->{'file'} . ": " . $parser->error() );
314 }
315 $sheet = $workbook->worksheet(0);
316 } elsif( $opts->{'excel'} eq 'xlsx' ) {
317 try {
318 require Spreadsheet::XLSX;
319 } catch {
320 throw( "Need module Spreadsheet::XLSX to parse .xlsx files" );
321 }
322 $need_decode = 1;
323 my $workbook;
324 try {
325 $workbook = Spreadsheet::XLSX->new( $opts->{'file'} );
326 } catch {
327 throw( "Failed to parse file " . $opts->{'file'} );
328 }
329 $sheet = $workbook->worksheet(0);
330 } else {
331 throw( "Unrecognized Excel variant" . $opts->{'excel'} );
332 }
333 $alignment_table = _alignment_from_worksheet( $sheet, $need_decode );
334 } else {
335 # Assume it is a comma-, tab-, or whatever-separated format.
336 my $csv_options = { 'binary' => 1 };
337 $csv_options->{'sep_char'} = $opts->{'sep_char'} || "\t";
338 if( $csv_options->{'sep_char'} eq "\t" ) {
339 # If it is really tab separated, nothing is an escape char.
340 $csv_options->{'quote_char'} = undef;
341 $csv_options->{'escape_char'} = undef;
342 }
343 my $csv = Text::CSV->new( $csv_options );
344
345 if( exists $opts->{'string' } ) {
346 my @lines = split( "\n", $opts->{'string'} );
347 foreach my $l ( @lines ) {
348 my $status = $csv->parse( $l );
349 if( $status ) {
350 push( @$alignment_table, [ $csv->fields ] );
351 } else {
352 throw( "Could not parse line $l: " . $csv->error_input );
353 }
354 }
355 } elsif( exists $opts->{'file'} ) {
356 open( my $fh, $opts->{'file'} )
357 or warn "Could not open input file " . $opts->{'file'};
358 binmode( $fh, ':utf8' );
359 while( my $row = $csv->getline( $fh ) ) {
360 push( @$alignment_table, $row );
361 }
362 close $fh;
363 } else {
364 throw( "Could not find string or file option to parse" );
365 }
366 }
367 return $alignment_table;
368}
369sub _alignment_from_worksheet {
370 my( $sheet, $decode ) = @_;
371 my $alignment_table = [];
372
373 my( $rmin, $rmax ) = $sheet->row_range();
374 my( $cmin, $cmax ) = $sheet->col_range();
375 unless( $cmax && $rmax ) {
376 throw( "Found no rows or no columns in first worksheet" );
377 }
378 # Populate the alignment table. We only want columns that have
379 # a sigil in row zero.
380 my %sigcols = ();
381 push( @$alignment_table, [] );
382 foreach my $col ( $cmin .. $cmax ) {
383 my $cell = $sheet->get_cell( $rmin, $col );
fa6bc75d 384 my $cellval;
385 if( $cell ) {
386 $cellval = $decode ? decode_utf8( $cell->value ) : $cell->value;
387 }
3a3b8213 388 if( $cellval ) {
389 $sigcols{$col} = 1;
390 push( @{$alignment_table->[0]}, $cellval );
391 }
392 }
393 # Now go through the rest of the rows and pick up the columns
394 # that were headed by a sigil.
395 foreach my $row ( $rmin+1 .. $rmax ) {
396 my @tablerow;
397 foreach my $col ( $cmin .. $cmax ) {
398 next unless $sigcols{$col};
399 my $cell = $sheet->get_cell( $row, $col );
400 my $cellval;
401 if( $cell ) {
402 $cellval = $decode ? decode_utf8( $cell->value ) : $cell->value;
403 }
404 push( @tablerow, $cellval );
405 }
406 push( @$alignment_table, \@tablerow );
407 }
408 return $alignment_table;
409}
410
027d819c 411sub _make_nodes {
9bdf9d67 412 my( $collation, $row, $index, $nocollate ) = @_;
d9e873d0 413 my %unique;
15db7774 414 my $commonctr = 0; # Holds the number of unique readings + gaps, ex. lacunae.
d9e873d0 415 foreach my $w ( @$row ) {
416 $unique{$w} = 1 if $w;
15db7774 417 $commonctr +=1 unless ( $w && $w eq '#LACUNA#' );
d9e873d0 418 }
419 my $ctr = 1;
420 foreach my $w ( keys %unique ) {
a753cc84 421 my $rargs = {
10e4b1ac 422 'id' => "r$index.$ctr",
a753cc84 423 'rank' => $index,
424 'text' => $w,
425 };
15db7774 426 if( $w eq '#LACUNA#' ) {
427 $rargs->{'is_lacuna'} = 1;
428 } elsif( $commonctr == 1 ) {
429 $rargs->{'is_common'} = 1;
430 }
a753cc84 431 my $r = $collation->add_reading( $rargs );
d9e873d0 432 $unique{$w} = $r;
a753cc84 433 $ctr++;
d9e873d0 434 }
98a66507 435 return \%unique;
436}
437
438sub _add_collations {
439 my( $collation ) = shift;
440 # For each reading that needs to be held in place, add a 'collated'
441 # relationship to whatever anchor we can find. An anchor is a reading
442 # that would occupy its rank by virtue of being subsequent to a
443 # reading at $rank-1.
444 my @collate_pairs;
445 foreach my $r ( 1 .. $collation->end->rank - 1 ) {
98a66507 446 my $anchor;
447 my @need_weak;
448 my @here = grep { !$_->is_meta } $collation->readings_at_rank( $r );
449 next unless @here > 1;
450 foreach my $rdg ( @here ) {
451 my $ip = 0;
452 foreach my $pred ( $rdg->predecessors ) {
453 if( $pred->rank == $r - 1 ) {
454 $ip = 1;
455 $anchor = $rdg unless( $anchor );
456 last;
9bdf9d67 457 }
458 }
98a66507 459 push( @need_weak, $rdg ) unless $ip;
9bdf9d67 460 }
98a66507 461 $anchor
462 ? map { push( @collate_pairs, [ $r, $anchor, $_ ] ) } @need_weak
463 : print STDERR "No anchor found at $r\n";
464 }
465 foreach my $p ( @collate_pairs ) {
466 my $r = shift @$p;
467 $collation->add_relationship( @$p,
468 { 'type' => 'collated',
469 'annotation' => "Collated together for rank $r" } )
470 unless $collation->get_relationship( @$p )
471 }
d9e873d0 472}
473
3a3b8213 474sub throw {
475 Text::Tradition::Error->throw(
476 'ident' => 'Parser::Tabular error',
477 'message' => $_[0],
478 );
479}
480
3b853983 4811;
482
483=head1 LICENSE
484
485This package is free software and is provided "as is" without express
486or implied warranty. You can redistribute it and/or modify it under
487the same terms as Perl itself.
488
489=head1 AUTHOR
490
491Tara L Andrews E<lt>aurum@cpan.orgE<gt>