split off stemma analysis modules from base Tradition layer
[scpubgit/stemmatology.git] / base / lib / Text / Tradition / Parser / Tabular.pm
CommitLineData
d9e873d0 1package Text::Tradition::Parser::Tabular;
2
3use strict;
4use warnings;
3a3b8213 5use Encode qw/ decode_utf8 /;
82fa4d57 6use Text::CSV;
701ad2ba 7use Text::Tradition::Error;
8use TryCatch;
d9e873d0 9
10=head1 NAME
11
12Text::Tradition::Parser::Tabular
13
3b853983 14=head1 SYNOPSIS
15
16 use Text::Tradition;
17
18 my $t_from_file = Text::Tradition->new(
19 'name' => 'my text',
20 'input' => 'Tabular',
21 'file' => '/path/to/collation.csv',
22 'sep_char' => ','
23 );
24
25 my $t_from_string = Text::Tradition->new(
26 'name' => 'my text',
27 'input' => 'Tabular',
28 'string' => $tab_separated_collation,
29 'sep_char' => "\t",
30 );
31
d9e873d0 32=head1 DESCRIPTION
33
34Parser module for Text::Tradition to read an alignment table format, such as CSV.
35
36=head1 METHODS
37
e867486f 38=head2 B<parse>( $tradition, $option_hash )
3b853983 39
40Takes an initialized tradition and a set of options; creates the
41appropriate nodes and edges on the graph, as well as the appropriate
42witness objects. The $option_hash must contain either a 'file' or a
43'string' argument with the table to be parsed; it may also contain a
44'sep_char' argument to specify how the fields are separated.
45
46The table should have witnesses arranged in columns, with the witness sigla
47in the first row. Empty cells are interpreted as omissions (and thus
48stemmatologically relevant.) Longer lacunae in the text, to be disregarded
49in cladistic analysis, may be represented by filling the appropriate cells
50with the tag '#LACUNA#'.
51
52If a witness name ends in the collation's ac_label, it will be treated as
53an 'ante-correction' version of the 'main' witness whose sigil it shares.
54
55=begin testing
56
57use Text::Tradition;
58binmode STDOUT, ":utf8";
59binmode STDERR, ":utf8";
60eval { no warnings; binmode $DB::OUT, ":utf8"; };
61
62my $csv = 't/data/florilegium.csv';
63my $t = Text::Tradition->new(
64 'name' => 'inline',
65 'input' => 'Tabular',
66 'file' => $csv,
67 'sep_char' => ',',
68 );
d9e873d0 69
3b853983 70is( ref( $t ), 'Text::Tradition', "Parsed florilegium CSV file" );
d9e873d0 71
3b853983 72### TODO Check these figures
73if( $t ) {
0e47f4f6 74 is( scalar $t->collation->readings, 311, "Collation has all readings" );
75 is( scalar $t->collation->paths, 361, "Collation has all paths" );
3b853983 76 is( scalar $t->witnesses, 13, "Collation has all witnesses" );
77}
78
b0b4421a 79# Check that we have the right witnesses
80my %seen_wits;
81map { $seen_wits{$_} = 0 } qw/ A B C D E F G H K P Q S T /;
82foreach my $wit ( $t->witnesses ) {
83 $seen_wits{$wit->sigil} = 1;
84}
85is( scalar keys %seen_wits, 13, "No extra witnesses were made" );
86foreach my $k ( keys %seen_wits ) {
87 ok( $seen_wits{$k}, "Witness $k still exists" );
88}
89
90# Check that the witnesses have the right texts
91foreach my $wit ( $t->witnesses ) {
92 my $origtext = join( ' ', @{$wit->text} );
93 my $graphtext = $t->collation->path_text( $wit->sigil );
94 is( $graphtext, $origtext, "Collation matches original for witness " . $wit->sigil );
95}
96
97# Check that the a.c. witnesses have the right text
98map { $seen_wits{$_} = 0 } qw/ A B C D F G H K S /;
99foreach my $k ( keys %seen_wits ) {
100 my $wit = $t->witness( $k );
101 if( $seen_wits{$k} ) {
102 ok( $wit->is_layered, "Witness $k got marked as layered" );
103 ok( $wit->has_layertext, "Witness $k has an a.c. version" );
104 my $origtext = join( ' ', @{$wit->layertext} );
105 my $acsig = $wit->sigil . $t->collation->ac_label;
861c3e27 106 my $graphtext = $t->collation->path_text( $acsig );
b0b4421a 107 is( $graphtext, $origtext, "Collation matches original a.c. for witness $k" );
108 } else {
109 ok( !$wit->is_layered, "Witness $k not marked as layered" );
110 ok( !$wit->has_layertext, "Witness $k has no a.c. version" );
111 }
112}
113
cc31ebaa 114# Check that we only have collation relationships where we need them
115is( scalar $t->collation->relationships, 3, "Redundant collations were removed" );
cc31ebaa 116
701ad2ba 117## Check excel parsing
118
119my $xls = 't/data/armexample.xls';
120my $xt = Text::Tradition->new(
121 'name' => 'excel test',
122 'input' => 'Tabular',
123 'file' => $xls,
3a3b8213 124 'excel' => 'xls'
701ad2ba 125 );
126
127is( ref( $xt ), 'Text::Tradition', "Parsed test Excel 97-2004 file" );
128my %xls_wits;
129map { $xls_wits{$_} = 0 } qw/ Wit1 Wit2 Wit3 /;
130foreach my $wit ( $xt->witnesses ) {
131 $xls_wits{$wit->sigil} = 1;
132}
133is( scalar keys %xls_wits, 3, "No extra witnesses were made" );
134foreach my $k ( keys %xls_wits ) {
135 ok( $xls_wits{$k}, "Witness $k still exists" );
136}
137is( scalar $xt->collation->readings, 11, "Got correct number of test readings" );
138is( scalar $xt->collation->paths, 13, "Got correct number of reading paths" );
139is( $xt->collation->reading('r5.1')->text, "\x{587}",
140 "Correct decoding of at least one reading" );
141
3a3b8213 142my $xlsx = 't/data/armexample.xlsx';
143my $xtx = Text::Tradition->new(
144 'name' => 'excel test',
145 'input' => 'Tabular',
146 'file' => $xlsx,
147 'excel' => 'xlsx'
148 );
149
150is( ref( $xtx ), 'Text::Tradition', "Parsed test Excel 2007+ file" );
151my %xlsx_wits;
152map { $xlsx_wits{$_} = 0 } qw/ Wit1 Wit2 Wit3 /;
153foreach my $wit ( $xtx->witnesses ) {
154 $xlsx_wits{$wit->sigil} = 1;
155}
156is( scalar keys %xlsx_wits, 3, "No extra witnesses were made" );
157foreach my $k ( keys %xlsx_wits ) {
158 ok( $xlsx_wits{$k}, "Witness $k still exists" );
159}
160is( scalar $xtx->collation->readings, 12, "Got correct number of test readings" );
161is( scalar $xtx->collation->paths, 14, "Got correct number of reading paths" );
162is( $xtx->collation->reading('r5.1')->text, "\x{587}",
163 "Correct decoding of at least one reading" );
164
3b853983 165=end testing
d9e873d0 166
167=cut
168
169sub parse {
dfc37e38 170 my( $tradition, $opts ) = @_;
3a3b8213 171 my $alignment_table = _table_from_input( $opts );
d9e873d0 172 # Set up the witnesses we find in the first line
173 my @witnesses;
b0b4421a 174 my %ac_wits; # Track layered witness -> main witness mapping
3a3b8213 175 my $c = $tradition->collation; # shorthand
82fa4d57 176 my $aclabel = $c->ac_label;
d9e873d0 177 foreach my $sigil ( @{$alignment_table->[0]} ) {
3b853983 178 if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) {
82fa4d57 179 # Sanitize the sigil name to an XML name
180 $sigil = $1 . '_layered';
b0b4421a 181 $ac_wits{$sigil} = $1;
3b853983 182 }
82fa4d57 183 my $wit = $tradition->add_witness(
184 'sigil' => $sigil, 'sourcetype' => 'collation' );
185 $wit->path( [ $c->start ] );
186 push( @witnesses, $wit );
187 my $aclabel = $c->ac_label;
d9e873d0 188 }
189
b0b4421a 190 # Save the original witness text sequences. Have to loop back through
191 # the witness columns after we have identified all the a.c. witnesses.
192 foreach my $idx ( 0 .. $#{$alignment_table->[0]} ) {
193 my @sequence = map { $_->[$idx] } @{$alignment_table};
194 my $sigil = shift @sequence;
195 my $is_layer = exists( $ac_wits{$sigil} );
196 my $wit = $tradition->witness( $is_layer ? $ac_wits{$sigil} : $sigil );
197 # Now get rid of gaps and meta-readings like #LACUNA#
198 my @words = grep { $_ && $_ !~ /^\#.*\#$/ } @sequence;
199 $is_layer ? $wit->layertext( \@words ) : $wit->text( \@words );
200 }
201
9bdf9d67 202 my $nocollate = ( scalar( @witnesses ) * scalar @$alignment_table ) > 150000;
203 print STDERR "Tradition too big for row collation\n" if $nocollate;
204
d9e873d0 205 # Now for the next rows, make nodes as necessary, assign their ranks, and
206 # add them to the witness paths.
d9e873d0 207 foreach my $idx ( 1 .. $#{$alignment_table} ) {
208 my $row = $alignment_table->[$idx];
9bdf9d67 209 my $nodes = _make_nodes( $c, $row, $idx, $nocollate );
d9e873d0 210 foreach my $w ( 0 .. $#{$row} ) {
211 # push the appropriate node onto the appropriate witness path
212 my $word = $row->[$w];
213 if( $word ) {
214 my $reading = $nodes->{$word};
215 my $wit = $witnesses[$w];
216 push( @{$wit->path}, $reading );
217 } # else skip it for empty readings.
218 }
219 }
220
eca16057 221 # Collapse our lacunae into a single node and
222 # push the end node onto all paths.
d9e873d0 223 $c->end->rank( scalar @$alignment_table );
224 foreach my $wit ( @witnesses ) {
eca16057 225 my $p = $wit->path;
226 my $last_rdg = shift @$p;
227 my $new_p = [ $last_rdg ];
228 foreach my $rdg ( @$p ) {
83d5ac3a 229 # Omit the reading if we are in a lacuna already.
230 next if $rdg->is_lacuna && $last_rdg->is_lacuna;
231 # Save the reading otherwise.
232 push( @$new_p, $rdg );
233 $last_rdg = $rdg;
eca16057 234 }
235 push( @$new_p, $c->end );
236 $wit->path( $new_p );
d9e873d0 237 }
238
3b853983 239 # Fold any a.c. witnesses into their main witness objects, and
240 # delete the independent a.c. versions.
241 foreach my $a ( keys %ac_wits ) {
b0b4421a 242 my $ac_wit = $tradition->witness( $a );
243 my $main_wit = $tradition->witness( $ac_wits{$a} );
3b853983 244 next unless $main_wit;
861c3e27 245 $main_wit->is_layered(1);
3b853983 246 $main_wit->uncorrected_path( $ac_wit->path );
247 $tradition->del_witness( $ac_wit );
248 }
83d5ac3a 249
d9e873d0 250 # Join up the paths.
251 $c->make_witness_paths;
83d5ac3a 252 # Delete our unused lacuna nodes.
253 foreach my $rdg ( grep { $_->is_lacuna } $c->readings ) {
254 $c->del_reading( $rdg ) unless $c->reading_witnesses( $rdg );
255 }
861c3e27 256
257 # Do a consistency check.
258 foreach my $wit ( $tradition->witnesses ) {
259 my $pathtext = $c->path_text( $wit->sigil );
260 my $origtext = join( ' ', @{$wit->text} );
261 warn "Text differs for witness " . $wit->sigil
262 unless $pathtext eq $origtext;
263 if( $wit->is_layered ) {
264 $pathtext = $c->path_text( $wit->sigil.$c->ac_label );
265 $origtext = join( ' ', @{$wit->layertext} );
266 warn "Ante-corr text differs for witness " . $wit->sigil
267 unless $pathtext eq $origtext;
268 } else {
269 warn "Text " . $wit->sigil . " has a layered text but is not marked as layered"
270 if $wit->has_layertext;
271 }
272 }
202ccb18 273
274 # Note that our ranks and common readings are set.
275 $c->_graphcalc_done(1);
cc31ebaa 276 # Remove redundant collation relationships.
9bdf9d67 277 $c->relations->filter_collations() unless $nocollate;
d9e873d0 278}
279
3a3b8213 280sub _table_from_input {
281 my $opts = shift;
282 my $alignment_table = [];
283 if( $opts->{'excel'} ) {
284 my $sheet;
285 my $need_decode;
286 unless( exists $opts->{'file'} ) {
287 throw( "Must pass the filename for Excel parsing" );
288 }
289 if( $opts->{'excel'} eq 'xls' ) {
290 try {
291 require Spreadsheet::ParseExcel;
292 } catch {
293 throw( "Need module Spreadsheet::ParseExcel to parse .xls files" );
294 }
295 my $parser = Spreadsheet::ParseExcel->new();
296 my $workbook = $parser->parse( $opts->{'file'} );
297 unless( defined $workbook && defined $workbook->worksheet(0) ) {
298 throw( "Failed to parse file " . $opts->{'file'} . ": " . $parser->error() );
299 }
300 $sheet = $workbook->worksheet(0);
301 } elsif( $opts->{'excel'} eq 'xlsx' ) {
302 try {
303 require Spreadsheet::XLSX;
304 } catch {
305 throw( "Need module Spreadsheet::XLSX to parse .xlsx files" );
306 }
307 $need_decode = 1;
308 my $workbook;
309 try {
310 $workbook = Spreadsheet::XLSX->new( $opts->{'file'} );
311 } catch {
312 throw( "Failed to parse file " . $opts->{'file'} );
313 }
314 $sheet = $workbook->worksheet(0);
315 } else {
316 throw( "Unrecognized Excel variant" . $opts->{'excel'} );
317 }
318 $alignment_table = _alignment_from_worksheet( $sheet, $need_decode );
319 } else {
320 # Assume it is a comma-, tab-, or whatever-separated format.
321 my $csv_options = { 'binary' => 1 };
322 $csv_options->{'sep_char'} = $opts->{'sep_char'} || "\t";
323 if( $csv_options->{'sep_char'} eq "\t" ) {
324 # If it is really tab separated, nothing is an escape char.
325 $csv_options->{'quote_char'} = undef;
326 $csv_options->{'escape_char'} = undef;
327 }
328 my $csv = Text::CSV->new( $csv_options );
329
330 if( exists $opts->{'string' } ) {
331 my @lines = split( "\n", $opts->{'string'} );
332 foreach my $l ( @lines ) {
333 my $status = $csv->parse( $l );
334 if( $status ) {
335 push( @$alignment_table, [ $csv->fields ] );
336 } else {
337 throw( "Could not parse line $l: " . $csv->error_input );
338 }
339 }
340 } elsif( exists $opts->{'file'} ) {
341 open( my $fh, $opts->{'file'} )
342 or warn "Could not open input file " . $opts->{'file'};
343 binmode( $fh, ':utf8' );
344 while( my $row = $csv->getline( $fh ) ) {
345 push( @$alignment_table, $row );
346 }
347 close $fh;
348 } else {
349 throw( "Could not find string or file option to parse" );
350 }
351 }
352 return $alignment_table;
353}
354sub _alignment_from_worksheet {
355 my( $sheet, $decode ) = @_;
356 my $alignment_table = [];
357
358 my( $rmin, $rmax ) = $sheet->row_range();
359 my( $cmin, $cmax ) = $sheet->col_range();
360 unless( $cmax && $rmax ) {
361 throw( "Found no rows or no columns in first worksheet" );
362 }
363 # Populate the alignment table. We only want columns that have
364 # a sigil in row zero.
365 my %sigcols = ();
366 push( @$alignment_table, [] );
367 foreach my $col ( $cmin .. $cmax ) {
368 my $cell = $sheet->get_cell( $rmin, $col );
369 my $cellval = $cell ? $cell->value() : undef;
370 if( $cellval ) {
371 $sigcols{$col} = 1;
372 push( @{$alignment_table->[0]}, $cellval );
373 }
374 }
375 # Now go through the rest of the rows and pick up the columns
376 # that were headed by a sigil.
377 foreach my $row ( $rmin+1 .. $rmax ) {
378 my @tablerow;
379 foreach my $col ( $cmin .. $cmax ) {
380 next unless $sigcols{$col};
381 my $cell = $sheet->get_cell( $row, $col );
382 my $cellval;
383 if( $cell ) {
384 $cellval = $decode ? decode_utf8( $cell->value ) : $cell->value;
385 }
386 push( @tablerow, $cellval );
387 }
388 push( @$alignment_table, \@tablerow );
389 }
390 return $alignment_table;
391}
392
027d819c 393sub _make_nodes {
9bdf9d67 394 my( $collation, $row, $index, $nocollate ) = @_;
d9e873d0 395 my %unique;
15db7774 396 my $commonctr = 0; # Holds the number of unique readings + gaps, ex. lacunae.
d9e873d0 397 foreach my $w ( @$row ) {
398 $unique{$w} = 1 if $w;
15db7774 399 $commonctr +=1 unless ( $w && $w eq '#LACUNA#' );
d9e873d0 400 }
401 my $ctr = 1;
402 foreach my $w ( keys %unique ) {
a753cc84 403 my $rargs = {
10e4b1ac 404 'id' => "r$index.$ctr",
a753cc84 405 'rank' => $index,
406 'text' => $w,
407 };
15db7774 408 if( $w eq '#LACUNA#' ) {
409 $rargs->{'is_lacuna'} = 1;
410 } elsif( $commonctr == 1 ) {
411 $rargs->{'is_common'} = 1;
412 }
a753cc84 413 my $r = $collation->add_reading( $rargs );
d9e873d0 414 $unique{$w} = $r;
a753cc84 415 $ctr++;
d9e873d0 416 }
bf6e338d 417 # Collate this sequence of readings via a single 'collation' relationship.
9bdf9d67 418 unless( $nocollate ) {
419 my @rankrdgs = values %unique;
420 my $collation_rel;
421 while( @rankrdgs ) {
422 my $r = shift @rankrdgs;
423 next if $r->is_meta;
424 foreach my $nr ( @rankrdgs ) {
425 next if $nr->is_meta;
426 if( $collation_rel ) {
427 $collation->add_relationship( $r, $nr, $collation_rel );
428 } else {
429 $collation->add_relationship( $r, $nr,
430 { 'type' => 'collated',
431 'annotation' => "Parsed together for rank $index" } );
432 $collation_rel = $collation->get_relationship( $r, $nr );
433 }
434 }
435 }
436 }
d9e873d0 437 return \%unique;
438}
439
3a3b8213 440sub throw {
441 Text::Tradition::Error->throw(
442 'ident' => 'Parser::Tabular error',
443 'message' => $_[0],
444 );
445}
446
3b853983 4471;
448
449=head1 LICENSE
450
451This package is free software and is provided "as is" without express
452or implied warranty. You can redistribute it and/or modify it under
453the same terms as Perl itself.
454
455=head1 AUTHOR
456
457Tara L Andrews E<lt>aurum@cpan.orgE<gt>