1 package Text::Tradition::Parser::Tabular;
5 use Encode qw/ decode_utf8 /;
7 use Text::Tradition::Error;
12 Text::Tradition::Parser::Tabular
18 my $t_from_file = Text::Tradition->new(
21 'file' => '/path/to/collation.csv',
25 my $t_from_string = Text::Tradition->new(
28 'string' => $tab_separated_collation,
34 Parser module for Text::Tradition to read an alignment table format, such as CSV.
38 =head2 B<parse>( $tradition, $option_hash )
40 Takes an initialized tradition and a set of options; creates the
41 appropriate nodes and edges on the graph, as well as the appropriate
42 witness objects. The $option_hash must contain either a 'file' or a
43 'string' argument with the table to be parsed; it may also contain a
44 'sep_char' argument to specify how the fields are separated.
46 The table should have witnesses arranged in columns, with the witness sigla
47 in the first row. Empty cells are interpreted as omissions (and thus
48 stemmatologically relevant.) Longer lacunae in the text, to be disregarded
49 in cladistic analysis, may be represented by filling the appropriate cells
50 with the tag '#LACUNA#'.
52 If a witness name ends in the collation's ac_label, it will be treated as
53 an 'ante-correction' version of the 'main' witness whose sigil it shares.
58 binmode STDOUT, ":utf8";
59 binmode STDERR, ":utf8";
60 eval { no warnings; binmode $DB::OUT, ":utf8"; };
62 my $csv = 't/data/florilegium.csv';
63 my $t = Text::Tradition->new(
70 is( ref( $t ), 'Text::Tradition', "Parsed florilegium CSV file" );
72 ### TODO Check these figures
74 is( scalar $t->collation->readings, 311, "Collation has all readings" );
75 is( scalar $t->collation->paths, 361, "Collation has all paths" );
76 is( scalar $t->witnesses, 13, "Collation has all witnesses" );
79 # Check that we have the right witnesses
81 map { $seen_wits{$_} = 0 } qw/ A B C D E F G H K P Q S T /;
82 foreach my $wit ( $t->witnesses ) {
83 $seen_wits{$wit->sigil} = 1;
85 is( scalar keys %seen_wits, 13, "No extra witnesses were made" );
86 foreach my $k ( keys %seen_wits ) {
87 ok( $seen_wits{$k}, "Witness $k still exists" );
90 # Check that the witnesses have the right texts
91 foreach my $wit ( $t->witnesses ) {
92 my $origtext = join( ' ', @{$wit->text} );
93 my $graphtext = $t->collation->path_text( $wit->sigil );
94 is( $graphtext, $origtext, "Collation matches original for witness " . $wit->sigil );
97 # Check that the a.c. witnesses have the right text
98 map { $seen_wits{$_} = 0 } qw/ A B C D F G H K S /;
99 foreach my $k ( keys %seen_wits ) {
100 my $wit = $t->witness( $k );
101 if( $seen_wits{$k} ) {
102 ok( $wit->is_layered, "Witness $k got marked as layered" );
103 ok( $wit->has_layertext, "Witness $k has an a.c. version" );
104 my $origtext = join( ' ', @{$wit->layertext} );
105 my $acsig = $wit->sigil . $t->collation->ac_label;
106 my $graphtext = $t->collation->path_text( $acsig );
107 is( $graphtext, $origtext, "Collation matches original a.c. for witness $k" );
109 ok( !$wit->is_layered, "Witness $k not marked as layered" );
110 ok( !$wit->has_layertext, "Witness $k has no a.c. version" );
114 # Check that we only have collation relationships where we need them
115 is( scalar $t->collation->relationships, 3, "Redundant collations were removed" );
117 ## Check excel parsing
119 my $xls = 't/data/armexample.xls';
120 my $xt = Text::Tradition->new(
121 'name' => 'excel test',
122 'input' => 'Tabular',
127 is( ref( $xt ), 'Text::Tradition', "Parsed test Excel 97-2004 file" );
129 map { $xls_wits{$_} = 0 } qw/ Wit1 Wit2 Wit3 /;
130 foreach my $wit ( $xt->witnesses ) {
131 $xls_wits{$wit->sigil} = 1;
133 is( scalar keys %xls_wits, 3, "No extra witnesses were made" );
134 foreach my $k ( keys %xls_wits ) {
135 ok( $xls_wits{$k}, "Witness $k still exists" );
137 is( scalar $xt->collation->readings, 11, "Got correct number of test readings" );
138 is( scalar $xt->collation->paths, 13, "Got correct number of reading paths" );
139 is( $xt->collation->reading('r5.1')->text, "\x{587}",
140 "Correct decoding of at least one reading" );
142 my $xlsx = 't/data/armexample.xlsx';
143 my $xtx = Text::Tradition->new(
144 'name' => 'excel test',
145 'input' => 'Tabular',
150 is( ref( $xtx ), 'Text::Tradition', "Parsed test Excel 2007+ file" );
152 map { $xlsx_wits{$_} = 0 } qw/ Wit1 Wit2 Wit3 /;
153 foreach my $wit ( $xtx->witnesses ) {
154 $xlsx_wits{$wit->sigil} = 1;
156 is( scalar keys %xlsx_wits, 3, "No extra witnesses were made" );
157 foreach my $k ( keys %xlsx_wits ) {
158 ok( $xlsx_wits{$k}, "Witness $k still exists" );
160 is( scalar $xtx->collation->readings, 12, "Got correct number of test readings" );
161 is( scalar $xtx->collation->paths, 14, "Got correct number of reading paths" );
162 is( $xtx->collation->reading('r5.1')->text, "\x{587}",
163 "Correct decoding of at least one reading" );
170 my( $tradition, $opts ) = @_;
171 my $alignment_table = _table_from_input( $opts );
172 # Set up the witnesses we find in the first line
174 my %ac_wits; # Track layered witness -> main witness mapping
175 my $c = $tradition->collation; # shorthand
176 my $aclabel = $c->ac_label;
177 foreach my $sigil ( @{$alignment_table->[0]} ) {
178 if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) {
179 # Sanitize the sigil name to an XML name
180 $sigil = $1 . '_layered';
181 $ac_wits{$sigil} = $1;
183 my $wit = $tradition->add_witness(
184 'sigil' => $sigil, 'sourcetype' => 'collation' );
185 $wit->path( [ $c->start ] );
186 push( @witnesses, $wit );
187 my $aclabel = $c->ac_label;
190 # Save the original witness text sequences. Have to loop back through
191 # the witness columns after we have identified all the a.c. witnesses.
192 foreach my $idx ( 0 .. $#{$alignment_table->[0]} ) {
193 my @sequence = map { $_->[$idx] } @{$alignment_table};
194 my $sigil = shift @sequence;
195 my $is_layer = exists( $ac_wits{$sigil} );
196 my $wit = $tradition->witness( $is_layer ? $ac_wits{$sigil} : $sigil );
197 # Now get rid of gaps and meta-readings like #LACUNA#
198 my @words = grep { $_ && $_ !~ /^\#.*\#$/ } @sequence;
199 $is_layer ? $wit->layertext( \@words ) : $wit->text( \@words );
202 my $nocollate = ( scalar( @witnesses ) * scalar @$alignment_table ) > 150000;
203 print STDERR "Tradition too big for row collation\n" if $nocollate;
205 # Now for the next rows, make nodes as necessary, assign their ranks, and
206 # add them to the witness paths.
207 foreach my $idx ( 1 .. $#{$alignment_table} ) {
208 my $row = $alignment_table->[$idx];
209 my $nodes = _make_nodes( $c, $row, $idx, $nocollate );
210 foreach my $w ( 0 .. $#{$row} ) {
211 # push the appropriate node onto the appropriate witness path
212 my $word = $row->[$w];
214 my $reading = $nodes->{$word};
215 my $wit = $witnesses[$w];
216 push( @{$wit->path}, $reading );
217 } # else skip it for empty readings.
221 # Collapse our lacunae into a single node and
222 # push the end node onto all paths.
223 $c->end->rank( scalar @$alignment_table );
224 foreach my $wit ( @witnesses ) {
226 my $last_rdg = shift @$p;
227 my $new_p = [ $last_rdg ];
228 foreach my $rdg ( @$p ) {
229 # Omit the reading if we are in a lacuna already.
230 next if $rdg->is_lacuna && $last_rdg->is_lacuna;
231 # Save the reading otherwise.
232 push( @$new_p, $rdg );
235 push( @$new_p, $c->end );
236 $wit->path( $new_p );
239 # Fold any a.c. witnesses into their main witness objects, and
240 # delete the independent a.c. versions.
241 foreach my $a ( keys %ac_wits ) {
242 my $ac_wit = $tradition->witness( $a );
243 my $main_wit = $tradition->witness( $ac_wits{$a} );
244 next unless $main_wit;
245 $main_wit->is_layered(1);
246 $main_wit->uncorrected_path( $ac_wit->path );
247 $tradition->del_witness( $ac_wit );
251 $c->make_witness_paths;
252 # Delete our unused lacuna nodes.
253 foreach my $rdg ( grep { $_->is_lacuna } $c->readings ) {
254 $c->del_reading( $rdg ) unless $c->reading_witnesses( $rdg );
257 # Do a consistency check.
258 foreach my $wit ( $tradition->witnesses ) {
259 my $pathtext = $c->path_text( $wit->sigil );
260 my $origtext = join( ' ', @{$wit->text} );
261 warn "Text differs for witness " . $wit->sigil
262 unless $pathtext eq $origtext;
263 if( $wit->is_layered ) {
264 $pathtext = $c->path_text( $wit->sigil.$c->ac_label );
265 $origtext = join( ' ', @{$wit->layertext} );
266 warn "Ante-corr text differs for witness " . $wit->sigil
267 unless $pathtext eq $origtext;
269 warn "Text " . $wit->sigil . " has a layered text but is not marked as layered"
270 if $wit->has_layertext;
274 # Note that our ranks and common readings are set.
275 $c->_graphcalc_done(1);
276 # Remove redundant collation relationships.
277 $c->relations->filter_collations() unless $nocollate;
280 sub _table_from_input {
282 my $alignment_table = [];
283 if( $opts->{'excel'} ) {
286 unless( exists $opts->{'file'} ) {
287 throw( "Must pass the filename for Excel parsing" );
289 if( $opts->{'excel'} eq 'xls' ) {
291 require Spreadsheet::ParseExcel;
293 throw( "Need module Spreadsheet::ParseExcel to parse .xls files" );
295 my $parser = Spreadsheet::ParseExcel->new();
296 my $workbook = $parser->parse( $opts->{'file'} );
297 unless( defined $workbook && defined $workbook->worksheet(0) ) {
298 throw( "Failed to parse file " . $opts->{'file'} . ": " . $parser->error() );
300 $sheet = $workbook->worksheet(0);
301 } elsif( $opts->{'excel'} eq 'xlsx' ) {
303 require Spreadsheet::XLSX;
305 throw( "Need module Spreadsheet::XLSX to parse .xlsx files" );
310 $workbook = Spreadsheet::XLSX->new( $opts->{'file'} );
312 throw( "Failed to parse file " . $opts->{'file'} );
314 $sheet = $workbook->worksheet(0);
316 throw( "Unrecognized Excel variant" . $opts->{'excel'} );
318 $alignment_table = _alignment_from_worksheet( $sheet, $need_decode );
320 # Assume it is a comma-, tab-, or whatever-separated format.
321 my $csv_options = { 'binary' => 1 };
322 $csv_options->{'sep_char'} = $opts->{'sep_char'} || "\t";
323 if( $csv_options->{'sep_char'} eq "\t" ) {
324 # If it is really tab separated, nothing is an escape char.
325 $csv_options->{'quote_char'} = undef;
326 $csv_options->{'escape_char'} = undef;
328 my $csv = Text::CSV->new( $csv_options );
330 if( exists $opts->{'string' } ) {
331 my @lines = split( "\n", $opts->{'string'} );
332 foreach my $l ( @lines ) {
333 my $status = $csv->parse( $l );
335 push( @$alignment_table, [ $csv->fields ] );
337 throw( "Could not parse line $l: " . $csv->error_input );
340 } elsif( exists $opts->{'file'} ) {
341 open( my $fh, $opts->{'file'} )
342 or warn "Could not open input file " . $opts->{'file'};
343 binmode( $fh, ':utf8' );
344 while( my $row = $csv->getline( $fh ) ) {
345 push( @$alignment_table, $row );
349 throw( "Could not find string or file option to parse" );
352 return $alignment_table;
354 sub _alignment_from_worksheet {
355 my( $sheet, $decode ) = @_;
356 my $alignment_table = [];
358 my( $rmin, $rmax ) = $sheet->row_range();
359 my( $cmin, $cmax ) = $sheet->col_range();
360 unless( $cmax && $rmax ) {
361 throw( "Found no rows or no columns in first worksheet" );
363 # Populate the alignment table. We only want columns that have
364 # a sigil in row zero.
366 push( @$alignment_table, [] );
367 foreach my $col ( $cmin .. $cmax ) {
368 my $cell = $sheet->get_cell( $rmin, $col );
369 my $cellval = $cell ? $cell->value() : undef;
372 push( @{$alignment_table->[0]}, $cellval );
375 # Now go through the rest of the rows and pick up the columns
376 # that were headed by a sigil.
377 foreach my $row ( $rmin+1 .. $rmax ) {
379 foreach my $col ( $cmin .. $cmax ) {
380 next unless $sigcols{$col};
381 my $cell = $sheet->get_cell( $row, $col );
384 $cellval = $decode ? decode_utf8( $cell->value ) : $cell->value;
386 push( @tablerow, $cellval );
388 push( @$alignment_table, \@tablerow );
390 return $alignment_table;
394 my( $collation, $row, $index, $nocollate ) = @_;
396 my $commonctr = 0; # Holds the number of unique readings + gaps, ex. lacunae.
397 foreach my $w ( @$row ) {
398 $unique{$w} = 1 if $w;
399 $commonctr +=1 unless ( $w && $w eq '#LACUNA#' );
402 foreach my $w ( keys %unique ) {
404 'id' => "r$index.$ctr",
408 if( $w eq '#LACUNA#' ) {
409 $rargs->{'is_lacuna'} = 1;
410 } elsif( $commonctr == 1 ) {
411 $rargs->{'is_common'} = 1;
413 my $r = $collation->add_reading( $rargs );
417 # Collate this sequence of readings via a single 'collation' relationship.
418 unless( $nocollate ) {
419 my @rankrdgs = values %unique;
422 my $r = shift @rankrdgs;
424 foreach my $nr ( @rankrdgs ) {
425 next if $nr->is_meta;
426 if( $collation_rel ) {
427 $collation->add_relationship( $r, $nr, $collation_rel );
429 $collation->add_relationship( $r, $nr,
430 { 'type' => 'collated',
431 'annotation' => "Parsed together for rank $index" } );
432 $collation_rel = $collation->get_relationship( $r, $nr );
441 Text::Tradition::Error->throw(
442 'ident' => 'Parser::Tabular error',
451 This package is free software and is provided "as is" without express
452 or implied warranty. You can redistribute it and/or modify it under
453 the same terms as Perl itself.
457 Tara L Andrews E<lt>aurum@cpan.orgE<gt>