1 package Text::Tradition::Parser::Tabular;
5 use Encode qw/ decode_utf8 /;
7 use Text::Tradition::Error;
12 Text::Tradition::Parser::Tabular
18 my $t_from_file = Text::Tradition->new(
21 'file' => '/path/to/collation.csv',
25 my $t_from_string = Text::Tradition->new(
28 'string' => $tab_separated_collation,
34 Parser module for Text::Tradition to read an alignment table format, such as
39 =head2 B<parse>( $tradition, $option_hash )
41 Takes an initialized tradition and a set of options; creates the
42 appropriate nodes and edges on the graph, as well as the appropriate
43 witness objects. The $option_hash can contain the following:
47 =item * file - Name of file which contains the data
49 =item * string - A string that itself contains the data. One of 'file' or
52 =item * sep_char - For plaintext formats, the field separation character.
53 Defaults to "\t" (tab-separated); should be "," for comma-separated format.
55 =item * excel - If the data is in an Excel file, this option should be set
56 to 'xls' (for pre-2007 Excel format) or 'xlsx' (for Excel XML format.)
60 The data should comprise a table with witnesses arranged in columns, with
61 the witness sigla in the first row. Empty cells are interpreted as
62 omissions (and thus stemmatologically relevant.) Longer lacunae in the
63 text, to be disregarded in cladistic analysis, may be represented by
64 filling the appropriate cells with the tag '#LACUNA#'.
66 If a witness name ends in the collation's ac_label, it will be treated as
67 an 'ante-correction' version of the 'main' witness whose sigil it shares.
73 binmode STDOUT, ":utf8";
74 binmode STDERR, ":utf8";
75 eval { no warnings; binmode $DB::OUT, ":utf8"; };
77 my $csv = 't/data/florilegium.csv';
78 my $t = Text::Tradition->new(
85 is( ref( $t ), 'Text::Tradition', "Parsed florilegium CSV file" );
87 ### TODO Check these figures
89 is( scalar $t->collation->readings, 311, "Collation has all readings" );
90 is( scalar $t->collation->paths, 361, "Collation has all paths" );
91 is( scalar $t->witnesses, 13, "Collation has all witnesses" );
94 # Check that we have the right witnesses
96 map { $seen_wits{$_} = 0 } qw/ A B C D E F G H K P Q S T /;
97 foreach my $wit ( $t->witnesses ) {
98 $seen_wits{$wit->sigil} = 1;
100 is( scalar keys %seen_wits, 13, "No extra witnesses were made" );
101 foreach my $k ( keys %seen_wits ) {
102 ok( $seen_wits{$k}, "Witness $k still exists" );
105 # Check that the witnesses have the right texts
106 foreach my $wit ( $t->witnesses ) {
107 my $origtext = join( ' ', @{$wit->text} );
108 my $graphtext = $t->collation->path_text( $wit->sigil );
109 is( $graphtext, $origtext, "Collation matches original for witness " . $wit->sigil );
112 # Check that the a.c. witnesses have the right text
113 map { $seen_wits{$_} = 0 } qw/ A B C D F G H K S /;
114 foreach my $k ( keys %seen_wits ) {
115 my $wit = $t->witness( $k );
116 if( $seen_wits{$k} ) {
117 ok( $wit->is_layered, "Witness $k got marked as layered" );
118 ok( $wit->has_layertext, "Witness $k has an a.c. version" );
119 my $origtext = join( ' ', @{$wit->layertext} );
120 my $acsig = $wit->sigil . $t->collation->ac_label;
121 my $graphtext = $t->collation->path_text( $acsig );
122 is( $graphtext, $origtext, "Collation matches original a.c. for witness $k" );
124 ok( !$wit->is_layered, "Witness $k not marked as layered" );
125 ok( !$wit->has_layertext, "Witness $k has no a.c. version" );
129 # Check that we only have collation relationships where we need them
130 is( scalar $t->collation->relationships, 3, "Redundant collations were removed" );
132 ## Check excel parsing
134 my $xls = 't/data/armexample.xls';
135 my $xt = Text::Tradition->new(
136 'name' => 'excel test',
137 'input' => 'Tabular',
142 is( ref( $xt ), 'Text::Tradition', "Parsed test Excel 97-2004 file" );
144 map { $xls_wits{$_} = 0 } qw/ Wit1 Wit2 Wit3 /;
145 foreach my $wit ( $xt->witnesses ) {
146 $xls_wits{$wit->sigil} = 1;
148 is( scalar keys %xls_wits, 3, "No extra witnesses were made" );
149 foreach my $k ( keys %xls_wits ) {
150 ok( $xls_wits{$k}, "Witness $k still exists" );
152 is( scalar $xt->collation->readings, 11, "Got correct number of test readings" );
153 is( scalar $xt->collation->paths, 13, "Got correct number of reading paths" );
154 is( $xt->collation->reading('r5.1')->text, "\x{587}",
155 "Correct decoding of at least one reading" );
157 my $xlsx = 't/data/armexample.xlsx';
158 my $xtx = Text::Tradition->new(
159 'name' => 'excel test',
160 'input' => 'Tabular',
165 is( ref( $xtx ), 'Text::Tradition', "Parsed test Excel 2007+ file" );
167 map { $xlsx_wits{$_} = 0 } qw/ Wit1 Wit3 /;
168 $xlsx_wits{"\x{531}\x{562}2"} = 0;
169 foreach my $wit ( $xtx->witnesses ) {
170 $xlsx_wits{$wit->sigil} = 1;
172 is( scalar keys %xlsx_wits, 3, "No extra witnesses were made" );
173 foreach my $k ( keys %xlsx_wits ) {
174 ok( $xlsx_wits{$k}, "Witness $k still exists" );
176 is( scalar $xtx->collation->readings, 12, "Got correct number of test readings" );
177 is( scalar $xtx->collation->paths, 14, "Got correct number of reading paths" );
178 is( $xtx->collation->reading('r5.1')->text, "\x{587}",
179 "Correct decoding of at least one reading" );
186 my( $tradition, $opts ) = @_;
187 my $alignment_table = _table_from_input( $opts );
188 # Set up the witnesses we find in the first line
190 my %ac_wits; # Track layered witness -> main witness mapping
191 my $c = $tradition->collation; # shorthand
192 my $aclabel = $c->ac_label;
193 foreach my $sigil ( @{$alignment_table->[0]} ) {
194 if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) {
195 # Sanitize the sigil name to an XML name
196 $sigil = $1 . '_layered';
197 $ac_wits{$sigil} = $1;
199 my $wit = $tradition->add_witness(
200 'sigil' => $sigil, 'sourcetype' => 'collation' );
201 $wit->path( [ $c->start ] );
202 push( @witnesses, $wit );
203 my $aclabel = $c->ac_label;
206 # Save the original witness text sequences. Have to loop back through
207 # the witness columns after we have identified all the a.c. witnesses.
208 foreach my $idx ( 0 .. $#{$alignment_table->[0]} ) {
209 my @sequence = map { $_->[$idx] } @{$alignment_table};
210 my $sigil = shift @sequence;
211 my $is_layer = exists( $ac_wits{$sigil} );
212 my $wit = $tradition->witness( $is_layer ? $ac_wits{$sigil} : $sigil );
213 # Now get rid of gaps and meta-readings like #LACUNA#
214 my @words = grep { $_ && $_ !~ /^\#.*\#$/ } @sequence;
215 $is_layer ? $wit->layertext( \@words ) : $wit->text( \@words );
218 my $nocollate = ( scalar( @witnesses ) * scalar @$alignment_table ) > 150000;
219 print STDERR "Tradition too big for row collation\n" if $nocollate;
221 # Now for the next rows, make nodes as necessary, assign their ranks, and
222 # add them to the witness paths.
223 foreach my $idx ( 1 .. $#{$alignment_table} ) {
224 my $row = $alignment_table->[$idx];
225 my $nodes = _make_nodes( $c, $row, $idx, $nocollate );
226 foreach my $w ( 0 .. $#{$row} ) {
227 # push the appropriate node onto the appropriate witness path
228 my $word = $row->[$w];
230 my $reading = $nodes->{$word};
231 my $wit = $witnesses[$w];
232 push( @{$wit->path}, $reading );
233 } # else skip it for empty readings.
237 # Collapse our lacunae into a single node and
238 # push the end node onto all paths.
239 $c->end->rank( scalar @$alignment_table );
240 foreach my $wit ( @witnesses ) {
242 my $last_rdg = shift @$p;
243 my $new_p = [ $last_rdg ];
244 foreach my $rdg ( @$p ) {
245 # Omit the reading if we are in a lacuna already.
246 next if $rdg->is_lacuna && $last_rdg->is_lacuna;
247 # Save the reading otherwise.
248 push( @$new_p, $rdg );
251 push( @$new_p, $c->end );
252 $wit->path( $new_p );
255 # Fold any a.c. witnesses into their main witness objects, and
256 # delete the independent a.c. versions.
257 foreach my $a ( keys %ac_wits ) {
258 my $ac_wit = $tradition->witness( $a );
259 my $main_wit = $tradition->witness( $ac_wits{$a} );
260 next unless $main_wit;
261 $main_wit->is_layered(1);
262 $main_wit->uncorrected_path( $ac_wit->path );
263 $tradition->del_witness( $ac_wit );
267 $c->make_witness_paths;
268 # Delete our unused lacuna nodes.
269 foreach my $rdg ( grep { $_->is_lacuna } $c->readings ) {
270 $c->del_reading( $rdg ) unless $c->reading_witnesses( $rdg );
273 # Do a consistency check.
274 foreach my $wit ( $tradition->witnesses ) {
275 my $pathtext = $c->path_text( $wit->sigil );
276 my $origtext = join( ' ', @{$wit->text} );
277 warn "Text differs for witness " . $wit->sigil
278 unless $pathtext eq $origtext;
279 if( $wit->is_layered ) {
280 $pathtext = $c->path_text( $wit->sigil.$c->ac_label );
281 $origtext = join( ' ', @{$wit->layertext} );
282 warn "Ante-corr text differs for witness " . $wit->sigil
283 unless $pathtext eq $origtext;
285 warn "Text " . $wit->sigil . " has a layered text but is not marked as layered"
286 if $wit->has_layertext;
290 # Note that our ranks and common readings are set.
291 $c->_graphcalc_done(1);
292 _add_collations( $c ) unless $nocollate;
295 sub _table_from_input {
297 my $alignment_table = [];
298 if( $opts->{'excel'} ) {
301 unless( exists $opts->{'file'} ) {
302 throw( "Must pass the filename for Excel parsing" );
304 if( $opts->{'excel'} eq 'xls' ) {
306 require Spreadsheet::ParseExcel;
308 throw( "Need module Spreadsheet::ParseExcel to parse .xls files" );
310 my $parser = Spreadsheet::ParseExcel->new();
311 my $workbook = $parser->parse( $opts->{'file'} );
312 unless( defined $workbook && defined $workbook->worksheet(0) ) {
313 throw( "Failed to parse file " . $opts->{'file'} . ": " . $parser->error() );
315 $sheet = $workbook->worksheet(0);
316 } elsif( $opts->{'excel'} eq 'xlsx' ) {
318 require Spreadsheet::XLSX;
320 throw( "Need module Spreadsheet::XLSX to parse .xlsx files" );
325 $workbook = Spreadsheet::XLSX->new( $opts->{'file'} );
327 throw( "Failed to parse file " . $opts->{'file'} );
329 $sheet = $workbook->worksheet(0);
331 throw( "Unrecognized Excel variant" . $opts->{'excel'} );
333 $alignment_table = _alignment_from_worksheet( $sheet, $need_decode );
335 # Assume it is a comma-, tab-, or whatever-separated format.
336 my $csv_options = { 'binary' => 1 };
337 $csv_options->{'sep_char'} = $opts->{'sep_char'} || "\t";
338 if( $csv_options->{'sep_char'} eq "\t" ) {
339 # If it is really tab separated, nothing is an escape char.
340 $csv_options->{'quote_char'} = undef;
341 $csv_options->{'escape_char'} = undef;
343 my $csv = Text::CSV->new( $csv_options );
345 if( exists $opts->{'string' } ) {
346 my @lines = split( "\n", $opts->{'string'} );
347 foreach my $l ( @lines ) {
348 my $status = $csv->parse( $l );
350 push( @$alignment_table, [ $csv->fields ] );
352 throw( "Could not parse line $l: " . $csv->error_input );
355 } elsif( exists $opts->{'file'} ) {
356 open( my $fh, $opts->{'file'} )
357 or warn "Could not open input file " . $opts->{'file'};
358 binmode( $fh, ':utf8' );
359 while( my $row = $csv->getline( $fh ) ) {
360 push( @$alignment_table, $row );
364 throw( "Could not find string or file option to parse" );
367 return $alignment_table;
369 sub _alignment_from_worksheet {
370 my( $sheet, $decode ) = @_;
371 my $alignment_table = [];
373 my( $rmin, $rmax ) = $sheet->row_range();
374 my( $cmin, $cmax ) = $sheet->col_range();
375 unless( $cmax && $rmax ) {
376 throw( "Found no rows or no columns in first worksheet" );
378 # Populate the alignment table. We only want columns that have
379 # a sigil in row zero.
381 push( @$alignment_table, [] );
382 foreach my $col ( $cmin .. $cmax ) {
383 my $cell = $sheet->get_cell( $rmin, $col );
386 $cellval = $decode ? decode_utf8( $cell->value ) : $cell->value;
390 push( @{$alignment_table->[0]}, $cellval );
393 # Now go through the rest of the rows and pick up the columns
394 # that were headed by a sigil.
395 foreach my $row ( $rmin+1 .. $rmax ) {
397 foreach my $col ( $cmin .. $cmax ) {
398 next unless $sigcols{$col};
399 my $cell = $sheet->get_cell( $row, $col );
402 $cellval = $decode ? decode_utf8( $cell->value ) : $cell->value;
404 push( @tablerow, $cellval );
406 push( @$alignment_table, \@tablerow );
408 return $alignment_table;
412 my( $collation, $row, $index, $nocollate ) = @_;
414 my $commonctr = 0; # Holds the number of unique readings + gaps, ex. lacunae.
415 foreach my $w ( @$row ) {
416 $unique{$w} = 1 if $w;
417 $commonctr +=1 unless ( $w && $w eq '#LACUNA#' );
420 foreach my $w ( keys %unique ) {
422 'id' => "r$index.$ctr",
426 if( $w eq '#LACUNA#' ) {
427 $rargs->{'is_lacuna'} = 1;
428 } elsif( $commonctr == 1 ) {
429 $rargs->{'is_common'} = 1;
431 my $r = $collation->add_reading( $rargs );
438 sub _add_collations {
439 my( $collation ) = shift;
440 # For each reading that needs to be held in place, add a 'collated'
441 # relationship to whatever anchor we can find. An anchor is a reading
442 # that would occupy its rank by virtue of being subsequent to a
443 # reading at $rank-1.
445 foreach my $r ( 1 .. $collation->end->rank - 1 ) {
448 my @here = grep { !$_->is_meta } $collation->readings_at_rank( $r );
449 next unless @here > 1;
450 foreach my $rdg ( @here ) {
452 foreach my $pred ( $rdg->predecessors ) {
453 if( $pred->rank == $r - 1 ) {
455 $anchor = $rdg unless( $anchor );
459 push( @need_weak, $rdg ) unless $ip;
462 ? map { push( @collate_pairs, [ $r, $anchor, $_ ] ) } @need_weak
463 : print STDERR "No anchor found at $r\n";
465 foreach my $p ( @collate_pairs ) {
467 $collation->add_relationship( @$p,
468 { 'type' => 'collated',
469 'annotation' => "Collated together for rank $r" } )
470 unless $collation->get_relationship( @$p )
475 Text::Tradition::Error->throw(
476 'ident' => 'Parser::Tabular error',
485 This package is free software and is provided "as is" without express
486 or implied warranty. You can redistribute it and/or modify it under
487 the same terms as Perl itself.
491 Tara L Andrews E<lt>aurum@cpan.orgE<gt>