1 package Text::Tradition::Parser::Tabular;
5 use Encode qw/ decode_utf8 /;
7 use Text::Tradition::Error;
12 Text::Tradition::Parser::Tabular
18 my $t_from_file = Text::Tradition->new(
21 'file' => '/path/to/collation.csv',
25 my $t_from_string = Text::Tradition->new(
28 'string' => $tab_separated_collation,
34 Parser module for Text::Tradition to read an alignment table format, such as
39 =head2 B<parse>( $tradition, $option_hash )
41 Takes an initialized tradition and a set of options; creates the
42 appropriate nodes and edges on the graph, as well as the appropriate
43 witness objects. The $option_hash can contain the following:
47 =item * file - Name of file which contains the data
49 =item * string - A string that itself contains the data. One of 'file' or
52 =item * sep_char - For plaintext formats, the field separation character.
53 Defaults to "\t" (tab-separated); should be "," for comma-separated format.
55 =item * excel - If the data is in an Excel file, this option should be set
56 to 'xls' (for pre-2007 Excel format) or 'xlsx' (for Excel XML format.)
60 The data should comprise a table with witnesses arranged in columns, with
61 the witness sigla in the first row. Empty cells are interpreted as
62 omissions (and thus stemmatologically relevant.) Longer lacunae in the
63 text, to be disregarded in cladistic analysis, may be represented by
64 filling the appropriate cells with the tag '#LACUNA#'.
66 If a witness name ends in the collation's ac_label, it will be treated as
67 an 'ante-correction' version of the 'main' witness whose sigil it shares.
72 binmode STDOUT, ":utf8";
73 binmode STDERR, ":utf8";
74 eval { no warnings; binmode $DB::OUT, ":utf8"; };
76 my $csv = 't/data/florilegium.csv';
77 my $t = Text::Tradition->new(
84 is( ref( $t ), 'Text::Tradition', "Parsed florilegium CSV file" );
86 ### TODO Check these figures
88 is( scalar $t->collation->readings, 311, "Collation has all readings" );
89 is( scalar $t->collation->paths, 361, "Collation has all paths" );
90 is( scalar $t->witnesses, 13, "Collation has all witnesses" );
93 # Check that we have the right witnesses
95 map { $seen_wits{$_} = 0 } qw/ A B C D E F G H K P Q S T /;
96 foreach my $wit ( $t->witnesses ) {
97 $seen_wits{$wit->sigil} = 1;
99 is( scalar keys %seen_wits, 13, "No extra witnesses were made" );
100 foreach my $k ( keys %seen_wits ) {
101 ok( $seen_wits{$k}, "Witness $k still exists" );
104 # Check that the witnesses have the right texts
105 foreach my $wit ( $t->witnesses ) {
106 my $origtext = join( ' ', @{$wit->text} );
107 my $graphtext = $t->collation->path_text( $wit->sigil );
108 is( $graphtext, $origtext, "Collation matches original for witness " . $wit->sigil );
111 # Check that the a.c. witnesses have the right text
112 map { $seen_wits{$_} = 0 } qw/ A B C D F G H K S /;
113 foreach my $k ( keys %seen_wits ) {
114 my $wit = $t->witness( $k );
115 if( $seen_wits{$k} ) {
116 ok( $wit->is_layered, "Witness $k got marked as layered" );
117 ok( $wit->has_layertext, "Witness $k has an a.c. version" );
118 my $origtext = join( ' ', @{$wit->layertext} );
119 my $acsig = $wit->sigil . $t->collation->ac_label;
120 my $graphtext = $t->collation->path_text( $acsig );
121 is( $graphtext, $origtext, "Collation matches original a.c. for witness $k" );
123 ok( !$wit->is_layered, "Witness $k not marked as layered" );
124 ok( !$wit->has_layertext, "Witness $k has no a.c. version" );
128 # Check that we only have collation relationships where we need them
129 is( scalar $t->collation->relationships, 3, "Redundant collations were removed" );
131 ## Check excel parsing
133 my $xls = 't/data/armexample.xls';
134 my $xt = Text::Tradition->new(
135 'name' => 'excel test',
136 'input' => 'Tabular',
141 is( ref( $xt ), 'Text::Tradition', "Parsed test Excel 97-2004 file" );
143 map { $xls_wits{$_} = 0 } qw/ Wit1 Wit2 Wit3 /;
144 foreach my $wit ( $xt->witnesses ) {
145 $xls_wits{$wit->sigil} = 1;
147 is( scalar keys %xls_wits, 3, "No extra witnesses were made" );
148 foreach my $k ( keys %xls_wits ) {
149 ok( $xls_wits{$k}, "Witness $k still exists" );
151 is( scalar $xt->collation->readings, 11, "Got correct number of test readings" );
152 is( scalar $xt->collation->paths, 13, "Got correct number of reading paths" );
153 is( $xt->collation->reading('r5.1')->text, "\x{587}",
154 "Correct decoding of at least one reading" );
156 my $xlsx = 't/data/armexample.xlsx';
157 my $xtx = Text::Tradition->new(
158 'name' => 'excel test',
159 'input' => 'Tabular',
164 is( ref( $xtx ), 'Text::Tradition', "Parsed test Excel 2007+ file" );
166 map { $xlsx_wits{$_} = 0 } qw/ Wit1 Wit2 Wit3 /;
167 foreach my $wit ( $xtx->witnesses ) {
168 $xlsx_wits{$wit->sigil} = 1;
170 is( scalar keys %xlsx_wits, 3, "No extra witnesses were made" );
171 foreach my $k ( keys %xlsx_wits ) {
172 ok( $xlsx_wits{$k}, "Witness $k still exists" );
174 is( scalar $xtx->collation->readings, 12, "Got correct number of test readings" );
175 is( scalar $xtx->collation->paths, 14, "Got correct number of reading paths" );
176 is( $xtx->collation->reading('r5.1')->text, "\x{587}",
177 "Correct decoding of at least one reading" );
184 my( $tradition, $opts ) = @_;
185 my $alignment_table = _table_from_input( $opts );
186 # Set up the witnesses we find in the first line
188 my %ac_wits; # Track layered witness -> main witness mapping
189 my $c = $tradition->collation; # shorthand
190 my $aclabel = $c->ac_label;
191 foreach my $sigil ( @{$alignment_table->[0]} ) {
192 if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) {
193 # Sanitize the sigil name to an XML name
194 $sigil = $1 . '_layered';
195 $ac_wits{$sigil} = $1;
197 my $wit = $tradition->add_witness(
198 'sigil' => $sigil, 'sourcetype' => 'collation' );
199 $wit->path( [ $c->start ] );
200 push( @witnesses, $wit );
201 my $aclabel = $c->ac_label;
204 # Save the original witness text sequences. Have to loop back through
205 # the witness columns after we have identified all the a.c. witnesses.
206 foreach my $idx ( 0 .. $#{$alignment_table->[0]} ) {
207 my @sequence = map { $_->[$idx] } @{$alignment_table};
208 my $sigil = shift @sequence;
209 my $is_layer = exists( $ac_wits{$sigil} );
210 my $wit = $tradition->witness( $is_layer ? $ac_wits{$sigil} : $sigil );
211 # Now get rid of gaps and meta-readings like #LACUNA#
212 my @words = grep { $_ && $_ !~ /^\#.*\#$/ } @sequence;
213 $is_layer ? $wit->layertext( \@words ) : $wit->text( \@words );
216 my $nocollate = ( scalar( @witnesses ) * scalar @$alignment_table ) > 150000;
217 print STDERR "Tradition too big for row collation\n" if $nocollate;
219 # Now for the next rows, make nodes as necessary, assign their ranks, and
220 # add them to the witness paths.
221 foreach my $idx ( 1 .. $#{$alignment_table} ) {
222 my $row = $alignment_table->[$idx];
223 my $nodes = _make_nodes( $c, $row, $idx, $nocollate );
224 foreach my $w ( 0 .. $#{$row} ) {
225 # push the appropriate node onto the appropriate witness path
226 my $word = $row->[$w];
228 my $reading = $nodes->{$word};
229 my $wit = $witnesses[$w];
230 push( @{$wit->path}, $reading );
231 } # else skip it for empty readings.
235 # Collapse our lacunae into a single node and
236 # push the end node onto all paths.
237 $c->end->rank( scalar @$alignment_table );
238 foreach my $wit ( @witnesses ) {
240 my $last_rdg = shift @$p;
241 my $new_p = [ $last_rdg ];
242 foreach my $rdg ( @$p ) {
243 # Omit the reading if we are in a lacuna already.
244 next if $rdg->is_lacuna && $last_rdg->is_lacuna;
245 # Save the reading otherwise.
246 push( @$new_p, $rdg );
249 push( @$new_p, $c->end );
250 $wit->path( $new_p );
253 # Fold any a.c. witnesses into their main witness objects, and
254 # delete the independent a.c. versions.
255 foreach my $a ( keys %ac_wits ) {
256 my $ac_wit = $tradition->witness( $a );
257 my $main_wit = $tradition->witness( $ac_wits{$a} );
258 next unless $main_wit;
259 $main_wit->is_layered(1);
260 $main_wit->uncorrected_path( $ac_wit->path );
261 $tradition->del_witness( $ac_wit );
265 $c->make_witness_paths;
266 # Delete our unused lacuna nodes.
267 foreach my $rdg ( grep { $_->is_lacuna } $c->readings ) {
268 $c->del_reading( $rdg ) unless $c->reading_witnesses( $rdg );
271 # Do a consistency check.
272 foreach my $wit ( $tradition->witnesses ) {
273 my $pathtext = $c->path_text( $wit->sigil );
274 my $origtext = join( ' ', @{$wit->text} );
275 warn "Text differs for witness " . $wit->sigil
276 unless $pathtext eq $origtext;
277 if( $wit->is_layered ) {
278 $pathtext = $c->path_text( $wit->sigil.$c->ac_label );
279 $origtext = join( ' ', @{$wit->layertext} );
280 warn "Ante-corr text differs for witness " . $wit->sigil
281 unless $pathtext eq $origtext;
283 warn "Text " . $wit->sigil . " has a layered text but is not marked as layered"
284 if $wit->has_layertext;
288 # Note that our ranks and common readings are set.
289 $c->_graphcalc_done(1);
290 # Remove redundant collation relationships.
291 $c->relations->filter_collations() unless $nocollate;
294 sub _table_from_input {
296 my $alignment_table = [];
297 if( $opts->{'excel'} ) {
300 unless( exists $opts->{'file'} ) {
301 throw( "Must pass the filename for Excel parsing" );
303 if( $opts->{'excel'} eq 'xls' ) {
305 require Spreadsheet::ParseExcel;
307 throw( "Need module Spreadsheet::ParseExcel to parse .xls files" );
309 my $parser = Spreadsheet::ParseExcel->new();
310 my $workbook = $parser->parse( $opts->{'file'} );
311 unless( defined $workbook && defined $workbook->worksheet(0) ) {
312 throw( "Failed to parse file " . $opts->{'file'} . ": " . $parser->error() );
314 $sheet = $workbook->worksheet(0);
315 } elsif( $opts->{'excel'} eq 'xlsx' ) {
317 require Spreadsheet::XLSX;
319 throw( "Need module Spreadsheet::XLSX to parse .xlsx files" );
324 $workbook = Spreadsheet::XLSX->new( $opts->{'file'} );
326 throw( "Failed to parse file " . $opts->{'file'} );
328 $sheet = $workbook->worksheet(0);
330 throw( "Unrecognized Excel variant" . $opts->{'excel'} );
332 $alignment_table = _alignment_from_worksheet( $sheet, $need_decode );
334 # Assume it is a comma-, tab-, or whatever-separated format.
335 my $csv_options = { 'binary' => 1 };
336 $csv_options->{'sep_char'} = $opts->{'sep_char'} || "\t";
337 if( $csv_options->{'sep_char'} eq "\t" ) {
338 # If it is really tab separated, nothing is an escape char.
339 $csv_options->{'quote_char'} = undef;
340 $csv_options->{'escape_char'} = undef;
342 my $csv = Text::CSV->new( $csv_options );
344 if( exists $opts->{'string' } ) {
345 my @lines = split( "\n", $opts->{'string'} );
346 foreach my $l ( @lines ) {
347 my $status = $csv->parse( $l );
349 push( @$alignment_table, [ $csv->fields ] );
351 throw( "Could not parse line $l: " . $csv->error_input );
354 } elsif( exists $opts->{'file'} ) {
355 open( my $fh, $opts->{'file'} )
356 or warn "Could not open input file " . $opts->{'file'};
357 binmode( $fh, ':utf8' );
358 while( my $row = $csv->getline( $fh ) ) {
359 push( @$alignment_table, $row );
363 throw( "Could not find string or file option to parse" );
366 return $alignment_table;
368 sub _alignment_from_worksheet {
369 my( $sheet, $decode ) = @_;
370 my $alignment_table = [];
372 my( $rmin, $rmax ) = $sheet->row_range();
373 my( $cmin, $cmax ) = $sheet->col_range();
374 unless( $cmax && $rmax ) {
375 throw( "Found no rows or no columns in first worksheet" );
377 # Populate the alignment table. We only want columns that have
378 # a sigil in row zero.
380 push( @$alignment_table, [] );
381 foreach my $col ( $cmin .. $cmax ) {
382 my $cell = $sheet->get_cell( $rmin, $col );
383 my $cellval = $cell ? $cell->value() : undef;
386 push( @{$alignment_table->[0]}, $cellval );
389 # Now go through the rest of the rows and pick up the columns
390 # that were headed by a sigil.
391 foreach my $row ( $rmin+1 .. $rmax ) {
393 foreach my $col ( $cmin .. $cmax ) {
394 next unless $sigcols{$col};
395 my $cell = $sheet->get_cell( $row, $col );
398 $cellval = $decode ? decode_utf8( $cell->value ) : $cell->value;
400 push( @tablerow, $cellval );
402 push( @$alignment_table, \@tablerow );
404 return $alignment_table;
408 my( $collation, $row, $index, $nocollate ) = @_;
410 my $commonctr = 0; # Holds the number of unique readings + gaps, ex. lacunae.
411 foreach my $w ( @$row ) {
412 $unique{$w} = 1 if $w;
413 $commonctr +=1 unless ( $w && $w eq '#LACUNA#' );
416 foreach my $w ( keys %unique ) {
418 'id' => "r$index.$ctr",
422 if( $w eq '#LACUNA#' ) {
423 $rargs->{'is_lacuna'} = 1;
424 } elsif( $commonctr == 1 ) {
425 $rargs->{'is_common'} = 1;
427 my $r = $collation->add_reading( $rargs );
431 # Collate this sequence of readings via a single 'collation' relationship.
432 unless( $nocollate ) {
433 my @rankrdgs = values %unique;
436 my $r = shift @rankrdgs;
438 foreach my $nr ( @rankrdgs ) {
439 next if $nr->is_meta;
440 if( $collation_rel ) {
441 $collation->add_relationship( $r, $nr, $collation_rel );
443 $collation->add_relationship( $r, $nr,
444 { 'type' => 'collated',
445 'annotation' => "Parsed together for rank $index" } );
446 $collation_rel = $collation->get_relationship( $r, $nr );
455 Text::Tradition::Error->throw(
456 'ident' => 'Parser::Tabular error',
465 This package is free software and is provided "as is" without express
466 or implied warranty. You can redistribute it and/or modify it under
467 the same terms as Perl itself.
471 Tara L Andrews E<lt>aurum@cpan.orgE<gt>