From: Tara L Andrews Date: Fri, 31 Aug 2012 23:39:38 +0000 (+0200) Subject: Add simple Excel 97-2004 spreadsheet parsing X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=701ad2ba49e59b6f25aaf59b942397e398b7c909;p=scpubgit%2Fstemmatology.git Add simple Excel 97-2004 spreadsheet parsing --- diff --git a/lib/Text/Tradition/Parser/Tabular.pm b/lib/Text/Tradition/Parser/Tabular.pm index a9ce519..3909a5f 100644 --- a/lib/Text/Tradition/Parser/Tabular.pm +++ b/lib/Text/Tradition/Parser/Tabular.pm @@ -3,6 +3,8 @@ package Text::Tradition::Parser::Tabular; use strict; use warnings; use Text::CSV; +use Text::Tradition::Error; +use TryCatch; =head1 NAME @@ -111,6 +113,31 @@ foreach my $k ( keys %seen_wits ) { # Check that we only have collation relationships where we need them is( scalar $t->collation->relationships, 3, "Redundant collations were removed" ); +## Check excel parsing + +my $xls = 't/data/armexample.xls'; +my $xt = Text::Tradition->new( + 'name' => 'excel test', + 'input' => 'Tabular', + 'file' => $xls, + 'xls' => 1 + ); + +is( ref( $xt ), 'Text::Tradition', "Parsed test Excel 97-2004 file" ); +my %xls_wits; +map { $xls_wits{$_} = 0 } qw/ Wit1 Wit2 Wit3 /; +foreach my $wit ( $xt->witnesses ) { + $xls_wits{$wit->sigil} = 1; +} +is( scalar keys %xls_wits, 3, "No extra witnesses were made" ); +foreach my $k ( keys %xls_wits ) { + ok( $xls_wits{$k}, "Witness $k still exists" ); +} +is( scalar $xt->collation->readings, 11, "Got correct number of test readings" ); +is( scalar $xt->collation->paths, 13, "Got correct number of reading paths" ); +is( $xt->collation->reading('r5.1')->text, "\x{587}", + "Correct decoding of at least one reading" ); + =end testing =cut @@ -118,39 +145,86 @@ is( scalar $t->collation->relationships, 3, "Redundant collations were removed" sub parse { my( $tradition, $opts ) = @_; my $c = $tradition->collation; # shorthand - my $csv_options = { 'binary' => 1 }; - $csv_options->{'sep_char'} = $opts->{'sep_char'} || "\t"; - if( $csv_options->{'sep_char'} eq "\t" ) { - # If it is really tab separated, nothing is an escape char. - $csv_options->{'quote_char'} = undef; - $csv_options->{'escape_char'} = undef; - } - my $csv = Text::CSV->new( $csv_options ); - my $alignment_table; - if( exists $opts->{'string' } ) { - my @lines = split( "\n", $opts->{'string'} ); - foreach my $l ( @lines ) { - my $status = $csv->parse( $l ); - if( $status ) { - push( @$alignment_table, [ $csv->fields ] ); - } else { - warn "Could not parse line $l: " . $csv->error_input; - } - } - } elsif( exists $opts->{'file'} ) { - open( my $fh, $opts->{'file'} ) - or warn "Could not open input file " . $opts->{'file'}; - binmode( $fh, ':utf8' ); - while( my $row = $csv->getline( $fh ) ) { - push( @$alignment_table, $row ); - } - close $fh; + if( $opts->{'xls'} ) { + try { + require Spreadsheet::ParseExcel; + } catch { + throw( "Need module Spreadsheet::ParseExcel to parse .xls files" ); + } + unless( exists $opts->{'file'} ) { + throw( "Must pass the filename for Excel parsing" ); + } + my $parser = Spreadsheet::ParseExcel->new(); + my $workbook = $parser->parse( $opts->{'file'} ); + unless( defined $workbook && defined $workbook->worksheet(0) ) { + throw( "Failed to parse file " . $opts->{'file'} . ": " . $parser->error() ); + } + # Use the first worksheet + my $sheet = $workbook->worksheet(0); + my( $rmin, $rmax ) = $sheet->row_range(); + my( $cmin, $cmax ) = $sheet->col_range(); + unless( $cmax && $rmax ) { + throw( "Found no rows or no columns in first worksheet" ); + } + # Populate the alignment table. We only want columns that have + # a sigil in row zero. + my %sigcols = (); + push( @$alignment_table, [] ); + foreach my $col ( $cmin .. $cmax ) { + my $cell = $sheet->get_cell( $rmin, $col ); + my $cellval = $cell ? $cell->value() : undef; + if( $cellval ) { + $sigcols{$col} = 1; + push( @{$alignment_table->[0]}, $cellval ); + } + } + # Now go through the rest of the rows and pick up the columns + # that were headed by a sigil. + foreach my $row ( $rmin+1 .. $rmax ) { + my @tablerow; + foreach my $col ( $cmin .. $cmax ) { + next unless $sigcols{$col}; + my $cell = $sheet->get_cell( $row, $col ); + my $cellval = $cell ? $cell->value() : undef; + push( @tablerow, $cell ? $cell->value() : undef ); + } + push( @$alignment_table, \@tablerow ); + } } else { - warn "Could not find string or file option to parse"; - return; - } - + # Assume it is a comma-, tab-, or whatever-separated format. + my $csv_options = { 'binary' => 1 }; + $csv_options->{'sep_char'} = $opts->{'sep_char'} || "\t"; + if( $csv_options->{'sep_char'} eq "\t" ) { + # If it is really tab separated, nothing is an escape char. + $csv_options->{'quote_char'} = undef; + $csv_options->{'escape_char'} = undef; + } + my $csv = Text::CSV->new( $csv_options ); + + if( exists $opts->{'string' } ) { + my @lines = split( "\n", $opts->{'string'} ); + foreach my $l ( @lines ) { + my $status = $csv->parse( $l ); + if( $status ) { + push( @$alignment_table, [ $csv->fields ] ); + } else { + warn "Could not parse line $l: " . $csv->error_input; + } + } + } elsif( exists $opts->{'file'} ) { + open( my $fh, $opts->{'file'} ) + or warn "Could not open input file " . $opts->{'file'}; + binmode( $fh, ':utf8' ); + while( my $row = $csv->getline( $fh ) ) { + push( @$alignment_table, $row ); + } + close $fh; + } else { + warn "Could not find string or file option to parse"; + return; + } + } # Set up the witnesses we find in the first line my @witnesses; my %ac_wits; # Track layered witness -> main witness mapping diff --git a/t/data/armexample.xls b/t/data/armexample.xls new file mode 100644 index 0000000..87e6a22 Binary files /dev/null and b/t/data/armexample.xls differ diff --git a/t/text_tradition_parser_tabular.t b/t/text_tradition_parser_tabular.t index c53e57e..efb052d 100644 --- a/t/text_tradition_parser_tabular.t +++ b/t/text_tradition_parser_tabular.t @@ -67,6 +67,31 @@ foreach my $k ( keys %seen_wits ) { # Check that we only have collation relationships where we need them is( scalar $t->collation->relationships, 3, "Redundant collations were removed" ); + +## Check excel parsing + +my $xls = 't/data/armexample.xls'; +my $xt = Text::Tradition->new( + 'name' => 'excel test', + 'input' => 'Tabular', + 'file' => $xls, + 'xls' => 1 + ); + +is( ref( $xt ), 'Text::Tradition', "Parsed test Excel 97-2004 file" ); +my %xls_wits; +map { $xls_wits{$_} = 0 } qw/ Wit1 Wit2 Wit3 /; +foreach my $wit ( $xt->witnesses ) { + $xls_wits{$wit->sigil} = 1; +} +is( scalar keys %xls_wits, 3, "No extra witnesses were made" ); +foreach my $k ( keys %xls_wits ) { + ok( $xls_wits{$k}, "Witness $k still exists" ); +} +is( scalar $xt->collation->readings, 11, "Got correct number of test readings" ); +is( scalar $xt->collation->paths, 13, "Got correct number of reading paths" ); +is( $xt->collation->reading('r5.1')->text, "\x{587}", + "Correct decoding of at least one reading" ); }