base/lib/Text/Tradition/Parser/Tabular.pm

   1 package Text::Tradition::Parser::Tabular;
   2
   3 use strict;
   4 use warnings;
   5 use Encode qw/ decode_utf8 /;
   6 use Text::CSV;
   7 use Text::Tradition::Error;
   8 use TryCatch;
   9
  10 =head1 NAME
  11
  12 Text::Tradition::Parser::Tabular
  13
  14 =head1 SYNOPSIS
  15
  16   use Text::Tradition;
  17
  18   my $t_from_file = Text::Tradition->new(
  19     'name' => 'my text',
  20     'input' => 'Tabular',
  21     'file' => '/path/to/collation.csv',
  22     'sep_char' => ','
  23     );
  24
  25   my $t_from_string = Text::Tradition->new(
  26     'name' => 'my text',
  27     'input' => 'Tabular',
  28     'string' => $tab_separated_collation,
  29     'sep_char' => "\t",
  30     );
  31
  32 =head1 DESCRIPTION
  33
  34 Parser module for Text::Tradition to read an alignment table format, such as
  35 CSV or Excel.
  36
  37 =head1 METHODS
  38
  39 =head2 B<parse>( $tradition, $option_hash )
  40
  41 Takes an initialized tradition and a set of options; creates the
  42 appropriate nodes and edges on the graph, as well as the appropriate
  43 witness objects.  The $option_hash can contain the following:
  44
  45 =over
  46
  47 =item * file - Name of file which contains the data
  48
  49 =item * string - A string that itself contains the data. One of 'file' or
  50 'string' is required.
  51
  52 =item * sep_char - For plaintext formats, the field separation character.
  53 Defaults to "\t" (tab-separated); should be "," for comma-separated format.
  54
  55 =item * excel - If the data is in an Excel file, this option should be set
  56 to 'xls' (for pre-2007 Excel format) or 'xlsx' (for Excel XML format.)
  57
  58 =back
  59
  60 The data should comprise a table with witnesses arranged in columns, with
  61 the witness sigla in the first row.  Empty cells are interpreted as
  62 omissions (and thus stemmatologically relevant.) Longer lacunae in the
  63 text, to be disregarded in cladistic analysis, may be represented by
  64 filling the appropriate cells with the tag '#LACUNA#'.
  65
  66 If a witness name ends in the collation's ac_label, it will be treated as
  67 an 'ante-correction' version of the 'main' witness whose sigil it shares.
  68
  69 =begin testing
  70
  71 use Text::Tradition;
  72 binmode STDOUT, ":utf8";
  73 binmode STDERR, ":utf8";
  74 eval { no warnings; binmode $DB::OUT, ":utf8"; };
  75
  76 my $csv = 't/data/florilegium.csv';
  77 my $t = Text::Tradition->new(
  78     'name'  => 'inline',
  79     'input' => 'Tabular',
  80     'file'  => $csv,
  81     'sep_char' => ',',
  82     );
  83
  84 is( ref( $t ), 'Text::Tradition', "Parsed florilegium CSV file" );
  85
  86 ### TODO Check these figures
  87 if( $t ) {
  88     is( scalar $t->collation->readings, 311, "Collation has all readings" );
  89     is( scalar $t->collation->paths, 361, "Collation has all paths" );
  90     is( scalar $t->witnesses, 13, "Collation has all witnesses" );
  91 }
  92
  93 # Check that we have the right witnesses
  94 my %seen_wits;
  95 map { $seen_wits{$_} = 0 } qw/ A B C D E F G H K P Q S T /;
  96 foreach my $wit ( $t->witnesses ) {
  97         $seen_wits{$wit->sigil} = 1;
  98 }
  99 is( scalar keys %seen_wits, 13, "No extra witnesses were made" );
 100 foreach my $k ( keys %seen_wits ) {
 101         ok( $seen_wits{$k}, "Witness $k still exists" );
 102 }
 103
 104 # Check that the witnesses have the right texts
 105 foreach my $wit ( $t->witnesses ) {
 106         my $origtext = join( ' ', @{$wit->text} );
 107         my $graphtext = $t->collation->path_text( $wit->sigil );
 108         is( $graphtext, $origtext, "Collation matches original for witness " . $wit->sigil );
 109 }
 110
 111 # Check that the a.c. witnesses have the right text
 112 map { $seen_wits{$_} = 0 } qw/ A B C D F G H K S /;
 113 foreach my $k ( keys %seen_wits ) {
 114         my $wit = $t->witness( $k );
 115         if( $seen_wits{$k} ) {
 116                 ok( $wit->is_layered, "Witness $k got marked as layered" );
 117                 ok( $wit->has_layertext, "Witness $k has an a.c. version" );
 118                 my $origtext = join( ' ', @{$wit->layertext} );
 119                 my $acsig = $wit->sigil . $t->collation->ac_label;
 120                 my $graphtext = $t->collation->path_text( $acsig );
 121                 is( $graphtext, $origtext, "Collation matches original a.c. for witness $k" );
 122         } else {
 123                 ok( !$wit->is_layered, "Witness $k not marked as layered" );
 124                 ok( !$wit->has_layertext, "Witness $k has no a.c. version" );
 125         }
 126 }
 127
 128 # Check that we only have collation relationships where we need them
 129 is( scalar $t->collation->relationships, 3, "Redundant collations were removed" );
 130
 131 ## Check excel parsing
 132
 133 my $xls = 't/data/armexample.xls';
 134 my $xt = Text::Tradition->new(
 135         'name'  => 'excel test',
 136         'input' => 'Tabular',
 137         'file'  => $xls,
 138         'excel'   => 'xls'
 139         );
 140
 141 is( ref( $xt ), 'Text::Tradition', "Parsed test Excel 97-2004 file" );
 142 my %xls_wits;
 143 map { $xls_wits{$_} = 0 } qw/ Wit1 Wit2 Wit3 /;
 144 foreach my $wit ( $xt->witnesses ) {
 145         $xls_wits{$wit->sigil} = 1;
 146 }
 147 is( scalar keys %xls_wits, 3, "No extra witnesses were made" );
 148 foreach my $k ( keys %xls_wits ) {
 149         ok( $xls_wits{$k}, "Witness $k still exists" );
 150 }
 151 is( scalar $xt->collation->readings, 11, "Got correct number of test readings" );
 152 is( scalar $xt->collation->paths, 13, "Got correct number of reading paths" );
 153 is( $xt->collation->reading('r5.1')->text, "\x{587}",
 154         "Correct decoding of at least one reading" );
 155
 156 my $xlsx = 't/data/armexample.xlsx';
 157 my $xtx = Text::Tradition->new(
 158         'name'  => 'excel test',
 159         'input' => 'Tabular',
 160         'file'  => $xlsx,
 161         'excel'   => 'xlsx'
 162         );
 163
 164 is( ref( $xtx ), 'Text::Tradition', "Parsed test Excel 2007+ file" );
 165 my %xlsx_wits;
 166 map { $xlsx_wits{$_} = 0 } qw/ Wit1 Wit2 Wit3 /;
 167 foreach my $wit ( $xtx->witnesses ) {
 168         $xlsx_wits{$wit->sigil} = 1;
 169 }
 170 is( scalar keys %xlsx_wits, 3, "No extra witnesses were made" );
 171 foreach my $k ( keys %xlsx_wits ) {
 172         ok( $xlsx_wits{$k}, "Witness $k still exists" );
 173 }
 174 is( scalar $xtx->collation->readings, 12, "Got correct number of test readings" );
 175 is( scalar $xtx->collation->paths, 14, "Got correct number of reading paths" );
 176 is( $xtx->collation->reading('r5.1')->text, "\x{587}",
 177         "Correct decoding of at least one reading" );
 178
 179 =end testing
 180
 181 =cut
 182
 183 sub parse {
 184     my( $tradition, $opts ) = @_;
 185     my $alignment_table = _table_from_input( $opts );
 186     # Set up the witnesses we find in the first line
 187     my @witnesses;
 188     my %ac_wits;  # Track layered witness -> main witness mapping
 189     my $c = $tradition->collation; # shorthand
 190     my $aclabel = $c->ac_label;
 191     foreach my $sigil ( @{$alignment_table->[0]} ) {
 192         if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) {
 193                 # Sanitize the sigil name to an XML name
 194                 $sigil = $1 . '_layered';
 195             $ac_wits{$sigil} = $1;
 196         }
 197         my $wit = $tradition->add_witness(
 198                 'sigil' => $sigil, 'sourcetype' => 'collation' );
 199         $wit->path( [ $c->start ] );
 200         push( @witnesses, $wit );
 201         my $aclabel = $c->ac_label;
 202     }
 203
 204     # Save the original witness text sequences. Have to loop back through
 205     # the witness columns after we have identified all the a.c. witnesses.
 206     foreach my $idx ( 0 .. $#{$alignment_table->[0]} ) {
 207         my @sequence = map { $_->[$idx] } @{$alignment_table};
 208         my $sigil = shift @sequence;
 209         my $is_layer = exists( $ac_wits{$sigil} );
 210         my $wit = $tradition->witness( $is_layer ? $ac_wits{$sigil} : $sigil );
 211         # Now get rid of gaps and meta-readings like #LACUNA#
 212         my @words = grep { $_ && $_ !~ /^\#.*\#$/ } @sequence;
 213         $is_layer ? $wit->layertext( \@words ) : $wit->text( \@words );
 214     }
 215
 216     my $nocollate = ( scalar( @witnesses ) * scalar @$alignment_table ) > 150000;
 217     print STDERR "Tradition too big for row collation\n" if $nocollate;
 218
 219     # Now for the next rows, make nodes as necessary, assign their ranks, and
 220     # add them to the witness paths.
 221     foreach my $idx ( 1 .. $#{$alignment_table} ) {
 222         my $row = $alignment_table->[$idx];
 223         my $nodes = _make_nodes( $c, $row, $idx, $nocollate );
 224         foreach my $w ( 0 .. $#{$row} ) {
 225             # push the appropriate node onto the appropriate witness path
 226             my $word = $row->[$w];
 227             if( $word ) {
 228                 my $reading = $nodes->{$word};
 229                 my $wit = $witnesses[$w];
 230                 push( @{$wit->path}, $reading );
 231             } # else skip it for empty readings.
 232         }
 233     }
 234
 235     # Collapse our lacunae into a single node and
 236     # push the end node onto all paths.
 237     $c->end->rank( scalar @$alignment_table );
 238     foreach my $wit ( @witnesses ) {
 239         my $p = $wit->path;
 240         my $last_rdg = shift @$p;
 241         my $new_p = [ $last_rdg ];
 242         foreach my $rdg ( @$p ) {
 243                 # Omit the reading if we are in a lacuna already.
 244                 next if $rdg->is_lacuna && $last_rdg->is_lacuna;
 245                         # Save the reading otherwise.
 246                         push( @$new_p, $rdg );
 247                         $last_rdg = $rdg;
 248         }
 249         push( @$new_p, $c->end );
 250         $wit->path( $new_p );
 251     }
 252
 253     # Fold any a.c. witnesses into their main witness objects, and
 254     # delete the independent a.c. versions.
 255     foreach my $a ( keys %ac_wits ) {
 256         my $ac_wit = $tradition->witness( $a );
 257         my $main_wit = $tradition->witness( $ac_wits{$a} );
 258         next unless $main_wit;
 259         $main_wit->is_layered(1);
 260         $main_wit->uncorrected_path( $ac_wit->path );
 261         $tradition->del_witness( $ac_wit );
 262     }
 263
 264     # Join up the paths.
 265     $c->make_witness_paths;
 266     # Delete our unused lacuna nodes.
 267         foreach my $rdg ( grep { $_->is_lacuna } $c->readings ) {
 268                 $c->del_reading( $rdg ) unless $c->reading_witnesses( $rdg );
 269         }
 270
 271         # Do a consistency check.
 272         foreach my $wit ( $tradition->witnesses ) {
 273                 my $pathtext = $c->path_text( $wit->sigil );
 274                 my $origtext = join( ' ', @{$wit->text} );
 275                 warn "Text differs for witness " . $wit->sigil
 276                         unless $pathtext eq $origtext;
 277                 if( $wit->is_layered ) {
 278                         $pathtext = $c->path_text( $wit->sigil.$c->ac_label );
 279                         $origtext = join( ' ', @{$wit->layertext} );
 280                         warn "Ante-corr text differs for witness " . $wit->sigil
 281                                 unless $pathtext eq $origtext;
 282                 } else {
 283                         warn "Text " . $wit->sigil . " has a layered text but is not marked as layered"
 284                                 if $wit->has_layertext;
 285                 }
 286         }
 287
 288         # Note that our ranks and common readings are set.
 289         $c->_graphcalc_done(1);
 290         # Remove redundant collation relationships.
 291         $c->relations->filter_collations() unless $nocollate;
 292 }
 293
 294 sub _table_from_input {
 295         my $opts = shift;
 296         my $alignment_table = [];
 297     if( $opts->{'excel'} ) {
 298         my $sheet;
 299         my $need_decode;
 300                 unless( exists $opts->{'file'} ) {
 301                         throw( "Must pass the filename for Excel parsing" );
 302                 }
 303         if( $opts->{'excel'} eq 'xls' ) {
 304                         try {
 305                                 require Spreadsheet::ParseExcel;
 306                         } catch {
 307                                 throw( "Need module Spreadsheet::ParseExcel to parse .xls files" );
 308                         }
 309                         my $parser = Spreadsheet::ParseExcel->new();
 310                         my $workbook = $parser->parse( $opts->{'file'} );
 311                         unless( defined $workbook && defined $workbook->worksheet(0) ) {
 312                                 throw( "Failed to parse file " . $opts->{'file'} . ": " . $parser->error() );
 313                         }
 314                         $sheet = $workbook->worksheet(0);
 315                 } elsif( $opts->{'excel'} eq 'xlsx' ) {
 316                         try {
 317                                 require Spreadsheet::XLSX;
 318                         } catch {
 319                                 throw( "Need module Spreadsheet::XLSX to parse .xlsx files" );
 320                         }
 321                         $need_decode = 1;
 322                         my $workbook;
 323                         try {
 324                                 $workbook = Spreadsheet::XLSX->new( $opts->{'file'} );
 325                         } catch {
 326                                 throw( "Failed to parse file " . $opts->{'file'} );
 327                         }
 328                         $sheet = $workbook->worksheet(0);
 329                 } else {
 330                         throw( "Unrecognized Excel variant" . $opts->{'excel'} );
 331                 }
 332                 $alignment_table = _alignment_from_worksheet( $sheet, $need_decode );
 333     } else {
 334         # Assume it is a comma-, tab-, or whatever-separated format.
 335                 my $csv_options = { 'binary' => 1 };
 336                 $csv_options->{'sep_char'} = $opts->{'sep_char'} || "\t";
 337                 if( $csv_options->{'sep_char'} eq "\t" ) {
 338                         # If it is really tab separated, nothing is an escape char.
 339                         $csv_options->{'quote_char'} = undef;
 340                         $csv_options->{'escape_char'} = undef;
 341                 }
 342                 my $csv = Text::CSV->new( $csv_options );
 343
 344                 if( exists $opts->{'string' } ) {
 345                         my @lines = split( "\n", $opts->{'string'} );
 346                         foreach my $l ( @lines ) {
 347                                 my $status = $csv->parse( $l );
 348                                 if( $status ) {
 349                                         push( @$alignment_table, [ $csv->fields ] );
 350                                 } else {
 351                                         throw( "Could not parse line $l: " . $csv->error_input );
 352                                 }
 353                         }
 354                 } elsif( exists $opts->{'file'} ) {
 355                         open( my $fh, $opts->{'file'} )
 356                                 or warn "Could not open input file " . $opts->{'file'};
 357                         binmode( $fh, ':utf8' );
 358                         while( my $row = $csv->getline( $fh ) ) {
 359                                 push( @$alignment_table, $row );
 360                         }
 361                         close $fh;
 362                 } else {
 363                         throw( "Could not find string or file option to parse" );
 364                 }
 365         }
 366         return $alignment_table;
 367 }
 368 sub _alignment_from_worksheet {
 369         my( $sheet, $decode ) = @_;
 370         my $alignment_table = [];
 371
 372         my( $rmin, $rmax ) = $sheet->row_range();
 373         my( $cmin, $cmax ) = $sheet->col_range();
 374         unless( $cmax && $rmax ) {
 375                 throw( "Found no rows or no columns in first worksheet" );
 376         }
 377         # Populate the alignment table. We only want columns that have
 378         # a sigil in row zero.
 379         my %sigcols = ();
 380         push( @$alignment_table, [] );
 381         foreach my $col ( $cmin .. $cmax ) {
 382                 my $cell = $sheet->get_cell( $rmin, $col );
 383                 my $cellval = $cell ? $cell->value() : undef;
 384                 if( $cellval ) {
 385                         $sigcols{$col} = 1;
 386                         push( @{$alignment_table->[0]}, $cellval );
 387                 }
 388         }
 389         # Now go through the rest of the rows and pick up the columns
 390         # that were headed by a sigil.
 391         foreach my $row ( $rmin+1 .. $rmax ) {
 392                 my @tablerow;
 393                 foreach my $col ( $cmin .. $cmax ) {
 394                         next unless $sigcols{$col};
 395                         my $cell = $sheet->get_cell( $row, $col );
 396                         my $cellval;
 397                         if( $cell ) {
 398                                 $cellval = $decode ? decode_utf8( $cell->value ) : $cell->value;
 399                         }
 400                         push( @tablerow, $cellval );
 401                 }
 402                 push( @$alignment_table, \@tablerow );
 403         }
 404         return $alignment_table;
 405 }
 406
 407 sub _make_nodes {
 408     my( $collation, $row, $index, $nocollate ) = @_;
 409     my %unique;
 410     my $commonctr = 0; # Holds the number of unique readings + gaps, ex. lacunae.
 411     foreach my $w ( @$row ) {
 412         $unique{$w} = 1 if $w;
 413         $commonctr +=1 unless ( $w && $w eq '#LACUNA#' );
 414     }
 415     my $ctr = 1;
 416     foreach my $w ( keys %unique ) {
 417         my $rargs = {
 418                 'id' => "r$index.$ctr",
 419                 'rank' => $index,
 420                 'text' => $w,
 421                 };
 422         if( $w eq '#LACUNA#' ) {
 423                 $rargs->{'is_lacuna'} = 1;
 424         } elsif( $commonctr == 1 ) {
 425                 $rargs->{'is_common'} = 1;
 426         }
 427         my $r = $collation->add_reading( $rargs );
 428         $unique{$w} = $r;
 429         $ctr++;
 430     }
 431     # Collate this sequence of readings via a single 'collation' relationship.
 432     unless( $nocollate ) {
 433                 my @rankrdgs = values %unique;
 434                 my $collation_rel;
 435                 while( @rankrdgs ) {
 436                         my $r = shift @rankrdgs;
 437                         next if $r->is_meta;
 438                         foreach my $nr ( @rankrdgs ) {
 439                                 next if $nr->is_meta;
 440                                 if( $collation_rel ) {
 441                                         $collation->add_relationship( $r, $nr, $collation_rel );
 442                                 } else {
 443                                         $collation->add_relationship( $r, $nr,
 444                                                 { 'type' => 'collated',
 445                                                   'annotation' => "Parsed together for rank $index" } );
 446                                         $collation_rel = $collation->get_relationship( $r, $nr );
 447                                 }
 448                         }
 449                 }
 450         }
 451     return \%unique;
 452 }
 453
 454 sub throw {
 455         Text::Tradition::Error->throw(
 456                 'ident' => 'Parser::Tabular error',
 457                 'message' => $_[0],
 458                 );
 459 }
 460
 461 1;
 462
 463 =head1 LICENSE
 464
 465 This package is free software and is provided "as is" without express
 466 or implied warranty.  You can redistribute it and/or modify it under
 467 the same terms as Perl itself.
 468
 469 =head1 AUTHOR
 470
 471 Tara L Andrews E<lt>aurum@cpan.orgE<gt>