base/lib/Text/Tradition/Parser/Tabular.pm

   1 package Text::Tradition::Parser::Tabular;
   2
   3 use strict;
   4 use warnings;
   5 use Encode qw/ decode_utf8 /;
   6 use Text::CSV;
   7 use Text::Tradition::Error;
   8 use TryCatch;
   9
  10 =head1 NAME
  11
  12 Text::Tradition::Parser::Tabular
  13
  14 =head1 SYNOPSIS
  15
  16   use Text::Tradition;
  17
  18   my $t_from_file = Text::Tradition->new(
  19     'name' => 'my text',
  20     'input' => 'Tabular',
  21     'file' => '/path/to/collation.csv',
  22     'sep_char' => ','
  23     );
  24
  25   my $t_from_string = Text::Tradition->new(
  26     'name' => 'my text',
  27     'input' => 'Tabular',
  28     'string' => $tab_separated_collation,
  29     'sep_char' => "\t",
  30     );
  31
  32 =head1 DESCRIPTION
  33
  34 Parser module for Text::Tradition to read an alignment table format, such as CSV.
  35
  36 =head1 METHODS
  37
  38 =head2 B<parse>( $tradition, $option_hash )
  39
  40 Takes an initialized tradition and a set of options; creates the
  41 appropriate nodes and edges on the graph, as well as the appropriate
  42 witness objects.  The $option_hash must contain either a 'file' or a
  43 'string' argument with the table to be parsed; it may also contain a
  44 'sep_char' argument to specify how the fields are separated.
  45
  46 The table should have witnesses arranged in columns, with the witness sigla
  47 in the first row.  Empty cells are interpreted as omissions (and thus
  48 stemmatologically relevant.) Longer lacunae in the text, to be disregarded
  49 in cladistic analysis, may be represented by filling the appropriate cells
  50 with the tag '#LACUNA#'.
  51
  52 If a witness name ends in the collation's ac_label, it will be treated as
  53 an 'ante-correction' version of the 'main' witness whose sigil it shares.
  54
  55 =begin testing
  56
  57 use Text::Tradition;
  58 binmode STDOUT, ":utf8";
  59 binmode STDERR, ":utf8";
  60 eval { no warnings; binmode $DB::OUT, ":utf8"; };
  61
  62 my $csv = 't/data/florilegium.csv';
  63 my $t = Text::Tradition->new(
  64     'name'  => 'inline',
  65     'input' => 'Tabular',
  66     'file'  => $csv,
  67     'sep_char' => ',',
  68     );
  69
  70 is( ref( $t ), 'Text::Tradition', "Parsed florilegium CSV file" );
  71
  72 ### TODO Check these figures
  73 if( $t ) {
  74     is( scalar $t->collation->readings, 311, "Collation has all readings" );
  75     is( scalar $t->collation->paths, 361, "Collation has all paths" );
  76     is( scalar $t->witnesses, 13, "Collation has all witnesses" );
  77 }
  78
  79 # Check that we have the right witnesses
  80 my %seen_wits;
  81 map { $seen_wits{$_} = 0 } qw/ A B C D E F G H K P Q S T /;
  82 foreach my $wit ( $t->witnesses ) {
  83         $seen_wits{$wit->sigil} = 1;
  84 }
  85 is( scalar keys %seen_wits, 13, "No extra witnesses were made" );
  86 foreach my $k ( keys %seen_wits ) {
  87         ok( $seen_wits{$k}, "Witness $k still exists" );
  88 }
  89
  90 # Check that the witnesses have the right texts
  91 foreach my $wit ( $t->witnesses ) {
  92         my $origtext = join( ' ', @{$wit->text} );
  93         my $graphtext = $t->collation->path_text( $wit->sigil );
  94         is( $graphtext, $origtext, "Collation matches original for witness " . $wit->sigil );
  95 }
  96
  97 # Check that the a.c. witnesses have the right text
  98 map { $seen_wits{$_} = 0 } qw/ A B C D F G H K S /;
  99 foreach my $k ( keys %seen_wits ) {
 100         my $wit = $t->witness( $k );
 101         if( $seen_wits{$k} ) {
 102                 ok( $wit->is_layered, "Witness $k got marked as layered" );
 103                 ok( $wit->has_layertext, "Witness $k has an a.c. version" );
 104                 my $origtext = join( ' ', @{$wit->layertext} );
 105                 my $acsig = $wit->sigil . $t->collation->ac_label;
 106                 my $graphtext = $t->collation->path_text( $acsig );
 107                 is( $graphtext, $origtext, "Collation matches original a.c. for witness $k" );
 108         } else {
 109                 ok( !$wit->is_layered, "Witness $k not marked as layered" );
 110                 ok( !$wit->has_layertext, "Witness $k has no a.c. version" );
 111         }
 112 }
 113
 114 # Check that we only have collation relationships where we need them
 115 is( scalar $t->collation->relationships, 3, "Redundant collations were removed" );
 116
 117 ## Check excel parsing
 118
 119 my $xls = 't/data/armexample.xls';
 120 my $xt = Text::Tradition->new(
 121         'name'  => 'excel test',
 122         'input' => 'Tabular',
 123         'file'  => $xls,
 124         'excel'   => 'xls'
 125         );
 126
 127 is( ref( $xt ), 'Text::Tradition', "Parsed test Excel 97-2004 file" );
 128 my %xls_wits;
 129 map { $xls_wits{$_} = 0 } qw/ Wit1 Wit2 Wit3 /;
 130 foreach my $wit ( $xt->witnesses ) {
 131         $xls_wits{$wit->sigil} = 1;
 132 }
 133 is( scalar keys %xls_wits, 3, "No extra witnesses were made" );
 134 foreach my $k ( keys %xls_wits ) {
 135         ok( $xls_wits{$k}, "Witness $k still exists" );
 136 }
 137 is( scalar $xt->collation->readings, 11, "Got correct number of test readings" );
 138 is( scalar $xt->collation->paths, 13, "Got correct number of reading paths" );
 139 is( $xt->collation->reading('r5.1')->text, "\x{587}",
 140         "Correct decoding of at least one reading" );
 141
 142 my $xlsx = 't/data/armexample.xlsx';
 143 my $xtx = Text::Tradition->new(
 144         'name'  => 'excel test',
 145         'input' => 'Tabular',
 146         'file'  => $xlsx,
 147         'excel'   => 'xlsx'
 148         );
 149
 150 is( ref( $xtx ), 'Text::Tradition', "Parsed test Excel 2007+ file" );
 151 my %xlsx_wits;
 152 map { $xlsx_wits{$_} = 0 } qw/ Wit1 Wit2 Wit3 /;
 153 foreach my $wit ( $xtx->witnesses ) {
 154         $xlsx_wits{$wit->sigil} = 1;
 155 }
 156 is( scalar keys %xlsx_wits, 3, "No extra witnesses were made" );
 157 foreach my $k ( keys %xlsx_wits ) {
 158         ok( $xlsx_wits{$k}, "Witness $k still exists" );
 159 }
 160 is( scalar $xtx->collation->readings, 12, "Got correct number of test readings" );
 161 is( scalar $xtx->collation->paths, 14, "Got correct number of reading paths" );
 162 is( $xtx->collation->reading('r5.1')->text, "\x{587}",
 163         "Correct decoding of at least one reading" );
 164
 165 =end testing
 166
 167 =cut
 168
 169 sub parse {
 170     my( $tradition, $opts ) = @_;
 171     my $alignment_table = _table_from_input( $opts );
 172     # Set up the witnesses we find in the first line
 173     my @witnesses;
 174     my %ac_wits;  # Track layered witness -> main witness mapping
 175     my $c = $tradition->collation; # shorthand
 176     my $aclabel = $c->ac_label;
 177     foreach my $sigil ( @{$alignment_table->[0]} ) {
 178         if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) {
 179                 # Sanitize the sigil name to an XML name
 180                 $sigil = $1 . '_layered';
 181             $ac_wits{$sigil} = $1;
 182         }
 183         my $wit = $tradition->add_witness(
 184                 'sigil' => $sigil, 'sourcetype' => 'collation' );
 185         $wit->path( [ $c->start ] );
 186         push( @witnesses, $wit );
 187         my $aclabel = $c->ac_label;
 188     }
 189
 190     # Save the original witness text sequences. Have to loop back through
 191     # the witness columns after we have identified all the a.c. witnesses.
 192     foreach my $idx ( 0 .. $#{$alignment_table->[0]} ) {
 193         my @sequence = map { $_->[$idx] } @{$alignment_table};
 194         my $sigil = shift @sequence;
 195         my $is_layer = exists( $ac_wits{$sigil} );
 196         my $wit = $tradition->witness( $is_layer ? $ac_wits{$sigil} : $sigil );
 197         # Now get rid of gaps and meta-readings like #LACUNA#
 198         my @words = grep { $_ && $_ !~ /^\#.*\#$/ } @sequence;
 199         $is_layer ? $wit->layertext( \@words ) : $wit->text( \@words );
 200     }
 201
 202     my $nocollate = ( scalar( @witnesses ) * scalar @$alignment_table ) > 150000;
 203     print STDERR "Tradition too big for row collation\n" if $nocollate;
 204
 205     # Now for the next rows, make nodes as necessary, assign their ranks, and
 206     # add them to the witness paths.
 207     foreach my $idx ( 1 .. $#{$alignment_table} ) {
 208         my $row = $alignment_table->[$idx];
 209         my $nodes = _make_nodes( $c, $row, $idx, $nocollate );
 210         foreach my $w ( 0 .. $#{$row} ) {
 211             # push the appropriate node onto the appropriate witness path
 212             my $word = $row->[$w];
 213             if( $word ) {
 214                 my $reading = $nodes->{$word};
 215                 my $wit = $witnesses[$w];
 216                 push( @{$wit->path}, $reading );
 217             } # else skip it for empty readings.
 218         }
 219     }
 220
 221     # Collapse our lacunae into a single node and
 222     # push the end node onto all paths.
 223     $c->end->rank( scalar @$alignment_table );
 224     foreach my $wit ( @witnesses ) {
 225         my $p = $wit->path;
 226         my $last_rdg = shift @$p;
 227         my $new_p = [ $last_rdg ];
 228         foreach my $rdg ( @$p ) {
 229                 # Omit the reading if we are in a lacuna already.
 230                 next if $rdg->is_lacuna && $last_rdg->is_lacuna;
 231                         # Save the reading otherwise.
 232                         push( @$new_p, $rdg );
 233                         $last_rdg = $rdg;
 234         }
 235         push( @$new_p, $c->end );
 236         $wit->path( $new_p );
 237     }
 238
 239     # Fold any a.c. witnesses into their main witness objects, and
 240     # delete the independent a.c. versions.
 241     foreach my $a ( keys %ac_wits ) {
 242         my $ac_wit = $tradition->witness( $a );
 243         my $main_wit = $tradition->witness( $ac_wits{$a} );
 244         next unless $main_wit;
 245         $main_wit->is_layered(1);
 246         $main_wit->uncorrected_path( $ac_wit->path );
 247         $tradition->del_witness( $ac_wit );
 248     }
 249
 250     # Join up the paths.
 251     $c->make_witness_paths;
 252     # Delete our unused lacuna nodes.
 253         foreach my $rdg ( grep { $_->is_lacuna } $c->readings ) {
 254                 $c->del_reading( $rdg ) unless $c->reading_witnesses( $rdg );
 255         }
 256
 257         # Do a consistency check.
 258         foreach my $wit ( $tradition->witnesses ) {
 259                 my $pathtext = $c->path_text( $wit->sigil );
 260                 my $origtext = join( ' ', @{$wit->text} );
 261                 warn "Text differs for witness " . $wit->sigil
 262                         unless $pathtext eq $origtext;
 263                 if( $wit->is_layered ) {
 264                         $pathtext = $c->path_text( $wit->sigil.$c->ac_label );
 265                         $origtext = join( ' ', @{$wit->layertext} );
 266                         warn "Ante-corr text differs for witness " . $wit->sigil
 267                                 unless $pathtext eq $origtext;
 268                 } else {
 269                         warn "Text " . $wit->sigil . " has a layered text but is not marked as layered"
 270                                 if $wit->has_layertext;
 271                 }
 272         }
 273
 274         # Note that our ranks and common readings are set.
 275         $c->_graphcalc_done(1);
 276         # Remove redundant collation relationships.
 277         $c->relations->filter_collations() unless $nocollate;
 278 }
 279
 280 sub _table_from_input {
 281         my $opts = shift;
 282         my $alignment_table = [];
 283     if( $opts->{'excel'} ) {
 284         my $sheet;
 285         my $need_decode;
 286                 unless( exists $opts->{'file'} ) {
 287                         throw( "Must pass the filename for Excel parsing" );
 288                 }
 289         if( $opts->{'excel'} eq 'xls' ) {
 290                         try {
 291                                 require Spreadsheet::ParseExcel;
 292                         } catch {
 293                                 throw( "Need module Spreadsheet::ParseExcel to parse .xls files" );
 294                         }
 295                         my $parser = Spreadsheet::ParseExcel->new();
 296                         my $workbook = $parser->parse( $opts->{'file'} );
 297                         unless( defined $workbook && defined $workbook->worksheet(0) ) {
 298                                 throw( "Failed to parse file " . $opts->{'file'} . ": " . $parser->error() );
 299                         }
 300                         $sheet = $workbook->worksheet(0);
 301                 } elsif( $opts->{'excel'} eq 'xlsx' ) {
 302                         try {
 303                                 require Spreadsheet::XLSX;
 304                         } catch {
 305                                 throw( "Need module Spreadsheet::XLSX to parse .xlsx files" );
 306                         }
 307                         $need_decode = 1;
 308                         my $workbook;
 309                         try {
 310                                 $workbook = Spreadsheet::XLSX->new( $opts->{'file'} );
 311                         } catch {
 312                                 throw( "Failed to parse file " . $opts->{'file'} );
 313                         }
 314                         $sheet = $workbook->worksheet(0);
 315                 } else {
 316                         throw( "Unrecognized Excel variant" . $opts->{'excel'} );
 317                 }
 318                 $alignment_table = _alignment_from_worksheet( $sheet, $need_decode );
 319     } else {
 320         # Assume it is a comma-, tab-, or whatever-separated format.
 321                 my $csv_options = { 'binary' => 1 };
 322                 $csv_options->{'sep_char'} = $opts->{'sep_char'} || "\t";
 323                 if( $csv_options->{'sep_char'} eq "\t" ) {
 324                         # If it is really tab separated, nothing is an escape char.
 325                         $csv_options->{'quote_char'} = undef;
 326                         $csv_options->{'escape_char'} = undef;
 327                 }
 328                 my $csv = Text::CSV->new( $csv_options );
 329
 330                 if( exists $opts->{'string' } ) {
 331                         my @lines = split( "\n", $opts->{'string'} );
 332                         foreach my $l ( @lines ) {
 333                                 my $status = $csv->parse( $l );
 334                                 if( $status ) {
 335                                         push( @$alignment_table, [ $csv->fields ] );
 336                                 } else {
 337                                         throw( "Could not parse line $l: " . $csv->error_input );
 338                                 }
 339                         }
 340                 } elsif( exists $opts->{'file'} ) {
 341                         open( my $fh, $opts->{'file'} )
 342                                 or warn "Could not open input file " . $opts->{'file'};
 343                         binmode( $fh, ':utf8' );
 344                         while( my $row = $csv->getline( $fh ) ) {
 345                                 push( @$alignment_table, $row );
 346                         }
 347                         close $fh;
 348                 } else {
 349                         throw( "Could not find string or file option to parse" );
 350                 }
 351         }
 352         return $alignment_table;
 353 }
 354 sub _alignment_from_worksheet {
 355         my( $sheet, $decode ) = @_;
 356         my $alignment_table = [];
 357
 358         my( $rmin, $rmax ) = $sheet->row_range();
 359         my( $cmin, $cmax ) = $sheet->col_range();
 360         unless( $cmax && $rmax ) {
 361                 throw( "Found no rows or no columns in first worksheet" );
 362         }
 363         # Populate the alignment table. We only want columns that have
 364         # a sigil in row zero.
 365         my %sigcols = ();
 366         push( @$alignment_table, [] );
 367         foreach my $col ( $cmin .. $cmax ) {
 368                 my $cell = $sheet->get_cell( $rmin, $col );
 369                 my $cellval = $cell ? $cell->value() : undef;
 370                 if( $cellval ) {
 371                         $sigcols{$col} = 1;
 372                         push( @{$alignment_table->[0]}, $cellval );
 373                 }
 374         }
 375         # Now go through the rest of the rows and pick up the columns
 376         # that were headed by a sigil.
 377         foreach my $row ( $rmin+1 .. $rmax ) {
 378                 my @tablerow;
 379                 foreach my $col ( $cmin .. $cmax ) {
 380                         next unless $sigcols{$col};
 381                         my $cell = $sheet->get_cell( $row, $col );
 382                         my $cellval;
 383                         if( $cell ) {
 384                                 $cellval = $decode ? decode_utf8( $cell->value ) : $cell->value;
 385                         }
 386                         push( @tablerow, $cellval );
 387                 }
 388                 push( @$alignment_table, \@tablerow );
 389         }
 390         return $alignment_table;
 391 }
 392
 393 sub _make_nodes {
 394     my( $collation, $row, $index, $nocollate ) = @_;
 395     my %unique;
 396     my $commonctr = 0; # Holds the number of unique readings + gaps, ex. lacunae.
 397     foreach my $w ( @$row ) {
 398         $unique{$w} = 1 if $w;
 399         $commonctr +=1 unless ( $w && $w eq '#LACUNA#' );
 400     }
 401     my $ctr = 1;
 402     foreach my $w ( keys %unique ) {
 403         my $rargs = {
 404                 'id' => "r$index.$ctr",
 405                 'rank' => $index,
 406                 'text' => $w,
 407                 };
 408         if( $w eq '#LACUNA#' ) {
 409                 $rargs->{'is_lacuna'} = 1;
 410         } elsif( $commonctr == 1 ) {
 411                 $rargs->{'is_common'} = 1;
 412         }
 413         my $r = $collation->add_reading( $rargs );
 414         $unique{$w} = $r;
 415         $ctr++;
 416     }
 417     # Collate this sequence of readings via a single 'collation' relationship.
 418     unless( $nocollate ) {
 419                 my @rankrdgs = values %unique;
 420                 my $collation_rel;
 421                 while( @rankrdgs ) {
 422                         my $r = shift @rankrdgs;
 423                         next if $r->is_meta;
 424                         foreach my $nr ( @rankrdgs ) {
 425                                 next if $nr->is_meta;
 426                                 if( $collation_rel ) {
 427                                         $collation->add_relationship( $r, $nr, $collation_rel );
 428                                 } else {
 429                                         $collation->add_relationship( $r, $nr,
 430                                                 { 'type' => 'collated',
 431                                                   'annotation' => "Parsed together for rank $index" } );
 432                                         $collation_rel = $collation->get_relationship( $r, $nr );
 433                                 }
 434                         }
 435                 }
 436         }
 437     return \%unique;
 438 }
 439
 440 sub throw {
 441         Text::Tradition::Error->throw(
 442                 'ident' => 'Parser::Tabular error',
 443                 'message' => $_[0],
 444                 );
 445 }
 446
 447 1;
 448
 449 =head1 LICENSE
 450
 451 This package is free software and is provided "as is" without express
 452 or implied warranty.  You can redistribute it and/or modify it under
 453 the same terms as Perl itself.
 454
 455 =head1 AUTHOR
 456
 457 Tara L Andrews E<lt>aurum@cpan.orgE<gt>