base/lib/Text/Tradition/Parser/Tabular.pm

   1 package Text::Tradition::Parser::Tabular;
   2
   3 use strict;
   4 use warnings;
   5 use Encode qw/ decode_utf8 /;
   6 use Text::CSV;
   7 use Text::Tradition::Error;
   8 use TryCatch;
   9
  10 =head1 NAME
  11
  12 Text::Tradition::Parser::Tabular
  13
  14 =head1 SYNOPSIS
  15
  16   use Text::Tradition;
  17
  18   my $t_from_file = Text::Tradition->new(
  19     'name' => 'my text',
  20     'input' => 'Tabular',
  21     'file' => '/path/to/collation.csv',
  22     'sep_char' => ','
  23     );
  24
  25   my $t_from_string = Text::Tradition->new(
  26     'name' => 'my text',
  27     'input' => 'Tabular',
  28     'string' => $tab_separated_collation,
  29     'sep_char' => "\t",
  30     );
  31
  32 =head1 DESCRIPTION
  33
  34 Parser module for Text::Tradition to read an alignment table format, such as
  35 CSV or Excel.
  36
  37 =head1 METHODS
  38
  39 =head2 B<parse>( $tradition, $option_hash )
  40
  41 Takes an initialized tradition and a set of options; creates the
  42 appropriate nodes and edges on the graph, as well as the appropriate
  43 witness objects.  The $option_hash can contain the following:
  44
  45 =over
  46
  47 =item * file - Name of file which contains the data
  48
  49 =item * string - A string that itself contains the data. One of 'file' or
  50 'string' is required.
  51
  52 =item * sep_char - For plaintext formats, the field separation character.
  53 Defaults to "\t" (tab-separated); should be "," for comma-separated format.
  54
  55 =item * excel - If the data is in an Excel file, this option should be set
  56 to 'xls' (for pre-2007 Excel format) or 'xlsx' (for Excel XML format.)
  57
  58 =back
  59
  60 The data should comprise a table with witnesses arranged in columns, with
  61 the witness sigla in the first row.  Empty cells are interpreted as
  62 omissions (and thus stemmatologically relevant.) Longer lacunae in the
  63 text, to be disregarded in cladistic analysis, may be represented by
  64 filling the appropriate cells with the tag '#LACUNA#'.
  65
  66 If a witness name ends in the collation's ac_label, it will be treated as
  67 an 'ante-correction' version of the 'main' witness whose sigil it shares.
  68
  69 =begin testing
  70
  71 use Test::More::UTF8;
  72 use Text::Tradition;
  73 binmode STDOUT, ":utf8";
  74 binmode STDERR, ":utf8";
  75 eval { no warnings; binmode $DB::OUT, ":utf8"; };
  76
  77 my $csv = 't/data/florilegium.csv';
  78 my $t = Text::Tradition->new(
  79     'name'  => 'inline',
  80     'input' => 'Tabular',
  81     'file'  => $csv,
  82     'sep_char' => ',',
  83     );
  84
  85 is( ref( $t ), 'Text::Tradition', "Parsed florilegium CSV file" );
  86
  87 ### TODO Check these figures
  88 if( $t ) {
  89     is( scalar $t->collation->readings, 311, "Collation has all readings" );
  90     is( scalar $t->collation->paths, 361, "Collation has all paths" );
  91     is( scalar $t->witnesses, 13, "Collation has all witnesses" );
  92 }
  93
  94 # Check that we have the right witnesses
  95 my %seen_wits;
  96 map { $seen_wits{$_} = 0 } qw/ A B C D E F G H K P Q S T /;
  97 foreach my $wit ( $t->witnesses ) {
  98         $seen_wits{$wit->sigil} = 1;
  99 }
 100 is( scalar keys %seen_wits, 13, "No extra witnesses were made" );
 101 foreach my $k ( keys %seen_wits ) {
 102         ok( $seen_wits{$k}, "Witness $k still exists" );
 103 }
 104
 105 # Check that the witnesses have the right texts
 106 foreach my $wit ( $t->witnesses ) {
 107         my $origtext = join( ' ', @{$wit->text} );
 108         my $graphtext = $t->collation->path_text( $wit->sigil );
 109         is( $graphtext, $origtext, "Collation matches original for witness " . $wit->sigil );
 110 }
 111
 112 # Check that the a.c. witnesses have the right text
 113 map { $seen_wits{$_} = 0 } qw/ A B C D F G H K S /;
 114 foreach my $k ( keys %seen_wits ) {
 115         my $wit = $t->witness( $k );
 116         if( $seen_wits{$k} ) {
 117                 ok( $wit->is_layered, "Witness $k got marked as layered" );
 118                 ok( $wit->has_layertext, "Witness $k has an a.c. version" );
 119                 my $origtext = join( ' ', @{$wit->layertext} );
 120                 my $acsig = $wit->sigil . $t->collation->ac_label;
 121                 my $graphtext = $t->collation->path_text( $acsig );
 122                 is( $graphtext, $origtext, "Collation matches original a.c. for witness $k" );
 123         } else {
 124                 ok( !$wit->is_layered, "Witness $k not marked as layered" );
 125                 ok( !$wit->has_layertext, "Witness $k has no a.c. version" );
 126         }
 127 }
 128
 129 # Check that we only have collation relationships where we need them
 130 is( scalar $t->collation->relationships, 3, "Redundant collations were removed" );
 131
 132 ## Check excel parsing
 133
 134 my $xls = 't/data/armexample.xls';
 135 my $xt = Text::Tradition->new(
 136         'name'  => 'excel test',
 137         'input' => 'Tabular',
 138         'file'  => $xls,
 139         'excel'   => 'xls'
 140         );
 141
 142 is( ref( $xt ), 'Text::Tradition', "Parsed test Excel 97-2004 file" );
 143 my %xls_wits;
 144 map { $xls_wits{$_} = 0 } qw/ Wit1 Wit2 Wit3 /;
 145 foreach my $wit ( $xt->witnesses ) {
 146         $xls_wits{$wit->sigil} = 1;
 147 }
 148 is( scalar keys %xls_wits, 3, "No extra witnesses were made" );
 149 foreach my $k ( keys %xls_wits ) {
 150         ok( $xls_wits{$k}, "Witness $k still exists" );
 151 }
 152 is( scalar $xt->collation->readings, 11, "Got correct number of test readings" );
 153 is( scalar $xt->collation->paths, 13, "Got correct number of reading paths" );
 154 is( $xt->collation->reading('r5.1')->text, "\x{587}",
 155         "Correct decoding of at least one reading" );
 156
 157 my $xlsx = 't/data/armexample.xlsx';
 158 my $xtx = Text::Tradition->new(
 159         'name'  => 'excel test',
 160         'input' => 'Tabular',
 161         'file'  => $xlsx,
 162         'excel'   => 'xlsx'
 163         );
 164
 165 is( ref( $xtx ), 'Text::Tradition', "Parsed test Excel 2007+ file" );
 166 my %xlsx_wits;
 167 map { $xlsx_wits{$_} = 0 } qw/ Wit1 Wit3 /;
 168 $xlsx_wits{"\x{531}\x{562}2"} = 0;
 169 foreach my $wit ( $xtx->witnesses ) {
 170         $xlsx_wits{$wit->sigil} = 1;
 171 }
 172 is( scalar keys %xlsx_wits, 3, "No extra witnesses were made" );
 173 foreach my $k ( keys %xlsx_wits ) {
 174         ok( $xlsx_wits{$k}, "Witness $k still exists" );
 175 }
 176 is( scalar $xtx->collation->readings, 12, "Got correct number of test readings" );
 177 is( scalar $xtx->collation->paths, 14, "Got correct number of reading paths" );
 178 is( $xtx->collation->reading('r5.1')->text, "\x{587}",
 179         "Correct decoding of at least one reading" );
 180
 181 =end testing
 182
 183 =cut
 184
 185 sub parse {
 186     my( $tradition, $opts ) = @_;
 187     my $alignment_table = _table_from_input( $opts );
 188     # Set up the witnesses we find in the first line
 189     my @witnesses;
 190     my %ac_wits;  # Track layered witness -> main witness mapping
 191     my $c = $tradition->collation; # shorthand
 192     my $aclabel = $c->ac_label;
 193     foreach my $sigil ( @{$alignment_table->[0]} ) {
 194         if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) {
 195                 # Sanitize the sigil name to an XML name
 196                 $sigil = $1 . '_layered';
 197             $ac_wits{$sigil} = $1;
 198         }
 199         my $wit = $tradition->add_witness(
 200                 'sigil' => $sigil, 'sourcetype' => 'collation' );
 201         $wit->path( [ $c->start ] );
 202         push( @witnesses, $wit );
 203         my $aclabel = $c->ac_label;
 204     }
 205
 206     # Save the original witness text sequences. Have to loop back through
 207     # the witness columns after we have identified all the a.c. witnesses.
 208     foreach my $idx ( 0 .. $#{$alignment_table->[0]} ) {
 209         my @sequence = map { $_->[$idx] } @{$alignment_table};
 210         my $sigil = shift @sequence;
 211         my $is_layer = exists( $ac_wits{$sigil} );
 212         my $wit = $tradition->witness( $is_layer ? $ac_wits{$sigil} : $sigil );
 213         # Now get rid of gaps and meta-readings like #LACUNA#
 214         my @words = grep { $_ && $_ !~ /^\#.*\#$/ } @sequence;
 215         $is_layer ? $wit->layertext( \@words ) : $wit->text( \@words );
 216     }
 217
 218     my $nocollate = ( scalar( @witnesses ) * scalar @$alignment_table ) > 150000;
 219     print STDERR "Tradition too big for row collation\n" if $nocollate;
 220
 221     # Now for the next rows, make nodes as necessary, assign their ranks, and
 222     # add them to the witness paths.
 223     foreach my $idx ( 1 .. $#{$alignment_table} ) {
 224         my $row = $alignment_table->[$idx];
 225         my $nodes = _make_nodes( $c, $row, $idx, $nocollate );
 226         foreach my $w ( 0 .. $#{$row} ) {
 227             # push the appropriate node onto the appropriate witness path
 228             my $word = $row->[$w];
 229             if( $word ) {
 230                 my $reading = $nodes->{$word};
 231                 my $wit = $witnesses[$w];
 232                 push( @{$wit->path}, $reading );
 233             } # else skip it for empty readings.
 234         }
 235     }
 236
 237     # Collapse our lacunae into a single node and
 238     # push the end node onto all paths.
 239     $c->end->rank( scalar @$alignment_table );
 240     foreach my $wit ( @witnesses ) {
 241         my $p = $wit->path;
 242         my $last_rdg = shift @$p;
 243         my $new_p = [ $last_rdg ];
 244         foreach my $rdg ( @$p ) {
 245                 # Omit the reading if we are in a lacuna already.
 246                 next if $rdg->is_lacuna && $last_rdg->is_lacuna;
 247                         # Save the reading otherwise.
 248                         push( @$new_p, $rdg );
 249                         $last_rdg = $rdg;
 250         }
 251         push( @$new_p, $c->end );
 252         $wit->path( $new_p );
 253     }
 254
 255     # Fold any a.c. witnesses into their main witness objects, and
 256     # delete the independent a.c. versions.
 257     foreach my $a ( keys %ac_wits ) {
 258         my $ac_wit = $tradition->witness( $a );
 259         my $main_wit = $tradition->witness( $ac_wits{$a} );
 260         next unless $main_wit;
 261         $main_wit->is_layered(1);
 262         $main_wit->uncorrected_path( $ac_wit->path );
 263         $tradition->del_witness( $ac_wit );
 264     }
 265
 266     # Join up the paths.
 267     $c->make_witness_paths;
 268     # Delete our unused lacuna nodes.
 269         foreach my $rdg ( grep { $_->is_lacuna } $c->readings ) {
 270                 $c->del_reading( $rdg ) unless $c->reading_witnesses( $rdg );
 271         }
 272
 273         # Do a consistency check.
 274         foreach my $wit ( $tradition->witnesses ) {
 275                 my $pathtext = $c->path_text( $wit->sigil );
 276                 my $origtext = join( ' ', @{$wit->text} );
 277                 warn "Text differs for witness " . $wit->sigil
 278                         unless $pathtext eq $origtext;
 279                 if( $wit->is_layered ) {
 280                         $pathtext = $c->path_text( $wit->sigil.$c->ac_label );
 281                         $origtext = join( ' ', @{$wit->layertext} );
 282                         warn "Ante-corr text differs for witness " . $wit->sigil
 283                                 unless $pathtext eq $origtext;
 284                 } else {
 285                         warn "Text " . $wit->sigil . " has a layered text but is not marked as layered"
 286                                 if $wit->has_layertext;
 287                 }
 288         }
 289
 290         # Note that our ranks and common readings are set.
 291         $c->_graphcalc_done(1);
 292         _add_collations( $c ) unless $nocollate;
 293 }
 294
 295 sub _table_from_input {
 296         my $opts = shift;
 297         my $alignment_table = [];
 298     if( $opts->{'excel'} ) {
 299         my $sheet;
 300         my $need_decode;
 301                 unless( exists $opts->{'file'} ) {
 302                         throw( "Must pass the filename for Excel parsing" );
 303                 }
 304         if( $opts->{'excel'} eq 'xls' ) {
 305                         try {
 306                                 require Spreadsheet::ParseExcel;
 307                         } catch {
 308                                 throw( "Need module Spreadsheet::ParseExcel to parse .xls files" );
 309                         }
 310                         my $parser = Spreadsheet::ParseExcel->new();
 311                         my $workbook = $parser->parse( $opts->{'file'} );
 312                         unless( defined $workbook && defined $workbook->worksheet(0) ) {
 313                                 throw( "Failed to parse file " . $opts->{'file'} . ": " . $parser->error() );
 314                         }
 315                         $sheet = $workbook->worksheet(0);
 316                 } elsif( $opts->{'excel'} eq 'xlsx' ) {
 317                         try {
 318                                 require Spreadsheet::XLSX;
 319                         } catch {
 320                                 throw( "Need module Spreadsheet::XLSX to parse .xlsx files" );
 321                         }
 322                         $need_decode = 1;
 323                         my $workbook;
 324                         try {
 325                                 $workbook = Spreadsheet::XLSX->new( $opts->{'file'} );
 326                         } catch {
 327                                 throw( "Failed to parse file " . $opts->{'file'} );
 328                         }
 329                         $sheet = $workbook->worksheet(0);
 330                 } else {
 331                         throw( "Unrecognized Excel variant" . $opts->{'excel'} );
 332                 }
 333                 $alignment_table = _alignment_from_worksheet( $sheet, $need_decode );
 334     } else {
 335         # Assume it is a comma-, tab-, or whatever-separated format.
 336                 my $csv_options = { 'binary' => 1 };
 337                 $csv_options->{'sep_char'} = $opts->{'sep_char'} || "\t";
 338                 if( $csv_options->{'sep_char'} eq "\t" ) {
 339                         # If it is really tab separated, nothing is an escape char.
 340                         $csv_options->{'quote_char'} = undef;
 341                         $csv_options->{'escape_char'} = undef;
 342                 }
 343                 my $csv = Text::CSV->new( $csv_options );
 344
 345                 if( exists $opts->{'string' } ) {
 346                         my @lines = split( "\n", $opts->{'string'} );
 347                         foreach my $l ( @lines ) {
 348                                 my $status = $csv->parse( $l );
 349                                 if( $status ) {
 350                                         push( @$alignment_table, [ $csv->fields ] );
 351                                 } else {
 352                                         throw( "Could not parse line $l: " . $csv->error_input );
 353                                 }
 354                         }
 355                 } elsif( exists $opts->{'file'} ) {
 356                         open( my $fh, $opts->{'file'} )
 357                                 or warn "Could not open input file " . $opts->{'file'};
 358                         binmode( $fh, ':utf8' );
 359                         while( my $row = $csv->getline( $fh ) ) {
 360                                 push( @$alignment_table, $row );
 361                         }
 362                         close $fh;
 363                 } else {
 364                         throw( "Could not find string or file option to parse" );
 365                 }
 366         }
 367         return $alignment_table;
 368 }
 369 sub _alignment_from_worksheet {
 370         my( $sheet, $decode ) = @_;
 371         my $alignment_table = [];
 372
 373         my( $rmin, $rmax ) = $sheet->row_range();
 374         my( $cmin, $cmax ) = $sheet->col_range();
 375         unless( $cmax && $rmax ) {
 376                 throw( "Found no rows or no columns in first worksheet" );
 377         }
 378         # Populate the alignment table. We only want columns that have
 379         # a sigil in row zero.
 380         my %sigcols = ();
 381         push( @$alignment_table, [] );
 382         foreach my $col ( $cmin .. $cmax ) {
 383                 my $cell = $sheet->get_cell( $rmin, $col );
 384                 my $cellval;
 385                 if( $cell ) {
 386                         $cellval = $decode ? decode_utf8( $cell->value ) : $cell->value;
 387                 }
 388                 if( $cellval ) {
 389                         $sigcols{$col} = 1;
 390                         push( @{$alignment_table->[0]}, $cellval );
 391                 }
 392         }
 393         # Now go through the rest of the rows and pick up the columns
 394         # that were headed by a sigil.
 395         foreach my $row ( $rmin+1 .. $rmax ) {
 396                 my @tablerow;
 397                 foreach my $col ( $cmin .. $cmax ) {
 398                         next unless $sigcols{$col};
 399                         my $cell = $sheet->get_cell( $row, $col );
 400                         my $cellval;
 401                         if( $cell ) {
 402                                 $cellval = $decode ? decode_utf8( $cell->value ) : $cell->value;
 403                         }
 404                         push( @tablerow, $cellval );
 405                 }
 406                 push( @$alignment_table, \@tablerow );
 407         }
 408         return $alignment_table;
 409 }
 410
 411 sub _make_nodes {
 412     my( $collation, $row, $index, $nocollate ) = @_;
 413     my %unique;
 414     my $commonctr = 0; # Holds the number of unique readings + gaps, ex. lacunae.
 415     foreach my $w ( @$row ) {
 416         $unique{$w} = 1 if $w;
 417         $commonctr +=1 unless ( $w && $w eq '#LACUNA#' );
 418     }
 419     my $ctr = 1;
 420     foreach my $w ( keys %unique ) {
 421         my $rargs = {
 422                 'id' => "r$index.$ctr",
 423                 'rank' => $index,
 424                 'text' => $w,
 425                 };
 426         if( $w eq '#LACUNA#' ) {
 427                 $rargs->{'is_lacuna'} = 1;
 428         } elsif( $commonctr == 1 ) {
 429                 $rargs->{'is_common'} = 1;
 430         }
 431         my $r = $collation->add_reading( $rargs );
 432         $unique{$w} = $r;
 433         $ctr++;
 434     }
 435     return \%unique;
 436 }
 437
 438 sub _add_collations {
 439         my( $collation ) = shift;
 440         # For each reading that needs to be held in place, add a 'collated'
 441         # relationship to whatever anchor we can find. An anchor is a reading
 442         # that would occupy its rank by virtue of being subsequent to a
 443         # reading at $rank-1.
 444         my @collate_pairs;
 445         foreach my $r ( 1 .. $collation->end->rank - 1 ) {
 446                 my $anchor;
 447                 my @need_weak;
 448                 my @here = grep { !$_->is_meta } $collation->readings_at_rank( $r );
 449                 next unless @here > 1;
 450                 foreach my $rdg ( @here ) {
 451                         my $ip = 0;
 452                         foreach my $pred ( $rdg->predecessors ) {
 453                                 if( $pred->rank == $r - 1 ) {
 454                                         $ip = 1;
 455                                         $anchor = $rdg unless( $anchor );
 456                                         last;
 457                                 }
 458                         }
 459                         push( @need_weak, $rdg ) unless $ip;
 460                 }
 461                 $anchor
 462                         ? map { push( @collate_pairs, [ $r, $anchor, $_ ] ) } @need_weak
 463                         : print STDERR "No anchor found at $r\n";
 464         }
 465         foreach my $p ( @collate_pairs ) {
 466                 my $r = shift @$p;
 467                 $collation->add_relationship( @$p,
 468                         { 'type' => 'collated',
 469                           'annotation' => "Collated together for rank $r" } )
 470                         unless $collation->get_relationship( @$p )
 471         }
 472 }
 473
 474 sub throw {
 475         Text::Tradition::Error->throw(
 476                 'ident' => 'Parser::Tabular error',
 477                 'message' => $_[0],
 478                 );
 479 }
 480
 481 1;
 482
 483 =head1 LICENSE
 484
 485 This package is free software and is provided "as is" without express
 486 or implied warranty.  You can redistribute it and/or modify it under
 487 the same terms as Perl itself.
 488
 489 =head1 AUTHOR
 490
 491 Tara L Andrews E<lt>aurum@cpan.orgE<gt>