lib/Text/Tradition/Parser/Tabular.pm

   1 package Text::Tradition::Parser::Tabular;
   2
   3 use strict;
   4 use warnings;
   5 use Text::CSV;
   6 use Text::Tradition::Error;
   7 use TryCatch;
   8
   9 =head1 NAME
  10
  11 Text::Tradition::Parser::Tabular
  12
  13 =head1 SYNOPSIS
  14
  15   use Text::Tradition;
  16
  17   my $t_from_file = Text::Tradition->new(
  18     'name' => 'my text',
  19     'input' => 'Tabular',
  20     'file' => '/path/to/collation.csv',
  21     'sep_char' => ','
  22     );
  23
  24   my $t_from_string = Text::Tradition->new(
  25     'name' => 'my text',
  26     'input' => 'Tabular',
  27     'string' => $tab_separated_collation,
  28     'sep_char' => "\t",
  29     );
  30
  31 =head1 DESCRIPTION
  32
  33 Parser module for Text::Tradition to read an alignment table format, such as CSV.
  34
  35 =head1 METHODS
  36
  37 =head2 B<parse>( $tradition, $option_hash )
  38
  39 Takes an initialized tradition and a set of options; creates the
  40 appropriate nodes and edges on the graph, as well as the appropriate
  41 witness objects.  The $option_hash must contain either a 'file' or a
  42 'string' argument with the table to be parsed; it may also contain a
  43 'sep_char' argument to specify how the fields are separated.
  44
  45 The table should have witnesses arranged in columns, with the witness sigla
  46 in the first row.  Empty cells are interpreted as omissions (and thus
  47 stemmatologically relevant.) Longer lacunae in the text, to be disregarded
  48 in cladistic analysis, may be represented by filling the appropriate cells
  49 with the tag '#LACUNA#'.
  50
  51 If a witness name ends in the collation's ac_label, it will be treated as
  52 an 'ante-correction' version of the 'main' witness whose sigil it shares.
  53
  54 =begin testing
  55
  56 use Text::Tradition;
  57 binmode STDOUT, ":utf8";
  58 binmode STDERR, ":utf8";
  59 eval { no warnings; binmode $DB::OUT, ":utf8"; };
  60
  61 my $csv = 't/data/florilegium.csv';
  62 my $t = Text::Tradition->new(
  63     'name'  => 'inline',
  64     'input' => 'Tabular',
  65     'file'  => $csv,
  66     'sep_char' => ',',
  67     );
  68
  69 is( ref( $t ), 'Text::Tradition', "Parsed florilegium CSV file" );
  70
  71 ### TODO Check these figures
  72 if( $t ) {
  73     is( scalar $t->collation->readings, 311, "Collation has all readings" );
  74     is( scalar $t->collation->paths, 361, "Collation has all paths" );
  75     is( scalar $t->witnesses, 13, "Collation has all witnesses" );
  76 }
  77
  78 # Check that we have the right witnesses
  79 my %seen_wits;
  80 map { $seen_wits{$_} = 0 } qw/ A B C D E F G H K P Q S T /;
  81 foreach my $wit ( $t->witnesses ) {
  82         $seen_wits{$wit->sigil} = 1;
  83 }
  84 is( scalar keys %seen_wits, 13, "No extra witnesses were made" );
  85 foreach my $k ( keys %seen_wits ) {
  86         ok( $seen_wits{$k}, "Witness $k still exists" );
  87 }
  88
  89 # Check that the witnesses have the right texts
  90 foreach my $wit ( $t->witnesses ) {
  91         my $origtext = join( ' ', @{$wit->text} );
  92         my $graphtext = $t->collation->path_text( $wit->sigil );
  93         is( $graphtext, $origtext, "Collation matches original for witness " . $wit->sigil );
  94 }
  95
  96 # Check that the a.c. witnesses have the right text
  97 map { $seen_wits{$_} = 0 } qw/ A B C D F G H K S /;
  98 foreach my $k ( keys %seen_wits ) {
  99         my $wit = $t->witness( $k );
 100         if( $seen_wits{$k} ) {
 101                 ok( $wit->is_layered, "Witness $k got marked as layered" );
 102                 ok( $wit->has_layertext, "Witness $k has an a.c. version" );
 103                 my $origtext = join( ' ', @{$wit->layertext} );
 104                 my $acsig = $wit->sigil . $t->collation->ac_label;
 105                 my $graphtext = $t->collation->path_text( $acsig );
 106                 is( $graphtext, $origtext, "Collation matches original a.c. for witness $k" );
 107         } else {
 108                 ok( !$wit->is_layered, "Witness $k not marked as layered" );
 109                 ok( !$wit->has_layertext, "Witness $k has no a.c. version" );
 110         }
 111 }
 112
 113 # Check that we only have collation relationships where we need them
 114 is( scalar $t->collation->relationships, 3, "Redundant collations were removed" );
 115
 116 ## Check excel parsing
 117
 118 my $xls = 't/data/armexample.xls';
 119 my $xt = Text::Tradition->new(
 120         'name'  => 'excel test',
 121         'input' => 'Tabular',
 122         'file'  => $xls,
 123         'xls'   => 1
 124         );
 125
 126 is( ref( $xt ), 'Text::Tradition', "Parsed test Excel 97-2004 file" );
 127 my %xls_wits;
 128 map { $xls_wits{$_} = 0 } qw/ Wit1 Wit2 Wit3 /;
 129 foreach my $wit ( $xt->witnesses ) {
 130         $xls_wits{$wit->sigil} = 1;
 131 }
 132 is( scalar keys %xls_wits, 3, "No extra witnesses were made" );
 133 foreach my $k ( keys %xls_wits ) {
 134         ok( $xls_wits{$k}, "Witness $k still exists" );
 135 }
 136 is( scalar $xt->collation->readings, 11, "Got correct number of test readings" );
 137 is( scalar $xt->collation->paths, 13, "Got correct number of reading paths" );
 138 is( $xt->collation->reading('r5.1')->text, "\x{587}",
 139         "Correct decoding of at least one reading" );
 140
 141 =end testing
 142
 143 =cut
 144
 145 sub parse {
 146     my( $tradition, $opts ) = @_;
 147     my $c = $tradition->collation; # shorthand
 148     my $alignment_table;
 149     if( $opts->{'xls'} ) {
 150         try {
 151                 require Spreadsheet::ParseExcel;
 152         } catch {
 153                 throw( "Need module Spreadsheet::ParseExcel to parse .xls files" );
 154         }
 155         unless( exists $opts->{'file'} ) {
 156                 throw( "Must pass the filename for Excel parsing" );
 157         }
 158         my $parser = Spreadsheet::ParseExcel->new();
 159                 my $workbook = $parser->parse( $opts->{'file'} );
 160                 unless( defined $workbook && defined $workbook->worksheet(0) ) {
 161                         throw( "Failed to parse file " . $opts->{'file'} . ": " . $parser->error() );
 162                 }
 163                 # Use the first worksheet
 164                 my $sheet = $workbook->worksheet(0);
 165                 my( $rmin, $rmax ) = $sheet->row_range();
 166                 my( $cmin, $cmax ) = $sheet->col_range();
 167                 unless( $cmax && $rmax ) {
 168                         throw( "Found no rows or no columns in first worksheet" );
 169                 }
 170                 # Populate the alignment table. We only want columns that have
 171                 # a sigil in row zero.
 172                 my %sigcols = ();
 173                 push( @$alignment_table, [] );
 174                 foreach my $col ( $cmin .. $cmax ) {
 175                         my $cell = $sheet->get_cell( $rmin, $col );
 176                         my $cellval = $cell ? $cell->value() : undef;
 177                         if( $cellval ) {
 178                                 $sigcols{$col} = 1;
 179                                 push( @{$alignment_table->[0]}, $cellval );
 180                         }
 181                 }
 182                 # Now go through the rest of the rows and pick up the columns
 183                 # that were headed by a sigil.
 184                 foreach my $row ( $rmin+1 .. $rmax ) {
 185                         my @tablerow;
 186                         foreach my $col ( $cmin .. $cmax ) {
 187                                 next unless $sigcols{$col};
 188                                 my $cell = $sheet->get_cell( $row, $col );
 189                                 my $cellval = $cell ? $cell->value() : undef;
 190                                 push( @tablerow, $cell ? $cell->value() : undef );
 191                         }
 192                         push( @$alignment_table, \@tablerow );
 193                 }
 194     } else {
 195         # Assume it is a comma-, tab-, or whatever-separated format.
 196                 my $csv_options = { 'binary' => 1 };
 197                 $csv_options->{'sep_char'} = $opts->{'sep_char'} || "\t";
 198                 if( $csv_options->{'sep_char'} eq "\t" ) {
 199                         # If it is really tab separated, nothing is an escape char.
 200                         $csv_options->{'quote_char'} = undef;
 201                         $csv_options->{'escape_char'} = undef;
 202                 }
 203                 my $csv = Text::CSV->new( $csv_options );
 204
 205                 if( exists $opts->{'string' } ) {
 206                         my @lines = split( "\n", $opts->{'string'} );
 207                         foreach my $l ( @lines ) {
 208                                 my $status = $csv->parse( $l );
 209                                 if( $status ) {
 210                                         push( @$alignment_table, [ $csv->fields ] );
 211                                 } else {
 212                                         warn "Could not parse line $l: " . $csv->error_input;
 213                                 }
 214                         }
 215                 } elsif( exists $opts->{'file'} ) {
 216                         open( my $fh, $opts->{'file'} )
 217                                 or warn "Could not open input file " . $opts->{'file'};
 218                         binmode( $fh, ':utf8' );
 219                         while( my $row = $csv->getline( $fh ) ) {
 220                                 push( @$alignment_table, $row );
 221                         }
 222                         close $fh;
 223                 } else {
 224                         warn "Could not find string or file option to parse";
 225                         return;
 226                 }
 227         }
 228     # Set up the witnesses we find in the first line
 229     my @witnesses;
 230     my %ac_wits;  # Track layered witness -> main witness mapping
 231     my $aclabel = $c->ac_label;
 232     foreach my $sigil ( @{$alignment_table->[0]} ) {
 233         if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) {
 234                 # Sanitize the sigil name to an XML name
 235                 $sigil = $1 . '_layered';
 236             $ac_wits{$sigil} = $1;
 237         }
 238         my $wit = $tradition->add_witness(
 239                 'sigil' => $sigil, 'sourcetype' => 'collation' );
 240         $wit->path( [ $c->start ] );
 241         push( @witnesses, $wit );
 242         my $aclabel = $c->ac_label;
 243     }
 244
 245     # Save the original witness text sequences. Have to loop back through
 246     # the witness columns after we have identified all the a.c. witnesses.
 247     foreach my $idx ( 0 .. $#{$alignment_table->[0]} ) {
 248         my @sequence = map { $_->[$idx] } @{$alignment_table};
 249         my $sigil = shift @sequence;
 250         my $is_layer = exists( $ac_wits{$sigil} );
 251         my $wit = $tradition->witness( $is_layer ? $ac_wits{$sigil} : $sigil );
 252         # Now get rid of gaps and meta-readings like #LACUNA#
 253         my @words = grep { $_ && $_ !~ /^\#.*\#$/ } @sequence;
 254         $is_layer ? $wit->layertext( \@words ) : $wit->text( \@words );
 255     }
 256
 257     my $nocollate = ( scalar( @witnesses ) * scalar @$alignment_table ) > 150000;
 258     print STDERR "Tradition too big for row collation\n" if $nocollate;
 259
 260     # Now for the next rows, make nodes as necessary, assign their ranks, and
 261     # add them to the witness paths.
 262     foreach my $idx ( 1 .. $#{$alignment_table} ) {
 263         my $row = $alignment_table->[$idx];
 264         my $nodes = _make_nodes( $c, $row, $idx, $nocollate );
 265         foreach my $w ( 0 .. $#{$row} ) {
 266             # push the appropriate node onto the appropriate witness path
 267             my $word = $row->[$w];
 268             if( $word ) {
 269                 my $reading = $nodes->{$word};
 270                 my $wit = $witnesses[$w];
 271                 push( @{$wit->path}, $reading );
 272             } # else skip it for empty readings.
 273         }
 274     }
 275
 276     # Collapse our lacunae into a single node and
 277     # push the end node onto all paths.
 278     $c->end->rank( scalar @$alignment_table );
 279     foreach my $wit ( @witnesses ) {
 280         my $p = $wit->path;
 281         my $last_rdg = shift @$p;
 282         my $new_p = [ $last_rdg ];
 283         foreach my $rdg ( @$p ) {
 284                 # Omit the reading if we are in a lacuna already.
 285                 next if $rdg->is_lacuna && $last_rdg->is_lacuna;
 286                         # Save the reading otherwise.
 287                         push( @$new_p, $rdg );
 288                         $last_rdg = $rdg;
 289         }
 290         push( @$new_p, $c->end );
 291         $wit->path( $new_p );
 292     }
 293
 294     # Fold any a.c. witnesses into their main witness objects, and
 295     # delete the independent a.c. versions.
 296     foreach my $a ( keys %ac_wits ) {
 297         my $ac_wit = $tradition->witness( $a );
 298         my $main_wit = $tradition->witness( $ac_wits{$a} );
 299         next unless $main_wit;
 300         $main_wit->is_layered(1);
 301         $main_wit->uncorrected_path( $ac_wit->path );
 302         $tradition->del_witness( $ac_wit );
 303     }
 304
 305     # Join up the paths.
 306     $c->make_witness_paths;
 307     # Delete our unused lacuna nodes.
 308         foreach my $rdg ( grep { $_->is_lacuna } $c->readings ) {
 309                 $c->del_reading( $rdg ) unless $c->reading_witnesses( $rdg );
 310         }
 311
 312         # Do a consistency check.
 313         foreach my $wit ( $tradition->witnesses ) {
 314                 my $pathtext = $c->path_text( $wit->sigil );
 315                 my $origtext = join( ' ', @{$wit->text} );
 316                 warn "Text differs for witness " . $wit->sigil
 317                         unless $pathtext eq $origtext;
 318                 if( $wit->is_layered ) {
 319                         $pathtext = $c->path_text( $wit->sigil.$c->ac_label );
 320                         $origtext = join( ' ', @{$wit->layertext} );
 321                         warn "Ante-corr text differs for witness " . $wit->sigil
 322                                 unless $pathtext eq $origtext;
 323                 } else {
 324                         warn "Text " . $wit->sigil . " has a layered text but is not marked as layered"
 325                                 if $wit->has_layertext;
 326                 }
 327         }
 328
 329         # Note that our ranks and common readings are set.
 330         $c->_graphcalc_done(1);
 331         # Remove redundant collation relationships.
 332         $c->relations->filter_collations() unless $nocollate;
 333 }
 334
 335 sub _make_nodes {
 336     my( $collation, $row, $index, $nocollate ) = @_;
 337     my %unique;
 338     my $commonctr = 0; # Holds the number of unique readings + gaps, ex. lacunae.
 339     foreach my $w ( @$row ) {
 340         $unique{$w} = 1 if $w;
 341         $commonctr +=1 unless ( $w && $w eq '#LACUNA#' );
 342     }
 343     my $ctr = 1;
 344     foreach my $w ( keys %unique ) {
 345         my $rargs = {
 346                 'id' => "r$index.$ctr",
 347                 'rank' => $index,
 348                 'text' => $w,
 349                 };
 350         if( $w eq '#LACUNA#' ) {
 351                 $rargs->{'is_lacuna'} = 1;
 352         } elsif( $commonctr == 1 ) {
 353                 $rargs->{'is_common'} = 1;
 354         }
 355         my $r = $collation->add_reading( $rargs );
 356         $unique{$w} = $r;
 357         $ctr++;
 358     }
 359     # Collate this sequence of readings via a single 'collation' relationship.
 360     unless( $nocollate ) {
 361                 my @rankrdgs = values %unique;
 362                 my $collation_rel;
 363                 while( @rankrdgs ) {
 364                         my $r = shift @rankrdgs;
 365                         next if $r->is_meta;
 366                         foreach my $nr ( @rankrdgs ) {
 367                                 next if $nr->is_meta;
 368                                 if( $collation_rel ) {
 369                                         $collation->add_relationship( $r, $nr, $collation_rel );
 370                                 } else {
 371                                         $collation->add_relationship( $r, $nr,
 372                                                 { 'type' => 'collated',
 373                                                   'annotation' => "Parsed together for rank $index" } );
 374                                         $collation_rel = $collation->get_relationship( $r, $nr );
 375                                 }
 376                         }
 377                 }
 378         }
 379     return \%unique;
 380 }
 381
 382 1;
 383
 384 =head1 LICENSE
 385
 386 This package is free software and is provided "as is" without express
 387 or implied warranty.  You can redistribute it and/or modify it under
 388 the same terms as Perl itself.
 389
 390 =head1 AUTHOR
 391
 392 Tara L Andrews E<lt>aurum@cpan.orgE<gt>