lib/Text/Tradition/Parser/Tabular.pm

   1 package Text::Tradition::Parser::Tabular;
   2
   3 use strict;
   4 use warnings;
   5 use Text::CSV;
   6
   7 =head1 NAME
   8
   9 Text::Tradition::Parser::Tabular
  10
  11 =head1 SYNOPSIS
  12
  13   use Text::Tradition;
  14
  15   my $t_from_file = Text::Tradition->new(
  16     'name' => 'my text',
  17     'input' => 'Tabular',
  18     'file' => '/path/to/collation.csv',
  19     'sep_char' => ','
  20     );
  21
  22   my $t_from_string = Text::Tradition->new(
  23     'name' => 'my text',
  24     'input' => 'Tabular',
  25     'string' => $tab_separated_collation,
  26     'sep_char' => "\t",
  27     );
  28
  29 =head1 DESCRIPTION
  30
  31 Parser module for Text::Tradition to read an alignment table format, such as CSV.
  32
  33 =head1 METHODS
  34
  35 =head2 B<parse>( $tradition, $option_hash )
  36
  37 Takes an initialized tradition and a set of options; creates the
  38 appropriate nodes and edges on the graph, as well as the appropriate
  39 witness objects.  The $option_hash must contain either a 'file' or a
  40 'string' argument with the table to be parsed; it may also contain a
  41 'sep_char' argument to specify how the fields are separated.
  42
  43 The table should have witnesses arranged in columns, with the witness sigla
  44 in the first row.  Empty cells are interpreted as omissions (and thus
  45 stemmatologically relevant.) Longer lacunae in the text, to be disregarded
  46 in cladistic analysis, may be represented by filling the appropriate cells
  47 with the tag '#LACUNA#'.
  48
  49 If a witness name ends in the collation's ac_label, it will be treated as
  50 an 'ante-correction' version of the 'main' witness whose sigil it shares.
  51
  52 =begin testing
  53
  54 use Text::Tradition;
  55 binmode STDOUT, ":utf8";
  56 binmode STDERR, ":utf8";
  57 eval { no warnings; binmode $DB::OUT, ":utf8"; };
  58
  59 my $csv = 't/data/florilegium.csv';
  60 my $t = Text::Tradition->new(
  61     'name'  => 'inline',
  62     'input' => 'Tabular',
  63     'file'  => $csv,
  64     'sep_char' => ',',
  65     );
  66
  67 is( ref( $t ), 'Text::Tradition', "Parsed florilegium CSV file" );
  68
  69 ### TODO Check these figures
  70 if( $t ) {
  71     is( scalar $t->collation->readings, 311, "Collation has all readings" );
  72     is( scalar $t->collation->paths, 361, "Collation has all paths" );
  73     is( scalar $t->witnesses, 13, "Collation has all witnesses" );
  74 }
  75
  76 # Check that we have the right witnesses
  77 my %seen_wits;
  78 map { $seen_wits{$_} = 0 } qw/ A B C D E F G H K P Q S T /;
  79 foreach my $wit ( $t->witnesses ) {
  80         $seen_wits{$wit->sigil} = 1;
  81 }
  82 is( scalar keys %seen_wits, 13, "No extra witnesses were made" );
  83 foreach my $k ( keys %seen_wits ) {
  84         ok( $seen_wits{$k}, "Witness $k still exists" );
  85 }
  86
  87 # Check that the witnesses have the right texts
  88 foreach my $wit ( $t->witnesses ) {
  89         my $origtext = join( ' ', @{$wit->text} );
  90         my $graphtext = $t->collation->path_text( $wit->sigil );
  91         is( $graphtext, $origtext, "Collation matches original for witness " . $wit->sigil );
  92 }
  93
  94 # Check that the a.c. witnesses have the right text
  95 map { $seen_wits{$_} = 0 } qw/ A B C D F G H K S /;
  96 foreach my $k ( keys %seen_wits ) {
  97         my $wit = $t->witness( $k );
  98         if( $seen_wits{$k} ) {
  99                 ok( $wit->is_layered, "Witness $k got marked as layered" );
 100                 ok( $wit->has_layertext, "Witness $k has an a.c. version" );
 101                 my $origtext = join( ' ', @{$wit->layertext} );
 102                 my $acsig = $wit->sigil . $t->collation->ac_label;
 103                 my $graphtext = $t->collation->path_text( $acsig );
 104                 is( $graphtext, $origtext, "Collation matches original a.c. for witness $k" );
 105         } else {
 106                 ok( !$wit->is_layered, "Witness $k not marked as layered" );
 107                 ok( !$wit->has_layertext, "Witness $k has no a.c. version" );
 108         }
 109 }
 110
 111 =end testing
 112
 113 =cut
 114
 115 sub parse {
 116     my( $tradition, $opts ) = @_;
 117     my $c = $tradition->collation; # shorthand
 118     my $csv_options = { 'binary' => 1 };
 119     $csv_options->{'sep_char'} = $opts->{'sep_char'} || "\t";
 120     if( $csv_options->{'sep_char'} eq "\t" ) {
 121         # If it is really tab separated, nothing is an escape char.
 122         $csv_options->{'quote_char'} = undef;
 123         $csv_options->{'escape_char'} = undef;
 124     }
 125     my $csv = Text::CSV->new( $csv_options );
 126
 127     my $alignment_table;
 128     if( exists $opts->{'string' } ) {
 129         my @lines = split( "\n", $opts->{'string'} );
 130         foreach my $l ( @lines ) {
 131             my $status = $csv->parse( $l );
 132             if( $status ) {
 133                 push( @$alignment_table, [ $csv->fields ] );
 134             } else {
 135                 warn "Could not parse line $l: " . $csv->error_input;
 136             }
 137         }
 138     } elsif( exists $opts->{'file'} ) {
 139         open( my $fh, $opts->{'file'} )
 140             or warn "Could not open input file " . $opts->{'file'};
 141         binmode( $fh, ':utf8' );
 142         while( my $row = $csv->getline( $fh ) ) {
 143             push( @$alignment_table, $row );
 144         }
 145         close $fh;
 146     } else {
 147         warn "Could not find string or file option to parse";
 148         return;
 149     }
 150
 151     # Set up the witnesses we find in the first line
 152     my @witnesses;
 153     my %ac_wits;  # Track layered witness -> main witness mapping
 154     my $aclabel = $c->ac_label;
 155     foreach my $sigil ( @{$alignment_table->[0]} ) {
 156         if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) {
 157                 # Sanitize the sigil name to an XML name
 158                 $sigil = $1 . '_layered';
 159             $ac_wits{$sigil} = $1;
 160         }
 161         my $wit = $tradition->add_witness(
 162                 'sigil' => $sigil, 'sourcetype' => 'collation' );
 163         $wit->path( [ $c->start ] );
 164         push( @witnesses, $wit );
 165         my $aclabel = $c->ac_label;
 166     }
 167
 168     # Save the original witness text sequences. Have to loop back through
 169     # the witness columns after we have identified all the a.c. witnesses.
 170     foreach my $idx ( 0 .. $#{$alignment_table->[0]} ) {
 171         my @sequence = map { $_->[$idx] } @{$alignment_table};
 172         my $sigil = shift @sequence;
 173         my $is_layer = exists( $ac_wits{$sigil} );
 174         my $wit = $tradition->witness( $is_layer ? $ac_wits{$sigil} : $sigil );
 175         # Now get rid of gaps and meta-readings like #LACUNA#
 176         my @words = grep { $_ && $_ !~ /^\#.*\#$/ } @sequence;
 177         $is_layer ? $wit->layertext( \@words ) : $wit->text( \@words );
 178     }
 179
 180     # Now for the next rows, make nodes as necessary, assign their ranks, and
 181     # add them to the witness paths.
 182     foreach my $idx ( 1 .. $#{$alignment_table} ) {
 183         my $row = $alignment_table->[$idx];
 184         my $nodes = _make_nodes( $c, $row, $idx );
 185         foreach my $w ( 0 .. $#{$row} ) {
 186             # push the appropriate node onto the appropriate witness path
 187             my $word = $row->[$w];
 188             if( $word ) {
 189                 my $reading = $nodes->{$word};
 190                 my $wit = $witnesses[$w];
 191                 push( @{$wit->path}, $reading );
 192             } # else skip it for empty readings.
 193         }
 194     }
 195
 196     # Collapse our lacunae into a single node and
 197     # push the end node onto all paths.
 198     $c->end->rank( scalar @$alignment_table );
 199     foreach my $wit ( @witnesses ) {
 200         my $p = $wit->path;
 201         my $last_rdg = shift @$p;
 202         my $new_p = [ $last_rdg ];
 203         foreach my $rdg ( @$p ) {
 204                 # Omit the reading if we are in a lacuna already.
 205                 next if $rdg->is_lacuna && $last_rdg->is_lacuna;
 206                         # Save the reading otherwise.
 207                         push( @$new_p, $rdg );
 208                         $last_rdg = $rdg;
 209         }
 210         push( @$new_p, $c->end );
 211         $wit->path( $new_p );
 212     }
 213
 214     # Fold any a.c. witnesses into their main witness objects, and
 215     # delete the independent a.c. versions.
 216     foreach my $a ( keys %ac_wits ) {
 217         my $ac_wit = $tradition->witness( $a );
 218         my $main_wit = $tradition->witness( $ac_wits{$a} );
 219         next unless $main_wit;
 220         $main_wit->is_layered(1);
 221         $main_wit->uncorrected_path( $ac_wit->path );
 222         $tradition->del_witness( $ac_wit );
 223     }
 224
 225     # Join up the paths.
 226     $c->make_witness_paths;
 227     # Delete our unused lacuna nodes.
 228         foreach my $rdg ( grep { $_->is_lacuna } $c->readings ) {
 229                 $c->del_reading( $rdg ) unless $c->reading_witnesses( $rdg );
 230         }
 231
 232         # Do a consistency check.
 233         foreach my $wit ( $tradition->witnesses ) {
 234                 my $pathtext = $c->path_text( $wit->sigil );
 235                 my $origtext = join( ' ', @{$wit->text} );
 236                 warn "Text differs for witness " . $wit->sigil
 237                         unless $pathtext eq $origtext;
 238                 if( $wit->is_layered ) {
 239                         $pathtext = $c->path_text( $wit->sigil.$c->ac_label );
 240                         $origtext = join( ' ', @{$wit->layertext} );
 241                         warn "Ante-corr text differs for witness " . $wit->sigil
 242                                 unless $pathtext eq $origtext;
 243                 } else {
 244                         warn "Text " . $wit->sigil . " has a layered text but is not marked as layered"
 245                                 if $wit->has_layertext;
 246                 }
 247         }
 248
 249         # Note that our ranks and common readings are set.
 250         $c->_graphcalc_done(1);
 251 }
 252
 253 sub _make_nodes {
 254     my( $collation, $row, $index ) = @_;
 255     my %unique;
 256     my $commonctr = 0; # Holds the number of unique readings + gaps, ex. lacunae.
 257     foreach my $w ( @$row ) {
 258         $unique{$w} = 1 if $w;
 259         $commonctr +=1 unless ( $w && $w eq '#LACUNA#' );
 260     }
 261     my $ctr = 1;
 262     foreach my $w ( keys %unique ) {
 263         my $rargs = {
 264                 'id' => "$index,$ctr",
 265                 'rank' => $index,
 266                 'text' => $w,
 267                 };
 268         if( $w eq '#LACUNA#' ) {
 269                 $rargs->{'is_lacuna'} = 1;
 270         } elsif( $commonctr == 1 ) {
 271                 $rargs->{'is_common'} = 1;
 272         }
 273         my $r = $collation->add_reading( $rargs );
 274         $unique{$w} = $r;
 275         $ctr++;
 276     }
 277     # Collate this sequence of readings via a single 'collation' relationship.
 278     my @rankrdgs = values %unique;
 279     my $collation_rel;
 280     while( @rankrdgs ) {
 281         my $r = shift @rankrdgs;
 282         next if $r->is_meta;
 283         foreach my $nr ( @rankrdgs ) {
 284                 if( $collation_rel ) {
 285                         $collation->add_relationship( $r, $nr, $collation_rel );
 286                 } else {
 287                         $collation->add_relationship( $r, $nr,
 288                                 { 'type' => 'collated',
 289                                   'annotation' => "Parsed together for rank $index" } );
 290                         $collation_rel = $collation->get_relationship( $r, $nr );
 291                 }
 292         }
 293     }
 294
 295     return \%unique;
 296 }
 297
 298 1;
 299
 300 =head1 LICENSE
 301
 302 This package is free software and is provided "as is" without express
 303 or implied warranty.  You can redistribute it and/or modify it under
 304 the same terms as Perl itself.
 305
 306 =head1 AUTHOR
 307
 308 Tara L Andrews E<lt>aurum@cpan.orgE<gt>