lib/Text/Tradition/Parser/Tabular.pm

   1 package Text::Tradition::Parser::Tabular;
   2
   3 use strict;
   4 use warnings;
   5 use Text::CSV;
   6
   7 =head1 NAME
   8
   9 Text::Tradition::Parser::Tabular
  10
  11 =head1 SYNOPSIS
  12
  13   use Text::Tradition;
  14
  15   my $t_from_file = Text::Tradition->new(
  16     'name' => 'my text',
  17     'input' => 'Tabular',
  18     'file' => '/path/to/collation.csv',
  19     'sep_char' => ','
  20     );
  21
  22   my $t_from_string = Text::Tradition->new(
  23     'name' => 'my text',
  24     'input' => 'Tabular',
  25     'string' => $tab_separated_collation,
  26     'sep_char' => "\t",
  27     );
  28
  29 =head1 DESCRIPTION
  30
  31 Parser module for Text::Tradition to read an alignment table format, such as CSV.
  32
  33 =head1 METHODS
  34
  35 =head2 B<parse>( $tradition, $option_hash )
  36
  37 Takes an initialized tradition and a set of options; creates the
  38 appropriate nodes and edges on the graph, as well as the appropriate
  39 witness objects.  The $option_hash must contain either a 'file' or a
  40 'string' argument with the table to be parsed; it may also contain a
  41 'sep_char' argument to specify how the fields are separated.
  42
  43 The table should have witnesses arranged in columns, with the witness sigla
  44 in the first row.  Empty cells are interpreted as omissions (and thus
  45 stemmatologically relevant.) Longer lacunae in the text, to be disregarded
  46 in cladistic analysis, may be represented by filling the appropriate cells
  47 with the tag '#LACUNA#'.
  48
  49 If a witness name ends in the collation's ac_label, it will be treated as
  50 an 'ante-correction' version of the 'main' witness whose sigil it shares.
  51
  52 =begin testing
  53
  54 use Text::Tradition;
  55 binmode STDOUT, ":utf8";
  56 binmode STDERR, ":utf8";
  57 eval { no warnings; binmode $DB::OUT, ":utf8"; };
  58
  59 my $csv = 't/data/florilegium.csv';
  60 my $t = Text::Tradition->new(
  61     'name'  => 'inline',
  62     'input' => 'Tabular',
  63     'file'  => $csv,
  64     'sep_char' => ',',
  65     );
  66
  67 is( ref( $t ), 'Text::Tradition', "Parsed florilegium CSV file" );
  68
  69 ### TODO Check these figures
  70 if( $t ) {
  71     is( scalar $t->collation->readings, 311, "Collation has all readings" );
  72     is( scalar $t->collation->paths, 361, "Collation has all paths" );
  73     is( scalar $t->witnesses, 13, "Collation has all witnesses" );
  74 }
  75
  76 # Check that we have the right witnesses
  77 my %seen_wits;
  78 map { $seen_wits{$_} = 0 } qw/ A B C D E F G H K P Q S T /;
  79 foreach my $wit ( $t->witnesses ) {
  80         $seen_wits{$wit->sigil} = 1;
  81 }
  82 is( scalar keys %seen_wits, 13, "No extra witnesses were made" );
  83 foreach my $k ( keys %seen_wits ) {
  84         ok( $seen_wits{$k}, "Witness $k still exists" );
  85 }
  86
  87 # Check that the witnesses have the right texts
  88 foreach my $wit ( $t->witnesses ) {
  89         my $origtext = join( ' ', @{$wit->text} );
  90         my $graphtext = $t->collation->path_text( $wit->sigil );
  91         is( $graphtext, $origtext, "Collation matches original for witness " . $wit->sigil );
  92 }
  93
  94 # Check that the a.c. witnesses have the right text
  95 map { $seen_wits{$_} = 0 } qw/ A B C D F G H K S /;
  96 foreach my $k ( keys %seen_wits ) {
  97         my $wit = $t->witness( $k );
  98         if( $seen_wits{$k} ) {
  99                 ok( $wit->is_layered, "Witness $k got marked as layered" );
 100                 ok( $wit->has_layertext, "Witness $k has an a.c. version" );
 101                 my $origtext = join( ' ', @{$wit->layertext} );
 102                 my $acsig = $wit->sigil . $t->collation->ac_label;
 103                 my $graphtext = $t->collation->path_text( $acsig );
 104                 is( $graphtext, $origtext, "Collation matches original a.c. for witness $k" );
 105         } else {
 106                 ok( !$wit->is_layered, "Witness $k not marked as layered" );
 107                 ok( !$wit->has_layertext, "Witness $k has no a.c. version" );
 108         }
 109 }
 110
 111 # Check that we only have collation relationships where we need them
 112 is( scalar $t->collation->relationships, 3, "Redundant collations were removed" );
 113
 114 =end testing
 115
 116 =cut
 117
 118 sub parse {
 119     my( $tradition, $opts ) = @_;
 120     my $c = $tradition->collation; # shorthand
 121     my $csv_options = { 'binary' => 1 };
 122     $csv_options->{'sep_char'} = $opts->{'sep_char'} || "\t";
 123     if( $csv_options->{'sep_char'} eq "\t" ) {
 124         # If it is really tab separated, nothing is an escape char.
 125         $csv_options->{'quote_char'} = undef;
 126         $csv_options->{'escape_char'} = undef;
 127     }
 128     my $csv = Text::CSV->new( $csv_options );
 129
 130     my $alignment_table;
 131     if( exists $opts->{'string' } ) {
 132         my @lines = split( "\n", $opts->{'string'} );
 133         foreach my $l ( @lines ) {
 134             my $status = $csv->parse( $l );
 135             if( $status ) {
 136                 push( @$alignment_table, [ $csv->fields ] );
 137             } else {
 138                 warn "Could not parse line $l: " . $csv->error_input;
 139             }
 140         }
 141     } elsif( exists $opts->{'file'} ) {
 142         open( my $fh, $opts->{'file'} )
 143             or warn "Could not open input file " . $opts->{'file'};
 144         binmode( $fh, ':utf8' );
 145         while( my $row = $csv->getline( $fh ) ) {
 146             push( @$alignment_table, $row );
 147         }
 148         close $fh;
 149     } else {
 150         warn "Could not find string or file option to parse";
 151         return;
 152     }
 153
 154     # Set up the witnesses we find in the first line
 155     my @witnesses;
 156     my %ac_wits;  # Track layered witness -> main witness mapping
 157     my $aclabel = $c->ac_label;
 158     foreach my $sigil ( @{$alignment_table->[0]} ) {
 159         if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) {
 160                 # Sanitize the sigil name to an XML name
 161                 $sigil = $1 . '_layered';
 162             $ac_wits{$sigil} = $1;
 163         }
 164         my $wit = $tradition->add_witness(
 165                 'sigil' => $sigil, 'sourcetype' => 'collation' );
 166         $wit->path( [ $c->start ] );
 167         push( @witnesses, $wit );
 168         my $aclabel = $c->ac_label;
 169     }
 170
 171     # Save the original witness text sequences. Have to loop back through
 172     # the witness columns after we have identified all the a.c. witnesses.
 173     foreach my $idx ( 0 .. $#{$alignment_table->[0]} ) {
 174         my @sequence = map { $_->[$idx] } @{$alignment_table};
 175         my $sigil = shift @sequence;
 176         my $is_layer = exists( $ac_wits{$sigil} );
 177         my $wit = $tradition->witness( $is_layer ? $ac_wits{$sigil} : $sigil );
 178         # Now get rid of gaps and meta-readings like #LACUNA#
 179         my @words = grep { $_ && $_ !~ /^\#.*\#$/ } @sequence;
 180         $is_layer ? $wit->layertext( \@words ) : $wit->text( \@words );
 181     }
 182
 183     # Now for the next rows, make nodes as necessary, assign their ranks, and
 184     # add them to the witness paths.
 185     foreach my $idx ( 1 .. $#{$alignment_table} ) {
 186         my $row = $alignment_table->[$idx];
 187         my $nodes = _make_nodes( $c, $row, $idx );
 188         foreach my $w ( 0 .. $#{$row} ) {
 189             # push the appropriate node onto the appropriate witness path
 190             my $word = $row->[$w];
 191             if( $word ) {
 192                 my $reading = $nodes->{$word};
 193                 my $wit = $witnesses[$w];
 194                 push( @{$wit->path}, $reading );
 195             } # else skip it for empty readings.
 196         }
 197     }
 198
 199     # Collapse our lacunae into a single node and
 200     # push the end node onto all paths.
 201     $c->end->rank( scalar @$alignment_table );
 202     foreach my $wit ( @witnesses ) {
 203         my $p = $wit->path;
 204         my $last_rdg = shift @$p;
 205         my $new_p = [ $last_rdg ];
 206         foreach my $rdg ( @$p ) {
 207                 # Omit the reading if we are in a lacuna already.
 208                 next if $rdg->is_lacuna && $last_rdg->is_lacuna;
 209                         # Save the reading otherwise.
 210                         push( @$new_p, $rdg );
 211                         $last_rdg = $rdg;
 212         }
 213         push( @$new_p, $c->end );
 214         $wit->path( $new_p );
 215     }
 216
 217     # Fold any a.c. witnesses into their main witness objects, and
 218     # delete the independent a.c. versions.
 219     foreach my $a ( keys %ac_wits ) {
 220         my $ac_wit = $tradition->witness( $a );
 221         my $main_wit = $tradition->witness( $ac_wits{$a} );
 222         next unless $main_wit;
 223         $main_wit->is_layered(1);
 224         $main_wit->uncorrected_path( $ac_wit->path );
 225         $tradition->del_witness( $ac_wit );
 226     }
 227
 228     # Join up the paths.
 229     $c->make_witness_paths;
 230     # Delete our unused lacuna nodes.
 231         foreach my $rdg ( grep { $_->is_lacuna } $c->readings ) {
 232                 $c->del_reading( $rdg ) unless $c->reading_witnesses( $rdg );
 233         }
 234
 235         # Do a consistency check.
 236         foreach my $wit ( $tradition->witnesses ) {
 237                 my $pathtext = $c->path_text( $wit->sigil );
 238                 my $origtext = join( ' ', @{$wit->text} );
 239                 warn "Text differs for witness " . $wit->sigil
 240                         unless $pathtext eq $origtext;
 241                 if( $wit->is_layered ) {
 242                         $pathtext = $c->path_text( $wit->sigil.$c->ac_label );
 243                         $origtext = join( ' ', @{$wit->layertext} );
 244                         warn "Ante-corr text differs for witness " . $wit->sigil
 245                                 unless $pathtext eq $origtext;
 246                 } else {
 247                         warn "Text " . $wit->sigil . " has a layered text but is not marked as layered"
 248                                 if $wit->has_layertext;
 249                 }
 250         }
 251
 252         # Note that our ranks and common readings are set.
 253         $c->_graphcalc_done(1);
 254         # Remove redundant collation relationships.
 255         $c->relations->filter_collations();
 256 }
 257
 258 sub _make_nodes {
 259     my( $collation, $row, $index ) = @_;
 260     my %unique;
 261     my $commonctr = 0; # Holds the number of unique readings + gaps, ex. lacunae.
 262     foreach my $w ( @$row ) {
 263         $unique{$w} = 1 if $w;
 264         $commonctr +=1 unless ( $w && $w eq '#LACUNA#' );
 265     }
 266     my $ctr = 1;
 267     foreach my $w ( keys %unique ) {
 268         my $rargs = {
 269                 'id' => "$index,$ctr",
 270                 'rank' => $index,
 271                 'text' => $w,
 272                 };
 273         if( $w eq '#LACUNA#' ) {
 274                 $rargs->{'is_lacuna'} = 1;
 275         } elsif( $commonctr == 1 ) {
 276                 $rargs->{'is_common'} = 1;
 277         }
 278         my $r = $collation->add_reading( $rargs );
 279         $unique{$w} = $r;
 280         $ctr++;
 281     }
 282     # Collate this sequence of readings via a single 'collation' relationship.
 283     my @rankrdgs = values %unique;
 284     my $collation_rel;
 285     while( @rankrdgs ) {
 286         my $r = shift @rankrdgs;
 287         next if $r->is_meta;
 288         foreach my $nr ( @rankrdgs ) {
 289                 if( $collation_rel ) {
 290                         $collation->add_relationship( $r, $nr, $collation_rel );
 291                 } else {
 292                         $collation->add_relationship( $r, $nr,
 293                                 { 'type' => 'collated',
 294                                   'annotation' => "Parsed together for rank $index" } );
 295                         $collation_rel = $collation->get_relationship( $r, $nr );
 296                 }
 297         }
 298     }
 299
 300     return \%unique;
 301 }
 302
 303 1;
 304
 305 =head1 LICENSE
 306
 307 This package is free software and is provided "as is" without express
 308 or implied warranty.  You can redistribute it and/or modify it under
 309 the same terms as Perl itself.
 310
 311 =head1 AUTHOR
 312
 313 Tara L Andrews E<lt>aurum@cpan.orgE<gt>