1 package Text::Tradition::Parser::Tabular;
6 use Text::Tradition::Error;
11 Text::Tradition::Parser::Tabular
17 my $t_from_file = Text::Tradition->new(
20 'file' => '/path/to/collation.csv',
24 my $t_from_string = Text::Tradition->new(
27 'string' => $tab_separated_collation,
33 Parser module for Text::Tradition to read an alignment table format, such as CSV.
37 =head2 B<parse>( $tradition, $option_hash )
39 Takes an initialized tradition and a set of options; creates the
40 appropriate nodes and edges on the graph, as well as the appropriate
41 witness objects. The $option_hash must contain either a 'file' or a
42 'string' argument with the table to be parsed; it may also contain a
43 'sep_char' argument to specify how the fields are separated.
45 The table should have witnesses arranged in columns, with the witness sigla
46 in the first row. Empty cells are interpreted as omissions (and thus
47 stemmatologically relevant.) Longer lacunae in the text, to be disregarded
48 in cladistic analysis, may be represented by filling the appropriate cells
49 with the tag '#LACUNA#'.
51 If a witness name ends in the collation's ac_label, it will be treated as
52 an 'ante-correction' version of the 'main' witness whose sigil it shares.
57 binmode STDOUT, ":utf8";
58 binmode STDERR, ":utf8";
59 eval { no warnings; binmode $DB::OUT, ":utf8"; };
61 my $csv = 't/data/florilegium.csv';
62 my $t = Text::Tradition->new(
69 is( ref( $t ), 'Text::Tradition', "Parsed florilegium CSV file" );
71 ### TODO Check these figures
73 is( scalar $t->collation->readings, 311, "Collation has all readings" );
74 is( scalar $t->collation->paths, 361, "Collation has all paths" );
75 is( scalar $t->witnesses, 13, "Collation has all witnesses" );
78 # Check that we have the right witnesses
80 map { $seen_wits{$_} = 0 } qw/ A B C D E F G H K P Q S T /;
81 foreach my $wit ( $t->witnesses ) {
82 $seen_wits{$wit->sigil} = 1;
84 is( scalar keys %seen_wits, 13, "No extra witnesses were made" );
85 foreach my $k ( keys %seen_wits ) {
86 ok( $seen_wits{$k}, "Witness $k still exists" );
89 # Check that the witnesses have the right texts
90 foreach my $wit ( $t->witnesses ) {
91 my $origtext = join( ' ', @{$wit->text} );
92 my $graphtext = $t->collation->path_text( $wit->sigil );
93 is( $graphtext, $origtext, "Collation matches original for witness " . $wit->sigil );
96 # Check that the a.c. witnesses have the right text
97 map { $seen_wits{$_} = 0 } qw/ A B C D F G H K S /;
98 foreach my $k ( keys %seen_wits ) {
99 my $wit = $t->witness( $k );
100 if( $seen_wits{$k} ) {
101 ok( $wit->is_layered, "Witness $k got marked as layered" );
102 ok( $wit->has_layertext, "Witness $k has an a.c. version" );
103 my $origtext = join( ' ', @{$wit->layertext} );
104 my $acsig = $wit->sigil . $t->collation->ac_label;
105 my $graphtext = $t->collation->path_text( $acsig );
106 is( $graphtext, $origtext, "Collation matches original a.c. for witness $k" );
108 ok( !$wit->is_layered, "Witness $k not marked as layered" );
109 ok( !$wit->has_layertext, "Witness $k has no a.c. version" );
113 # Check that we only have collation relationships where we need them
114 is( scalar $t->collation->relationships, 3, "Redundant collations were removed" );
116 ## Check excel parsing
118 my $xls = 't/data/armexample.xls';
119 my $xt = Text::Tradition->new(
120 'name' => 'excel test',
121 'input' => 'Tabular',
126 is( ref( $xt ), 'Text::Tradition', "Parsed test Excel 97-2004 file" );
128 map { $xls_wits{$_} = 0 } qw/ Wit1 Wit2 Wit3 /;
129 foreach my $wit ( $xt->witnesses ) {
130 $xls_wits{$wit->sigil} = 1;
132 is( scalar keys %xls_wits, 3, "No extra witnesses were made" );
133 foreach my $k ( keys %xls_wits ) {
134 ok( $xls_wits{$k}, "Witness $k still exists" );
136 is( scalar $xt->collation->readings, 11, "Got correct number of test readings" );
137 is( scalar $xt->collation->paths, 13, "Got correct number of reading paths" );
138 is( $xt->collation->reading('r5.1')->text, "\x{587}",
139 "Correct decoding of at least one reading" );
146 my( $tradition, $opts ) = @_;
147 my $c = $tradition->collation; # shorthand
149 if( $opts->{'xls'} ) {
151 require Spreadsheet::ParseExcel;
153 throw( "Need module Spreadsheet::ParseExcel to parse .xls files" );
155 unless( exists $opts->{'file'} ) {
156 throw( "Must pass the filename for Excel parsing" );
158 my $parser = Spreadsheet::ParseExcel->new();
159 my $workbook = $parser->parse( $opts->{'file'} );
160 unless( defined $workbook && defined $workbook->worksheet(0) ) {
161 throw( "Failed to parse file " . $opts->{'file'} . ": " . $parser->error() );
163 # Use the first worksheet
164 my $sheet = $workbook->worksheet(0);
165 my( $rmin, $rmax ) = $sheet->row_range();
166 my( $cmin, $cmax ) = $sheet->col_range();
167 unless( $cmax && $rmax ) {
168 throw( "Found no rows or no columns in first worksheet" );
170 # Populate the alignment table. We only want columns that have
171 # a sigil in row zero.
173 push( @$alignment_table, [] );
174 foreach my $col ( $cmin .. $cmax ) {
175 my $cell = $sheet->get_cell( $rmin, $col );
176 my $cellval = $cell ? $cell->value() : undef;
179 push( @{$alignment_table->[0]}, $cellval );
182 # Now go through the rest of the rows and pick up the columns
183 # that were headed by a sigil.
184 foreach my $row ( $rmin+1 .. $rmax ) {
186 foreach my $col ( $cmin .. $cmax ) {
187 next unless $sigcols{$col};
188 my $cell = $sheet->get_cell( $row, $col );
189 my $cellval = $cell ? $cell->value() : undef;
190 push( @tablerow, $cell ? $cell->value() : undef );
192 push( @$alignment_table, \@tablerow );
195 # Assume it is a comma-, tab-, or whatever-separated format.
196 my $csv_options = { 'binary' => 1 };
197 $csv_options->{'sep_char'} = $opts->{'sep_char'} || "\t";
198 if( $csv_options->{'sep_char'} eq "\t" ) {
199 # If it is really tab separated, nothing is an escape char.
200 $csv_options->{'quote_char'} = undef;
201 $csv_options->{'escape_char'} = undef;
203 my $csv = Text::CSV->new( $csv_options );
205 if( exists $opts->{'string' } ) {
206 my @lines = split( "\n", $opts->{'string'} );
207 foreach my $l ( @lines ) {
208 my $status = $csv->parse( $l );
210 push( @$alignment_table, [ $csv->fields ] );
212 warn "Could not parse line $l: " . $csv->error_input;
215 } elsif( exists $opts->{'file'} ) {
216 open( my $fh, $opts->{'file'} )
217 or warn "Could not open input file " . $opts->{'file'};
218 binmode( $fh, ':utf8' );
219 while( my $row = $csv->getline( $fh ) ) {
220 push( @$alignment_table, $row );
224 warn "Could not find string or file option to parse";
228 # Set up the witnesses we find in the first line
230 my %ac_wits; # Track layered witness -> main witness mapping
231 my $aclabel = $c->ac_label;
232 foreach my $sigil ( @{$alignment_table->[0]} ) {
233 if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) {
234 # Sanitize the sigil name to an XML name
235 $sigil = $1 . '_layered';
236 $ac_wits{$sigil} = $1;
238 my $wit = $tradition->add_witness(
239 'sigil' => $sigil, 'sourcetype' => 'collation' );
240 $wit->path( [ $c->start ] );
241 push( @witnesses, $wit );
242 my $aclabel = $c->ac_label;
245 # Save the original witness text sequences. Have to loop back through
246 # the witness columns after we have identified all the a.c. witnesses.
247 foreach my $idx ( 0 .. $#{$alignment_table->[0]} ) {
248 my @sequence = map { $_->[$idx] } @{$alignment_table};
249 my $sigil = shift @sequence;
250 my $is_layer = exists( $ac_wits{$sigil} );
251 my $wit = $tradition->witness( $is_layer ? $ac_wits{$sigil} : $sigil );
252 # Now get rid of gaps and meta-readings like #LACUNA#
253 my @words = grep { $_ && $_ !~ /^\#.*\#$/ } @sequence;
254 $is_layer ? $wit->layertext( \@words ) : $wit->text( \@words );
257 my $nocollate = ( scalar( @witnesses ) * scalar @$alignment_table ) > 150000;
258 print STDERR "Tradition too big for row collation\n" if $nocollate;
260 # Now for the next rows, make nodes as necessary, assign their ranks, and
261 # add them to the witness paths.
262 foreach my $idx ( 1 .. $#{$alignment_table} ) {
263 my $row = $alignment_table->[$idx];
264 my $nodes = _make_nodes( $c, $row, $idx, $nocollate );
265 foreach my $w ( 0 .. $#{$row} ) {
266 # push the appropriate node onto the appropriate witness path
267 my $word = $row->[$w];
269 my $reading = $nodes->{$word};
270 my $wit = $witnesses[$w];
271 push( @{$wit->path}, $reading );
272 } # else skip it for empty readings.
276 # Collapse our lacunae into a single node and
277 # push the end node onto all paths.
278 $c->end->rank( scalar @$alignment_table );
279 foreach my $wit ( @witnesses ) {
281 my $last_rdg = shift @$p;
282 my $new_p = [ $last_rdg ];
283 foreach my $rdg ( @$p ) {
284 # Omit the reading if we are in a lacuna already.
285 next if $rdg->is_lacuna && $last_rdg->is_lacuna;
286 # Save the reading otherwise.
287 push( @$new_p, $rdg );
290 push( @$new_p, $c->end );
291 $wit->path( $new_p );
294 # Fold any a.c. witnesses into their main witness objects, and
295 # delete the independent a.c. versions.
296 foreach my $a ( keys %ac_wits ) {
297 my $ac_wit = $tradition->witness( $a );
298 my $main_wit = $tradition->witness( $ac_wits{$a} );
299 next unless $main_wit;
300 $main_wit->is_layered(1);
301 $main_wit->uncorrected_path( $ac_wit->path );
302 $tradition->del_witness( $ac_wit );
306 $c->make_witness_paths;
307 # Delete our unused lacuna nodes.
308 foreach my $rdg ( grep { $_->is_lacuna } $c->readings ) {
309 $c->del_reading( $rdg ) unless $c->reading_witnesses( $rdg );
312 # Do a consistency check.
313 foreach my $wit ( $tradition->witnesses ) {
314 my $pathtext = $c->path_text( $wit->sigil );
315 my $origtext = join( ' ', @{$wit->text} );
316 warn "Text differs for witness " . $wit->sigil
317 unless $pathtext eq $origtext;
318 if( $wit->is_layered ) {
319 $pathtext = $c->path_text( $wit->sigil.$c->ac_label );
320 $origtext = join( ' ', @{$wit->layertext} );
321 warn "Ante-corr text differs for witness " . $wit->sigil
322 unless $pathtext eq $origtext;
324 warn "Text " . $wit->sigil . " has a layered text but is not marked as layered"
325 if $wit->has_layertext;
329 # Note that our ranks and common readings are set.
330 $c->_graphcalc_done(1);
331 # Remove redundant collation relationships.
332 $c->relations->filter_collations() unless $nocollate;
336 my( $collation, $row, $index, $nocollate ) = @_;
338 my $commonctr = 0; # Holds the number of unique readings + gaps, ex. lacunae.
339 foreach my $w ( @$row ) {
340 $unique{$w} = 1 if $w;
341 $commonctr +=1 unless ( $w && $w eq '#LACUNA#' );
344 foreach my $w ( keys %unique ) {
346 'id' => "r$index.$ctr",
350 if( $w eq '#LACUNA#' ) {
351 $rargs->{'is_lacuna'} = 1;
352 } elsif( $commonctr == 1 ) {
353 $rargs->{'is_common'} = 1;
355 my $r = $collation->add_reading( $rargs );
359 # Collate this sequence of readings via a single 'collation' relationship.
360 unless( $nocollate ) {
361 my @rankrdgs = values %unique;
364 my $r = shift @rankrdgs;
366 foreach my $nr ( @rankrdgs ) {
367 next if $nr->is_meta;
368 if( $collation_rel ) {
369 $collation->add_relationship( $r, $nr, $collation_rel );
371 $collation->add_relationship( $r, $nr,
372 { 'type' => 'collated',
373 'annotation' => "Parsed together for rank $index" } );
374 $collation_rel = $collation->get_relationship( $r, $nr );
386 This package is free software and is provided "as is" without express
387 or implied warranty. You can redistribute it and/or modify it under
388 the same terms as Perl itself.
392 Tara L Andrews E<lt>aurum@cpan.orgE<gt>