add exceptions to the rest of the Tradition library
[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / Tabular.pm
CommitLineData
d9e873d0 1package Text::Tradition::Parser::Tabular;
2
3use strict;
4use warnings;
5use Text::CSV_XS;
6
7=head1 NAME
8
9Text::Tradition::Parser::Tabular
10
3b853983 11=head1 SYNOPSIS
12
13 use Text::Tradition;
14
15 my $t_from_file = Text::Tradition->new(
16 'name' => 'my text',
17 'input' => 'Tabular',
18 'file' => '/path/to/collation.csv',
19 'sep_char' => ','
20 );
21
22 my $t_from_string = Text::Tradition->new(
23 'name' => 'my text',
24 'input' => 'Tabular',
25 'string' => $tab_separated_collation,
26 'sep_char' => "\t",
27 );
28
d9e873d0 29=head1 DESCRIPTION
30
31Parser module for Text::Tradition to read an alignment table format, such as CSV.
32
33=head1 METHODS
34
e867486f 35=head2 B<parse>( $tradition, $option_hash )
3b853983 36
37Takes an initialized tradition and a set of options; creates the
38appropriate nodes and edges on the graph, as well as the appropriate
39witness objects. The $option_hash must contain either a 'file' or a
40'string' argument with the table to be parsed; it may also contain a
41'sep_char' argument to specify how the fields are separated.
42
43The table should have witnesses arranged in columns, with the witness sigla
44in the first row. Empty cells are interpreted as omissions (and thus
45stemmatologically relevant.) Longer lacunae in the text, to be disregarded
46in cladistic analysis, may be represented by filling the appropriate cells
47with the tag '#LACUNA#'.
48
49If a witness name ends in the collation's ac_label, it will be treated as
50an 'ante-correction' version of the 'main' witness whose sigil it shares.
51
52=begin testing
53
54use Text::Tradition;
55binmode STDOUT, ":utf8";
56binmode STDERR, ":utf8";
57eval { no warnings; binmode $DB::OUT, ":utf8"; };
58
59my $csv = 't/data/florilegium.csv';
60my $t = Text::Tradition->new(
61 'name' => 'inline',
62 'input' => 'Tabular',
63 'file' => $csv,
64 'sep_char' => ',',
65 );
d9e873d0 66
3b853983 67is( ref( $t ), 'Text::Tradition', "Parsed florilegium CSV file" );
d9e873d0 68
3b853983 69### TODO Check these figures
70if( $t ) {
0e47f4f6 71 is( scalar $t->collation->readings, 311, "Collation has all readings" );
72 is( scalar $t->collation->paths, 361, "Collation has all paths" );
3b853983 73 is( scalar $t->witnesses, 13, "Collation has all witnesses" );
74}
75
b0b4421a 76# Check that we have the right witnesses
77my %seen_wits;
78map { $seen_wits{$_} = 0 } qw/ A B C D E F G H K P Q S T /;
79foreach my $wit ( $t->witnesses ) {
80 $seen_wits{$wit->sigil} = 1;
81}
82is( scalar keys %seen_wits, 13, "No extra witnesses were made" );
83foreach my $k ( keys %seen_wits ) {
84 ok( $seen_wits{$k}, "Witness $k still exists" );
85}
86
87# Check that the witnesses have the right texts
88foreach my $wit ( $t->witnesses ) {
89 my $origtext = join( ' ', @{$wit->text} );
90 my $graphtext = $t->collation->path_text( $wit->sigil );
91 is( $graphtext, $origtext, "Collation matches original for witness " . $wit->sigil );
92}
93
94# Check that the a.c. witnesses have the right text
95map { $seen_wits{$_} = 0 } qw/ A B C D F G H K S /;
96foreach my $k ( keys %seen_wits ) {
97 my $wit = $t->witness( $k );
98 if( $seen_wits{$k} ) {
99 ok( $wit->is_layered, "Witness $k got marked as layered" );
100 ok( $wit->has_layertext, "Witness $k has an a.c. version" );
101 my $origtext = join( ' ', @{$wit->layertext} );
102 my $acsig = $wit->sigil . $t->collation->ac_label;
861c3e27 103 my $graphtext = $t->collation->path_text( $acsig );
b0b4421a 104 is( $graphtext, $origtext, "Collation matches original a.c. for witness $k" );
105 } else {
106 ok( !$wit->is_layered, "Witness $k not marked as layered" );
107 ok( !$wit->has_layertext, "Witness $k has no a.c. version" );
108 }
109}
110
3b853983 111=end testing
d9e873d0 112
113=cut
114
115sub parse {
dfc37e38 116 my( $tradition, $opts ) = @_;
d9e873d0 117 my $c = $tradition->collation; # shorthand
dfc37e38 118 my $csv = Text::CSV_XS->new( {
119 binary => 1, # binary for UTF-8
120 sep_char => exists $opts->{'sep_char'} ? $opts->{'sep_char'} : "\t" }
121 );
dfc37e38 122
d9e873d0 123 my $alignment_table;
dfc37e38 124 if( exists $opts->{'string' } ) {
125 my @lines = split( "\n", $opts->{'string'} );
126 foreach my $l ( @lines ) {
127 my $status = $csv->parse( $l );
128 if( $status ) {
129 push( @$alignment_table, [ $csv->fields ] );
130 } else {
131 warn "Could not parse line $l: " . $csv->error_input;
132 }
133 }
134 } elsif( exists $opts->{'file'} ) {
bb11025b 135 open( my $fh, $opts->{'file'} )
136 or warn "Could not open input file " . $opts->{'file'};
137 binmode( $fh, ':utf8' );
dfc37e38 138 while( my $row = $csv->getline( $fh ) ) {
139 push( @$alignment_table, $row );
d9e873d0 140 }
dfc37e38 141 close $fh;
142 } else {
143 warn "Could not find string or file option to parse";
144 return;
d9e873d0 145 }
dfc37e38 146
d9e873d0 147 # Set up the witnesses we find in the first line
148 my @witnesses;
b0b4421a 149 my %ac_wits; # Track layered witness -> main witness mapping
d9e873d0 150 foreach my $sigil ( @{$alignment_table->[0]} ) {
151 my $wit = $tradition->add_witness( 'sigil' => $sigil );
152 $wit->path( [ $c->start ] );
153 push( @witnesses, $wit );
3b853983 154 my $aclabel = $c->ac_label;
155 if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) {
b0b4421a 156 $ac_wits{$sigil} = $1;
3b853983 157 }
d9e873d0 158 }
159
b0b4421a 160 # Save the original witness text sequences. Have to loop back through
161 # the witness columns after we have identified all the a.c. witnesses.
162 foreach my $idx ( 0 .. $#{$alignment_table->[0]} ) {
163 my @sequence = map { $_->[$idx] } @{$alignment_table};
164 my $sigil = shift @sequence;
165 my $is_layer = exists( $ac_wits{$sigil} );
166 my $wit = $tradition->witness( $is_layer ? $ac_wits{$sigil} : $sigil );
167 # Now get rid of gaps and meta-readings like #LACUNA#
168 my @words = grep { $_ && $_ !~ /^\#.*\#$/ } @sequence;
169 $is_layer ? $wit->layertext( \@words ) : $wit->text( \@words );
170 }
171
d9e873d0 172 # Now for the next rows, make nodes as necessary, assign their ranks, and
173 # add them to the witness paths.
d9e873d0 174 foreach my $idx ( 1 .. $#{$alignment_table} ) {
175 my $row = $alignment_table->[$idx];
176 my $nodes = make_nodes( $c, $row, $idx );
177 foreach my $w ( 0 .. $#{$row} ) {
178 # push the appropriate node onto the appropriate witness path
179 my $word = $row->[$w];
180 if( $word ) {
181 my $reading = $nodes->{$word};
182 my $wit = $witnesses[$w];
183 push( @{$wit->path}, $reading );
184 } # else skip it for empty readings.
185 }
186 }
187
eca16057 188 # Collapse our lacunae into a single node and
189 # push the end node onto all paths.
d9e873d0 190 $c->end->rank( scalar @$alignment_table );
191 foreach my $wit ( @witnesses ) {
eca16057 192 my $p = $wit->path;
193 my $last_rdg = shift @$p;
194 my $new_p = [ $last_rdg ];
195 foreach my $rdg ( @$p ) {
83d5ac3a 196 # Omit the reading if we are in a lacuna already.
197 next if $rdg->is_lacuna && $last_rdg->is_lacuna;
198 # Save the reading otherwise.
199 push( @$new_p, $rdg );
200 $last_rdg = $rdg;
eca16057 201 }
202 push( @$new_p, $c->end );
203 $wit->path( $new_p );
d9e873d0 204 }
205
3b853983 206 # Fold any a.c. witnesses into their main witness objects, and
207 # delete the independent a.c. versions.
208 foreach my $a ( keys %ac_wits ) {
b0b4421a 209 my $ac_wit = $tradition->witness( $a );
210 my $main_wit = $tradition->witness( $ac_wits{$a} );
3b853983 211 next unless $main_wit;
861c3e27 212 $main_wit->is_layered(1);
3b853983 213 $main_wit->uncorrected_path( $ac_wit->path );
214 $tradition->del_witness( $ac_wit );
215 }
83d5ac3a 216
d9e873d0 217 # Join up the paths.
218 $c->make_witness_paths;
83d5ac3a 219 # Delete our unused lacuna nodes.
220 foreach my $rdg ( grep { $_->is_lacuna } $c->readings ) {
221 $c->del_reading( $rdg ) unless $c->reading_witnesses( $rdg );
222 }
861c3e27 223
224 # Do a consistency check.
225 foreach my $wit ( $tradition->witnesses ) {
226 my $pathtext = $c->path_text( $wit->sigil );
227 my $origtext = join( ' ', @{$wit->text} );
228 warn "Text differs for witness " . $wit->sigil
229 unless $pathtext eq $origtext;
230 if( $wit->is_layered ) {
231 $pathtext = $c->path_text( $wit->sigil.$c->ac_label );
232 $origtext = join( ' ', @{$wit->layertext} );
233 warn "Ante-corr text differs for witness " . $wit->sigil
234 unless $pathtext eq $origtext;
235 } else {
236 warn "Text " . $wit->sigil . " has a layered text but is not marked as layered"
237 if $wit->has_layertext;
238 }
239 }
d9e873d0 240}
241
242sub make_nodes {
243 my( $collation, $row, $index ) = @_;
244 my %unique;
245 foreach my $w ( @$row ) {
246 $unique{$w} = 1 if $w;
247 }
248 my $ctr = 1;
249 foreach my $w ( keys %unique ) {
a753cc84 250 my $rargs = {
a753cc84 251 'id' => "$index,$ctr",
252 'rank' => $index,
253 'text' => $w,
254 };
9f213786 255 $rargs->{'is_lacuna'} = 1 if $w eq '#LACUNA#';
a753cc84 256 my $r = $collation->add_reading( $rargs );
d9e873d0 257 $unique{$w} = $r;
a753cc84 258 $ctr++;
d9e873d0 259 }
260 return \%unique;
261}
262
3b853983 2631;
264
265=head1 LICENSE
266
267This package is free software and is provided "as is" without express
268or implied warranty. You can redistribute it and/or modify it under
269the same terms as Perl itself.
270
271=head1 AUTHOR
272
273Tara L Andrews E<lt>aurum@cpan.orgE<gt>