make the rest of the tests work with the new Witness
[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / Tabular.pm
CommitLineData
d9e873d0 1package Text::Tradition::Parser::Tabular;
2
3use strict;
4use warnings;
82fa4d57 5use Text::CSV;
d9e873d0 6
7=head1 NAME
8
9Text::Tradition::Parser::Tabular
10
3b853983 11=head1 SYNOPSIS
12
13 use Text::Tradition;
14
15 my $t_from_file = Text::Tradition->new(
16 'name' => 'my text',
17 'input' => 'Tabular',
18 'file' => '/path/to/collation.csv',
19 'sep_char' => ','
20 );
21
22 my $t_from_string = Text::Tradition->new(
23 'name' => 'my text',
24 'input' => 'Tabular',
25 'string' => $tab_separated_collation,
26 'sep_char' => "\t",
27 );
28
d9e873d0 29=head1 DESCRIPTION
30
31Parser module for Text::Tradition to read an alignment table format, such as CSV.
32
33=head1 METHODS
34
e867486f 35=head2 B<parse>( $tradition, $option_hash )
3b853983 36
37Takes an initialized tradition and a set of options; creates the
38appropriate nodes and edges on the graph, as well as the appropriate
39witness objects. The $option_hash must contain either a 'file' or a
40'string' argument with the table to be parsed; it may also contain a
41'sep_char' argument to specify how the fields are separated.
42
43The table should have witnesses arranged in columns, with the witness sigla
44in the first row. Empty cells are interpreted as omissions (and thus
45stemmatologically relevant.) Longer lacunae in the text, to be disregarded
46in cladistic analysis, may be represented by filling the appropriate cells
47with the tag '#LACUNA#'.
48
49If a witness name ends in the collation's ac_label, it will be treated as
50an 'ante-correction' version of the 'main' witness whose sigil it shares.
51
52=begin testing
53
54use Text::Tradition;
55binmode STDOUT, ":utf8";
56binmode STDERR, ":utf8";
57eval { no warnings; binmode $DB::OUT, ":utf8"; };
58
59my $csv = 't/data/florilegium.csv';
60my $t = Text::Tradition->new(
61 'name' => 'inline',
62 'input' => 'Tabular',
63 'file' => $csv,
64 'sep_char' => ',',
65 );
d9e873d0 66
3b853983 67is( ref( $t ), 'Text::Tradition', "Parsed florilegium CSV file" );
d9e873d0 68
3b853983 69### TODO Check these figures
70if( $t ) {
0e47f4f6 71 is( scalar $t->collation->readings, 311, "Collation has all readings" );
72 is( scalar $t->collation->paths, 361, "Collation has all paths" );
3b853983 73 is( scalar $t->witnesses, 13, "Collation has all witnesses" );
74}
75
b0b4421a 76# Check that we have the right witnesses
77my %seen_wits;
78map { $seen_wits{$_} = 0 } qw/ A B C D E F G H K P Q S T /;
79foreach my $wit ( $t->witnesses ) {
80 $seen_wits{$wit->sigil} = 1;
81}
82is( scalar keys %seen_wits, 13, "No extra witnesses were made" );
83foreach my $k ( keys %seen_wits ) {
84 ok( $seen_wits{$k}, "Witness $k still exists" );
85}
86
87# Check that the witnesses have the right texts
88foreach my $wit ( $t->witnesses ) {
89 my $origtext = join( ' ', @{$wit->text} );
90 my $graphtext = $t->collation->path_text( $wit->sigil );
91 is( $graphtext, $origtext, "Collation matches original for witness " . $wit->sigil );
92}
93
94# Check that the a.c. witnesses have the right text
95map { $seen_wits{$_} = 0 } qw/ A B C D F G H K S /;
96foreach my $k ( keys %seen_wits ) {
97 my $wit = $t->witness( $k );
98 if( $seen_wits{$k} ) {
99 ok( $wit->is_layered, "Witness $k got marked as layered" );
100 ok( $wit->has_layertext, "Witness $k has an a.c. version" );
101 my $origtext = join( ' ', @{$wit->layertext} );
102 my $acsig = $wit->sigil . $t->collation->ac_label;
861c3e27 103 my $graphtext = $t->collation->path_text( $acsig );
b0b4421a 104 is( $graphtext, $origtext, "Collation matches original a.c. for witness $k" );
105 } else {
106 ok( !$wit->is_layered, "Witness $k not marked as layered" );
107 ok( !$wit->has_layertext, "Witness $k has no a.c. version" );
108 }
109}
110
3b853983 111=end testing
d9e873d0 112
113=cut
114
115sub parse {
dfc37e38 116 my( $tradition, $opts ) = @_;
d9e873d0 117 my $c = $tradition->collation; # shorthand
97a52a67 118 my $csv_options = { 'binary' => 1 };
119 $csv_options->{'sep_char'} = $opts->{'sep_char'} || "\t";
120 if( $csv_options->{'sep_char'} eq "\t" ) {
121 # If it is really tab separated, nothing is an escape char.
122 $csv_options->{'quote_char'} = undef;
bba696c6 123 $csv_options->{'escape_char'} = undef;
97a52a67 124 }
82fa4d57 125 my $csv = Text::CSV->new( $csv_options );
dfc37e38 126
d9e873d0 127 my $alignment_table;
dfc37e38 128 if( exists $opts->{'string' } ) {
129 my @lines = split( "\n", $opts->{'string'} );
130 foreach my $l ( @lines ) {
131 my $status = $csv->parse( $l );
132 if( $status ) {
133 push( @$alignment_table, [ $csv->fields ] );
134 } else {
135 warn "Could not parse line $l: " . $csv->error_input;
136 }
137 }
138 } elsif( exists $opts->{'file'} ) {
bb11025b 139 open( my $fh, $opts->{'file'} )
140 or warn "Could not open input file " . $opts->{'file'};
141 binmode( $fh, ':utf8' );
dfc37e38 142 while( my $row = $csv->getline( $fh ) ) {
143 push( @$alignment_table, $row );
d9e873d0 144 }
dfc37e38 145 close $fh;
146 } else {
147 warn "Could not find string or file option to parse";
148 return;
d9e873d0 149 }
dfc37e38 150
d9e873d0 151 # Set up the witnesses we find in the first line
152 my @witnesses;
b0b4421a 153 my %ac_wits; # Track layered witness -> main witness mapping
82fa4d57 154 my $aclabel = $c->ac_label;
d9e873d0 155 foreach my $sigil ( @{$alignment_table->[0]} ) {
3b853983 156 if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) {
82fa4d57 157 # Sanitize the sigil name to an XML name
158 $sigil = $1 . '_layered';
b0b4421a 159 $ac_wits{$sigil} = $1;
3b853983 160 }
82fa4d57 161 my $wit = $tradition->add_witness(
162 'sigil' => $sigil, 'sourcetype' => 'collation' );
163 $wit->path( [ $c->start ] );
164 push( @witnesses, $wit );
165 my $aclabel = $c->ac_label;
d9e873d0 166 }
167
b0b4421a 168 # Save the original witness text sequences. Have to loop back through
169 # the witness columns after we have identified all the a.c. witnesses.
170 foreach my $idx ( 0 .. $#{$alignment_table->[0]} ) {
171 my @sequence = map { $_->[$idx] } @{$alignment_table};
172 my $sigil = shift @sequence;
173 my $is_layer = exists( $ac_wits{$sigil} );
174 my $wit = $tradition->witness( $is_layer ? $ac_wits{$sigil} : $sigil );
175 # Now get rid of gaps and meta-readings like #LACUNA#
176 my @words = grep { $_ && $_ !~ /^\#.*\#$/ } @sequence;
177 $is_layer ? $wit->layertext( \@words ) : $wit->text( \@words );
178 }
179
d9e873d0 180 # Now for the next rows, make nodes as necessary, assign their ranks, and
181 # add them to the witness paths.
d9e873d0 182 foreach my $idx ( 1 .. $#{$alignment_table} ) {
183 my $row = $alignment_table->[$idx];
027d819c 184 my $nodes = _make_nodes( $c, $row, $idx );
d9e873d0 185 foreach my $w ( 0 .. $#{$row} ) {
186 # push the appropriate node onto the appropriate witness path
187 my $word = $row->[$w];
188 if( $word ) {
189 my $reading = $nodes->{$word};
190 my $wit = $witnesses[$w];
191 push( @{$wit->path}, $reading );
192 } # else skip it for empty readings.
193 }
194 }
195
eca16057 196 # Collapse our lacunae into a single node and
197 # push the end node onto all paths.
d9e873d0 198 $c->end->rank( scalar @$alignment_table );
199 foreach my $wit ( @witnesses ) {
eca16057 200 my $p = $wit->path;
201 my $last_rdg = shift @$p;
202 my $new_p = [ $last_rdg ];
203 foreach my $rdg ( @$p ) {
83d5ac3a 204 # Omit the reading if we are in a lacuna already.
205 next if $rdg->is_lacuna && $last_rdg->is_lacuna;
206 # Save the reading otherwise.
207 push( @$new_p, $rdg );
208 $last_rdg = $rdg;
eca16057 209 }
210 push( @$new_p, $c->end );
211 $wit->path( $new_p );
d9e873d0 212 }
213
3b853983 214 # Fold any a.c. witnesses into their main witness objects, and
215 # delete the independent a.c. versions.
216 foreach my $a ( keys %ac_wits ) {
b0b4421a 217 my $ac_wit = $tradition->witness( $a );
218 my $main_wit = $tradition->witness( $ac_wits{$a} );
3b853983 219 next unless $main_wit;
861c3e27 220 $main_wit->is_layered(1);
3b853983 221 $main_wit->uncorrected_path( $ac_wit->path );
222 $tradition->del_witness( $ac_wit );
223 }
83d5ac3a 224
d9e873d0 225 # Join up the paths.
226 $c->make_witness_paths;
83d5ac3a 227 # Delete our unused lacuna nodes.
228 foreach my $rdg ( grep { $_->is_lacuna } $c->readings ) {
229 $c->del_reading( $rdg ) unless $c->reading_witnesses( $rdg );
230 }
861c3e27 231
232 # Do a consistency check.
233 foreach my $wit ( $tradition->witnesses ) {
234 my $pathtext = $c->path_text( $wit->sigil );
235 my $origtext = join( ' ', @{$wit->text} );
236 warn "Text differs for witness " . $wit->sigil
237 unless $pathtext eq $origtext;
238 if( $wit->is_layered ) {
239 $pathtext = $c->path_text( $wit->sigil.$c->ac_label );
240 $origtext = join( ' ', @{$wit->layertext} );
241 warn "Ante-corr text differs for witness " . $wit->sigil
242 unless $pathtext eq $origtext;
243 } else {
244 warn "Text " . $wit->sigil . " has a layered text but is not marked as layered"
245 if $wit->has_layertext;
246 }
247 }
202ccb18 248
249 # Note that our ranks and common readings are set.
250 $c->_graphcalc_done(1);
d9e873d0 251}
252
027d819c 253sub _make_nodes {
d9e873d0 254 my( $collation, $row, $index ) = @_;
255 my %unique;
15db7774 256 my $commonctr = 0; # Holds the number of unique readings + gaps, ex. lacunae.
d9e873d0 257 foreach my $w ( @$row ) {
258 $unique{$w} = 1 if $w;
15db7774 259 $commonctr +=1 unless ( $w && $w eq '#LACUNA#' );
d9e873d0 260 }
261 my $ctr = 1;
262 foreach my $w ( keys %unique ) {
a753cc84 263 my $rargs = {
a753cc84 264 'id' => "$index,$ctr",
265 'rank' => $index,
266 'text' => $w,
267 };
15db7774 268 if( $w eq '#LACUNA#' ) {
269 $rargs->{'is_lacuna'} = 1;
270 } elsif( $commonctr == 1 ) {
271 $rargs->{'is_common'} = 1;
272 }
a753cc84 273 my $r = $collation->add_reading( $rargs );
d9e873d0 274 $unique{$w} = $r;
a753cc84 275 $ctr++;
d9e873d0 276 }
bf6e338d 277 # Collate this sequence of readings via a single 'collation' relationship.
278 my @rankrdgs = values %unique;
279 my $collation_rel;
280 while( @rankrdgs ) {
281 my $r = shift @rankrdgs;
282 next if $r->is_meta;
283 foreach my $nr ( @rankrdgs ) {
284 if( $collation_rel ) {
285 $collation->add_relationship( $r, $nr, $collation_rel );
286 } else {
287 $collation->add_relationship( $r, $nr,
288 { 'type' => 'collated',
289 'annotation' => "Parsed together for rank $index" } );
290 $collation_rel = $collation->get_relationship( $r, $nr );
291 }
292 }
293 }
294
d9e873d0 295 return \%unique;
296}
297
3b853983 2981;
299
300=head1 LICENSE
301
302This package is free software and is provided "as is" without express
303or implied warranty. You can redistribute it and/or modify it under
304the same terms as Perl itself.
305
306=head1 AUTHOR
307
308Tara L Andrews E<lt>aurum@cpan.orgE<gt>