use changed API for stemmadot specification
[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / Tabular.pm
CommitLineData
d9e873d0 1package Text::Tradition::Parser::Tabular;
2
3use strict;
4use warnings;
5use Text::CSV_XS;
6
7=head1 NAME
8
9Text::Tradition::Parser::Tabular
10
3b853983 11=head1 SYNOPSIS
12
13 use Text::Tradition;
14
15 my $t_from_file = Text::Tradition->new(
16 'name' => 'my text',
17 'input' => 'Tabular',
18 'file' => '/path/to/collation.csv',
19 'sep_char' => ','
20 );
21
22 my $t_from_string = Text::Tradition->new(
23 'name' => 'my text',
24 'input' => 'Tabular',
25 'string' => $tab_separated_collation,
26 'sep_char' => "\t",
27 );
28
d9e873d0 29=head1 DESCRIPTION
30
31Parser module for Text::Tradition to read an alignment table format, such as CSV.
32
33=head1 METHODS
34
e867486f 35=head2 B<parse>( $tradition, $option_hash )
3b853983 36
37Takes an initialized tradition and a set of options; creates the
38appropriate nodes and edges on the graph, as well as the appropriate
39witness objects. The $option_hash must contain either a 'file' or a
40'string' argument with the table to be parsed; it may also contain a
41'sep_char' argument to specify how the fields are separated.
42
43The table should have witnesses arranged in columns, with the witness sigla
44in the first row. Empty cells are interpreted as omissions (and thus
45stemmatologically relevant.) Longer lacunae in the text, to be disregarded
46in cladistic analysis, may be represented by filling the appropriate cells
47with the tag '#LACUNA#'.
48
49If a witness name ends in the collation's ac_label, it will be treated as
50an 'ante-correction' version of the 'main' witness whose sigil it shares.
51
52=begin testing
53
54use Text::Tradition;
55binmode STDOUT, ":utf8";
56binmode STDERR, ":utf8";
57eval { no warnings; binmode $DB::OUT, ":utf8"; };
58
59my $csv = 't/data/florilegium.csv';
60my $t = Text::Tradition->new(
61 'name' => 'inline',
62 'input' => 'Tabular',
63 'file' => $csv,
64 'sep_char' => ',',
65 );
d9e873d0 66
3b853983 67is( ref( $t ), 'Text::Tradition', "Parsed florilegium CSV file" );
d9e873d0 68
3b853983 69### TODO Check these figures
70if( $t ) {
0e47f4f6 71 is( scalar $t->collation->readings, 311, "Collation has all readings" );
72 is( scalar $t->collation->paths, 361, "Collation has all paths" );
3b853983 73 is( scalar $t->witnesses, 13, "Collation has all witnesses" );
74}
75
b0b4421a 76# Check that we have the right witnesses
77my %seen_wits;
78map { $seen_wits{$_} = 0 } qw/ A B C D E F G H K P Q S T /;
79foreach my $wit ( $t->witnesses ) {
80 $seen_wits{$wit->sigil} = 1;
81}
82is( scalar keys %seen_wits, 13, "No extra witnesses were made" );
83foreach my $k ( keys %seen_wits ) {
84 ok( $seen_wits{$k}, "Witness $k still exists" );
85}
86
87# Check that the witnesses have the right texts
88foreach my $wit ( $t->witnesses ) {
89 my $origtext = join( ' ', @{$wit->text} );
90 my $graphtext = $t->collation->path_text( $wit->sigil );
91 is( $graphtext, $origtext, "Collation matches original for witness " . $wit->sigil );
92}
93
94# Check that the a.c. witnesses have the right text
95map { $seen_wits{$_} = 0 } qw/ A B C D F G H K S /;
96foreach my $k ( keys %seen_wits ) {
97 my $wit = $t->witness( $k );
98 if( $seen_wits{$k} ) {
99 ok( $wit->is_layered, "Witness $k got marked as layered" );
100 ok( $wit->has_layertext, "Witness $k has an a.c. version" );
101 my $origtext = join( ' ', @{$wit->layertext} );
102 my $acsig = $wit->sigil . $t->collation->ac_label;
861c3e27 103 my $graphtext = $t->collation->path_text( $acsig );
b0b4421a 104 is( $graphtext, $origtext, "Collation matches original a.c. for witness $k" );
105 } else {
106 ok( !$wit->is_layered, "Witness $k not marked as layered" );
107 ok( !$wit->has_layertext, "Witness $k has no a.c. version" );
108 }
109}
110
3b853983 111=end testing
d9e873d0 112
113=cut
114
115sub parse {
dfc37e38 116 my( $tradition, $opts ) = @_;
d9e873d0 117 my $c = $tradition->collation; # shorthand
97a52a67 118 my $csv_options = { 'binary' => 1 };
119 $csv_options->{'sep_char'} = $opts->{'sep_char'} || "\t";
120 if( $csv_options->{'sep_char'} eq "\t" ) {
121 # If it is really tab separated, nothing is an escape char.
122 $csv_options->{'quote_char'} = undef;
123 }
124 my $csv = Text::CSV_XS->new( $csv_options );
dfc37e38 125
d9e873d0 126 my $alignment_table;
dfc37e38 127 if( exists $opts->{'string' } ) {
128 my @lines = split( "\n", $opts->{'string'} );
129 foreach my $l ( @lines ) {
130 my $status = $csv->parse( $l );
131 if( $status ) {
132 push( @$alignment_table, [ $csv->fields ] );
133 } else {
134 warn "Could not parse line $l: " . $csv->error_input;
135 }
136 }
137 } elsif( exists $opts->{'file'} ) {
bb11025b 138 open( my $fh, $opts->{'file'} )
139 or warn "Could not open input file " . $opts->{'file'};
140 binmode( $fh, ':utf8' );
dfc37e38 141 while( my $row = $csv->getline( $fh ) ) {
142 push( @$alignment_table, $row );
d9e873d0 143 }
dfc37e38 144 close $fh;
145 } else {
146 warn "Could not find string or file option to parse";
147 return;
d9e873d0 148 }
dfc37e38 149
d9e873d0 150 # Set up the witnesses we find in the first line
151 my @witnesses;
b0b4421a 152 my %ac_wits; # Track layered witness -> main witness mapping
d9e873d0 153 foreach my $sigil ( @{$alignment_table->[0]} ) {
154 my $wit = $tradition->add_witness( 'sigil' => $sigil );
155 $wit->path( [ $c->start ] );
156 push( @witnesses, $wit );
3b853983 157 my $aclabel = $c->ac_label;
158 if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) {
b0b4421a 159 $ac_wits{$sigil} = $1;
3b853983 160 }
d9e873d0 161 }
162
b0b4421a 163 # Save the original witness text sequences. Have to loop back through
164 # the witness columns after we have identified all the a.c. witnesses.
165 foreach my $idx ( 0 .. $#{$alignment_table->[0]} ) {
166 my @sequence = map { $_->[$idx] } @{$alignment_table};
167 my $sigil = shift @sequence;
168 my $is_layer = exists( $ac_wits{$sigil} );
169 my $wit = $tradition->witness( $is_layer ? $ac_wits{$sigil} : $sigil );
170 # Now get rid of gaps and meta-readings like #LACUNA#
171 my @words = grep { $_ && $_ !~ /^\#.*\#$/ } @sequence;
172 $is_layer ? $wit->layertext( \@words ) : $wit->text( \@words );
173 }
174
d9e873d0 175 # Now for the next rows, make nodes as necessary, assign their ranks, and
176 # add them to the witness paths.
d9e873d0 177 foreach my $idx ( 1 .. $#{$alignment_table} ) {
178 my $row = $alignment_table->[$idx];
179 my $nodes = make_nodes( $c, $row, $idx );
180 foreach my $w ( 0 .. $#{$row} ) {
181 # push the appropriate node onto the appropriate witness path
182 my $word = $row->[$w];
183 if( $word ) {
184 my $reading = $nodes->{$word};
185 my $wit = $witnesses[$w];
186 push( @{$wit->path}, $reading );
187 } # else skip it for empty readings.
188 }
189 }
190
eca16057 191 # Collapse our lacunae into a single node and
192 # push the end node onto all paths.
d9e873d0 193 $c->end->rank( scalar @$alignment_table );
194 foreach my $wit ( @witnesses ) {
eca16057 195 my $p = $wit->path;
196 my $last_rdg = shift @$p;
197 my $new_p = [ $last_rdg ];
198 foreach my $rdg ( @$p ) {
83d5ac3a 199 # Omit the reading if we are in a lacuna already.
200 next if $rdg->is_lacuna && $last_rdg->is_lacuna;
201 # Save the reading otherwise.
202 push( @$new_p, $rdg );
203 $last_rdg = $rdg;
eca16057 204 }
205 push( @$new_p, $c->end );
206 $wit->path( $new_p );
d9e873d0 207 }
208
3b853983 209 # Fold any a.c. witnesses into their main witness objects, and
210 # delete the independent a.c. versions.
211 foreach my $a ( keys %ac_wits ) {
b0b4421a 212 my $ac_wit = $tradition->witness( $a );
213 my $main_wit = $tradition->witness( $ac_wits{$a} );
3b853983 214 next unless $main_wit;
861c3e27 215 $main_wit->is_layered(1);
3b853983 216 $main_wit->uncorrected_path( $ac_wit->path );
217 $tradition->del_witness( $ac_wit );
218 }
83d5ac3a 219
d9e873d0 220 # Join up the paths.
221 $c->make_witness_paths;
83d5ac3a 222 # Delete our unused lacuna nodes.
223 foreach my $rdg ( grep { $_->is_lacuna } $c->readings ) {
224 $c->del_reading( $rdg ) unless $c->reading_witnesses( $rdg );
225 }
861c3e27 226
227 # Do a consistency check.
228 foreach my $wit ( $tradition->witnesses ) {
229 my $pathtext = $c->path_text( $wit->sigil );
230 my $origtext = join( ' ', @{$wit->text} );
231 warn "Text differs for witness " . $wit->sigil
232 unless $pathtext eq $origtext;
233 if( $wit->is_layered ) {
234 $pathtext = $c->path_text( $wit->sigil.$c->ac_label );
235 $origtext = join( ' ', @{$wit->layertext} );
236 warn "Ante-corr text differs for witness " . $wit->sigil
237 unless $pathtext eq $origtext;
238 } else {
239 warn "Text " . $wit->sigil . " has a layered text but is not marked as layered"
240 if $wit->has_layertext;
241 }
242 }
d9e873d0 243}
244
245sub make_nodes {
246 my( $collation, $row, $index ) = @_;
247 my %unique;
248 foreach my $w ( @$row ) {
249 $unique{$w} = 1 if $w;
250 }
251 my $ctr = 1;
252 foreach my $w ( keys %unique ) {
a753cc84 253 my $rargs = {
a753cc84 254 'id' => "$index,$ctr",
255 'rank' => $index,
256 'text' => $w,
257 };
9f213786 258 $rargs->{'is_lacuna'} = 1 if $w eq '#LACUNA#';
a753cc84 259 my $r = $collation->add_reading( $rargs );
d9e873d0 260 $unique{$w} = $r;
a753cc84 261 $ctr++;
d9e873d0 262 }
263 return \%unique;
264}
265
3b853983 2661;
267
268=head1 LICENSE
269
270This package is free software and is provided "as is" without express
271or implied warranty. You can redistribute it and/or modify it under
272the same terms as Perl itself.
273
274=head1 AUTHOR
275
276Tara L Andrews E<lt>aurum@cpan.orgE<gt>