integrate updates from shared lemmatizer repo
[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / Tabular.pm
CommitLineData
d9e873d0 1package Text::Tradition::Parser::Tabular;
2
3use strict;
4use warnings;
82fa4d57 5use Text::CSV;
d9e873d0 6
7=head1 NAME
8
9Text::Tradition::Parser::Tabular
10
3b853983 11=head1 SYNOPSIS
12
13 use Text::Tradition;
14
15 my $t_from_file = Text::Tradition->new(
16 'name' => 'my text',
17 'input' => 'Tabular',
18 'file' => '/path/to/collation.csv',
19 'sep_char' => ','
20 );
21
22 my $t_from_string = Text::Tradition->new(
23 'name' => 'my text',
24 'input' => 'Tabular',
25 'string' => $tab_separated_collation,
26 'sep_char' => "\t",
27 );
28
d9e873d0 29=head1 DESCRIPTION
30
31Parser module for Text::Tradition to read an alignment table format, such as CSV.
32
33=head1 METHODS
34
e867486f 35=head2 B<parse>( $tradition, $option_hash )
3b853983 36
37Takes an initialized tradition and a set of options; creates the
38appropriate nodes and edges on the graph, as well as the appropriate
39witness objects. The $option_hash must contain either a 'file' or a
40'string' argument with the table to be parsed; it may also contain a
41'sep_char' argument to specify how the fields are separated.
42
43The table should have witnesses arranged in columns, with the witness sigla
44in the first row. Empty cells are interpreted as omissions (and thus
45stemmatologically relevant.) Longer lacunae in the text, to be disregarded
46in cladistic analysis, may be represented by filling the appropriate cells
47with the tag '#LACUNA#'.
48
49If a witness name ends in the collation's ac_label, it will be treated as
50an 'ante-correction' version of the 'main' witness whose sigil it shares.
51
52=begin testing
53
54use Text::Tradition;
55binmode STDOUT, ":utf8";
56binmode STDERR, ":utf8";
57eval { no warnings; binmode $DB::OUT, ":utf8"; };
58
59my $csv = 't/data/florilegium.csv';
60my $t = Text::Tradition->new(
61 'name' => 'inline',
62 'input' => 'Tabular',
63 'file' => $csv,
64 'sep_char' => ',',
65 );
d9e873d0 66
3b853983 67is( ref( $t ), 'Text::Tradition', "Parsed florilegium CSV file" );
d9e873d0 68
3b853983 69### TODO Check these figures
70if( $t ) {
0e47f4f6 71 is( scalar $t->collation->readings, 311, "Collation has all readings" );
72 is( scalar $t->collation->paths, 361, "Collation has all paths" );
3b853983 73 is( scalar $t->witnesses, 13, "Collation has all witnesses" );
74}
75
b0b4421a 76# Check that we have the right witnesses
77my %seen_wits;
78map { $seen_wits{$_} = 0 } qw/ A B C D E F G H K P Q S T /;
79foreach my $wit ( $t->witnesses ) {
80 $seen_wits{$wit->sigil} = 1;
81}
82is( scalar keys %seen_wits, 13, "No extra witnesses were made" );
83foreach my $k ( keys %seen_wits ) {
84 ok( $seen_wits{$k}, "Witness $k still exists" );
85}
86
87# Check that the witnesses have the right texts
88foreach my $wit ( $t->witnesses ) {
89 my $origtext = join( ' ', @{$wit->text} );
90 my $graphtext = $t->collation->path_text( $wit->sigil );
91 is( $graphtext, $origtext, "Collation matches original for witness " . $wit->sigil );
92}
93
94# Check that the a.c. witnesses have the right text
95map { $seen_wits{$_} = 0 } qw/ A B C D F G H K S /;
96foreach my $k ( keys %seen_wits ) {
97 my $wit = $t->witness( $k );
98 if( $seen_wits{$k} ) {
99 ok( $wit->is_layered, "Witness $k got marked as layered" );
100 ok( $wit->has_layertext, "Witness $k has an a.c. version" );
101 my $origtext = join( ' ', @{$wit->layertext} );
102 my $acsig = $wit->sigil . $t->collation->ac_label;
861c3e27 103 my $graphtext = $t->collation->path_text( $acsig );
b0b4421a 104 is( $graphtext, $origtext, "Collation matches original a.c. for witness $k" );
105 } else {
106 ok( !$wit->is_layered, "Witness $k not marked as layered" );
107 ok( !$wit->has_layertext, "Witness $k has no a.c. version" );
108 }
109}
110
cc31ebaa 111# Check that we only have collation relationships where we need them
112is( scalar $t->collation->relationships, 3, "Redundant collations were removed" );
cc31ebaa 113
3b853983 114=end testing
d9e873d0 115
116=cut
117
118sub parse {
dfc37e38 119 my( $tradition, $opts ) = @_;
d9e873d0 120 my $c = $tradition->collation; # shorthand
97a52a67 121 my $csv_options = { 'binary' => 1 };
122 $csv_options->{'sep_char'} = $opts->{'sep_char'} || "\t";
123 if( $csv_options->{'sep_char'} eq "\t" ) {
124 # If it is really tab separated, nothing is an escape char.
125 $csv_options->{'quote_char'} = undef;
bba696c6 126 $csv_options->{'escape_char'} = undef;
97a52a67 127 }
82fa4d57 128 my $csv = Text::CSV->new( $csv_options );
dfc37e38 129
d9e873d0 130 my $alignment_table;
dfc37e38 131 if( exists $opts->{'string' } ) {
132 my @lines = split( "\n", $opts->{'string'} );
133 foreach my $l ( @lines ) {
134 my $status = $csv->parse( $l );
135 if( $status ) {
136 push( @$alignment_table, [ $csv->fields ] );
137 } else {
138 warn "Could not parse line $l: " . $csv->error_input;
139 }
140 }
141 } elsif( exists $opts->{'file'} ) {
bb11025b 142 open( my $fh, $opts->{'file'} )
143 or warn "Could not open input file " . $opts->{'file'};
144 binmode( $fh, ':utf8' );
dfc37e38 145 while( my $row = $csv->getline( $fh ) ) {
146 push( @$alignment_table, $row );
d9e873d0 147 }
dfc37e38 148 close $fh;
149 } else {
150 warn "Could not find string or file option to parse";
151 return;
d9e873d0 152 }
dfc37e38 153
d9e873d0 154 # Set up the witnesses we find in the first line
155 my @witnesses;
b0b4421a 156 my %ac_wits; # Track layered witness -> main witness mapping
82fa4d57 157 my $aclabel = $c->ac_label;
d9e873d0 158 foreach my $sigil ( @{$alignment_table->[0]} ) {
3b853983 159 if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) {
82fa4d57 160 # Sanitize the sigil name to an XML name
161 $sigil = $1 . '_layered';
b0b4421a 162 $ac_wits{$sigil} = $1;
3b853983 163 }
82fa4d57 164 my $wit = $tradition->add_witness(
165 'sigil' => $sigil, 'sourcetype' => 'collation' );
166 $wit->path( [ $c->start ] );
167 push( @witnesses, $wit );
168 my $aclabel = $c->ac_label;
d9e873d0 169 }
170
b0b4421a 171 # Save the original witness text sequences. Have to loop back through
172 # the witness columns after we have identified all the a.c. witnesses.
173 foreach my $idx ( 0 .. $#{$alignment_table->[0]} ) {
174 my @sequence = map { $_->[$idx] } @{$alignment_table};
175 my $sigil = shift @sequence;
176 my $is_layer = exists( $ac_wits{$sigil} );
177 my $wit = $tradition->witness( $is_layer ? $ac_wits{$sigil} : $sigil );
178 # Now get rid of gaps and meta-readings like #LACUNA#
179 my @words = grep { $_ && $_ !~ /^\#.*\#$/ } @sequence;
180 $is_layer ? $wit->layertext( \@words ) : $wit->text( \@words );
181 }
182
d9e873d0 183 # Now for the next rows, make nodes as necessary, assign their ranks, and
184 # add them to the witness paths.
d9e873d0 185 foreach my $idx ( 1 .. $#{$alignment_table} ) {
186 my $row = $alignment_table->[$idx];
027d819c 187 my $nodes = _make_nodes( $c, $row, $idx );
d9e873d0 188 foreach my $w ( 0 .. $#{$row} ) {
189 # push the appropriate node onto the appropriate witness path
190 my $word = $row->[$w];
191 if( $word ) {
192 my $reading = $nodes->{$word};
193 my $wit = $witnesses[$w];
194 push( @{$wit->path}, $reading );
195 } # else skip it for empty readings.
196 }
197 }
198
eca16057 199 # Collapse our lacunae into a single node and
200 # push the end node onto all paths.
d9e873d0 201 $c->end->rank( scalar @$alignment_table );
202 foreach my $wit ( @witnesses ) {
eca16057 203 my $p = $wit->path;
204 my $last_rdg = shift @$p;
205 my $new_p = [ $last_rdg ];
206 foreach my $rdg ( @$p ) {
83d5ac3a 207 # Omit the reading if we are in a lacuna already.
208 next if $rdg->is_lacuna && $last_rdg->is_lacuna;
209 # Save the reading otherwise.
210 push( @$new_p, $rdg );
211 $last_rdg = $rdg;
eca16057 212 }
213 push( @$new_p, $c->end );
214 $wit->path( $new_p );
d9e873d0 215 }
216
3b853983 217 # Fold any a.c. witnesses into their main witness objects, and
218 # delete the independent a.c. versions.
219 foreach my $a ( keys %ac_wits ) {
b0b4421a 220 my $ac_wit = $tradition->witness( $a );
221 my $main_wit = $tradition->witness( $ac_wits{$a} );
3b853983 222 next unless $main_wit;
861c3e27 223 $main_wit->is_layered(1);
3b853983 224 $main_wit->uncorrected_path( $ac_wit->path );
225 $tradition->del_witness( $ac_wit );
226 }
83d5ac3a 227
d9e873d0 228 # Join up the paths.
229 $c->make_witness_paths;
83d5ac3a 230 # Delete our unused lacuna nodes.
231 foreach my $rdg ( grep { $_->is_lacuna } $c->readings ) {
232 $c->del_reading( $rdg ) unless $c->reading_witnesses( $rdg );
233 }
861c3e27 234
235 # Do a consistency check.
236 foreach my $wit ( $tradition->witnesses ) {
237 my $pathtext = $c->path_text( $wit->sigil );
238 my $origtext = join( ' ', @{$wit->text} );
239 warn "Text differs for witness " . $wit->sigil
240 unless $pathtext eq $origtext;
241 if( $wit->is_layered ) {
242 $pathtext = $c->path_text( $wit->sigil.$c->ac_label );
243 $origtext = join( ' ', @{$wit->layertext} );
244 warn "Ante-corr text differs for witness " . $wit->sigil
245 unless $pathtext eq $origtext;
246 } else {
247 warn "Text " . $wit->sigil . " has a layered text but is not marked as layered"
248 if $wit->has_layertext;
249 }
250 }
202ccb18 251
252 # Note that our ranks and common readings are set.
253 $c->_graphcalc_done(1);
cc31ebaa 254 # Remove redundant collation relationships.
255 $c->relations->filter_collations();
d9e873d0 256}
257
027d819c 258sub _make_nodes {
d9e873d0 259 my( $collation, $row, $index ) = @_;
260 my %unique;
15db7774 261 my $commonctr = 0; # Holds the number of unique readings + gaps, ex. lacunae.
d9e873d0 262 foreach my $w ( @$row ) {
263 $unique{$w} = 1 if $w;
15db7774 264 $commonctr +=1 unless ( $w && $w eq '#LACUNA#' );
d9e873d0 265 }
266 my $ctr = 1;
267 foreach my $w ( keys %unique ) {
a753cc84 268 my $rargs = {
a753cc84 269 'id' => "$index,$ctr",
270 'rank' => $index,
271 'text' => $w,
272 };
15db7774 273 if( $w eq '#LACUNA#' ) {
274 $rargs->{'is_lacuna'} = 1;
275 } elsif( $commonctr == 1 ) {
276 $rargs->{'is_common'} = 1;
277 }
a753cc84 278 my $r = $collation->add_reading( $rargs );
d9e873d0 279 $unique{$w} = $r;
a753cc84 280 $ctr++;
d9e873d0 281 }
bf6e338d 282 # Collate this sequence of readings via a single 'collation' relationship.
283 my @rankrdgs = values %unique;
284 my $collation_rel;
285 while( @rankrdgs ) {
286 my $r = shift @rankrdgs;
287 next if $r->is_meta;
288 foreach my $nr ( @rankrdgs ) {
2889d396 289 next if $nr->is_meta;
bf6e338d 290 if( $collation_rel ) {
291 $collation->add_relationship( $r, $nr, $collation_rel );
292 } else {
293 $collation->add_relationship( $r, $nr,
294 { 'type' => 'collated',
295 'annotation' => "Parsed together for rank $index" } );
296 $collation_rel = $collation->get_relationship( $r, $nr );
297 }
298 }
299 }
300
d9e873d0 301 return \%unique;
302}
303
3b853983 3041;
305
306=head1 LICENSE
307
308This package is free software and is provided "as is" without express
309or implied warranty. You can redistribute it and/or modify it under
310the same terms as Perl itself.
311
312=head1 AUTHOR
313
314Tara L Andrews E<lt>aurum@cpan.orgE<gt>