fix graphml subgraph generation; fix tab parsing to only set 'collated' where needed...
[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / Tabular.pm
CommitLineData
d9e873d0 1package Text::Tradition::Parser::Tabular;
2
3use strict;
4use warnings;
82fa4d57 5use Text::CSV;
d9e873d0 6
7=head1 NAME
8
9Text::Tradition::Parser::Tabular
10
3b853983 11=head1 SYNOPSIS
12
13 use Text::Tradition;
14
15 my $t_from_file = Text::Tradition->new(
16 'name' => 'my text',
17 'input' => 'Tabular',
18 'file' => '/path/to/collation.csv',
19 'sep_char' => ','
20 );
21
22 my $t_from_string = Text::Tradition->new(
23 'name' => 'my text',
24 'input' => 'Tabular',
25 'string' => $tab_separated_collation,
26 'sep_char' => "\t",
27 );
28
d9e873d0 29=head1 DESCRIPTION
30
31Parser module for Text::Tradition to read an alignment table format, such as CSV.
32
33=head1 METHODS
34
e867486f 35=head2 B<parse>( $tradition, $option_hash )
3b853983 36
37Takes an initialized tradition and a set of options; creates the
38appropriate nodes and edges on the graph, as well as the appropriate
39witness objects. The $option_hash must contain either a 'file' or a
40'string' argument with the table to be parsed; it may also contain a
41'sep_char' argument to specify how the fields are separated.
42
43The table should have witnesses arranged in columns, with the witness sigla
44in the first row. Empty cells are interpreted as omissions (and thus
45stemmatologically relevant.) Longer lacunae in the text, to be disregarded
46in cladistic analysis, may be represented by filling the appropriate cells
47with the tag '#LACUNA#'.
48
49If a witness name ends in the collation's ac_label, it will be treated as
50an 'ante-correction' version of the 'main' witness whose sigil it shares.
51
52=begin testing
53
54use Text::Tradition;
55binmode STDOUT, ":utf8";
56binmode STDERR, ":utf8";
57eval { no warnings; binmode $DB::OUT, ":utf8"; };
58
59my $csv = 't/data/florilegium.csv';
60my $t = Text::Tradition->new(
61 'name' => 'inline',
62 'input' => 'Tabular',
63 'file' => $csv,
64 'sep_char' => ',',
65 );
d9e873d0 66
3b853983 67is( ref( $t ), 'Text::Tradition', "Parsed florilegium CSV file" );
d9e873d0 68
3b853983 69### TODO Check these figures
70if( $t ) {
0e47f4f6 71 is( scalar $t->collation->readings, 311, "Collation has all readings" );
72 is( scalar $t->collation->paths, 361, "Collation has all paths" );
3b853983 73 is( scalar $t->witnesses, 13, "Collation has all witnesses" );
74}
75
b0b4421a 76# Check that we have the right witnesses
77my %seen_wits;
78map { $seen_wits{$_} = 0 } qw/ A B C D E F G H K P Q S T /;
79foreach my $wit ( $t->witnesses ) {
80 $seen_wits{$wit->sigil} = 1;
81}
82is( scalar keys %seen_wits, 13, "No extra witnesses were made" );
83foreach my $k ( keys %seen_wits ) {
84 ok( $seen_wits{$k}, "Witness $k still exists" );
85}
86
87# Check that the witnesses have the right texts
88foreach my $wit ( $t->witnesses ) {
89 my $origtext = join( ' ', @{$wit->text} );
90 my $graphtext = $t->collation->path_text( $wit->sigil );
91 is( $graphtext, $origtext, "Collation matches original for witness " . $wit->sigil );
92}
93
94# Check that the a.c. witnesses have the right text
95map { $seen_wits{$_} = 0 } qw/ A B C D F G H K S /;
96foreach my $k ( keys %seen_wits ) {
97 my $wit = $t->witness( $k );
98 if( $seen_wits{$k} ) {
99 ok( $wit->is_layered, "Witness $k got marked as layered" );
100 ok( $wit->has_layertext, "Witness $k has an a.c. version" );
101 my $origtext = join( ' ', @{$wit->layertext} );
102 my $acsig = $wit->sigil . $t->collation->ac_label;
861c3e27 103 my $graphtext = $t->collation->path_text( $acsig );
b0b4421a 104 is( $graphtext, $origtext, "Collation matches original a.c. for witness $k" );
105 } else {
106 ok( !$wit->is_layered, "Witness $k not marked as layered" );
107 ok( !$wit->has_layertext, "Witness $k has no a.c. version" );
108 }
109}
110
cc31ebaa 111# Check that we only have collation relationships where we need them
112is( scalar $t->collation->relationships, 3, "Redundant collations were removed" );
113foreach my $rel ( $t->collation->relationships ) {
114 print STDERR $rel->[0] . " -> " . $rel->[1] . "\n";
115}
116
3b853983 117=end testing
d9e873d0 118
119=cut
120
121sub parse {
dfc37e38 122 my( $tradition, $opts ) = @_;
d9e873d0 123 my $c = $tradition->collation; # shorthand
97a52a67 124 my $csv_options = { 'binary' => 1 };
125 $csv_options->{'sep_char'} = $opts->{'sep_char'} || "\t";
126 if( $csv_options->{'sep_char'} eq "\t" ) {
127 # If it is really tab separated, nothing is an escape char.
128 $csv_options->{'quote_char'} = undef;
bba696c6 129 $csv_options->{'escape_char'} = undef;
97a52a67 130 }
82fa4d57 131 my $csv = Text::CSV->new( $csv_options );
dfc37e38 132
d9e873d0 133 my $alignment_table;
dfc37e38 134 if( exists $opts->{'string' } ) {
135 my @lines = split( "\n", $opts->{'string'} );
136 foreach my $l ( @lines ) {
137 my $status = $csv->parse( $l );
138 if( $status ) {
139 push( @$alignment_table, [ $csv->fields ] );
140 } else {
141 warn "Could not parse line $l: " . $csv->error_input;
142 }
143 }
144 } elsif( exists $opts->{'file'} ) {
bb11025b 145 open( my $fh, $opts->{'file'} )
146 or warn "Could not open input file " . $opts->{'file'};
147 binmode( $fh, ':utf8' );
dfc37e38 148 while( my $row = $csv->getline( $fh ) ) {
149 push( @$alignment_table, $row );
d9e873d0 150 }
dfc37e38 151 close $fh;
152 } else {
153 warn "Could not find string or file option to parse";
154 return;
d9e873d0 155 }
dfc37e38 156
d9e873d0 157 # Set up the witnesses we find in the first line
158 my @witnesses;
b0b4421a 159 my %ac_wits; # Track layered witness -> main witness mapping
82fa4d57 160 my $aclabel = $c->ac_label;
d9e873d0 161 foreach my $sigil ( @{$alignment_table->[0]} ) {
3b853983 162 if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) {
82fa4d57 163 # Sanitize the sigil name to an XML name
164 $sigil = $1 . '_layered';
b0b4421a 165 $ac_wits{$sigil} = $1;
3b853983 166 }
82fa4d57 167 my $wit = $tradition->add_witness(
168 'sigil' => $sigil, 'sourcetype' => 'collation' );
169 $wit->path( [ $c->start ] );
170 push( @witnesses, $wit );
171 my $aclabel = $c->ac_label;
d9e873d0 172 }
173
b0b4421a 174 # Save the original witness text sequences. Have to loop back through
175 # the witness columns after we have identified all the a.c. witnesses.
176 foreach my $idx ( 0 .. $#{$alignment_table->[0]} ) {
177 my @sequence = map { $_->[$idx] } @{$alignment_table};
178 my $sigil = shift @sequence;
179 my $is_layer = exists( $ac_wits{$sigil} );
180 my $wit = $tradition->witness( $is_layer ? $ac_wits{$sigil} : $sigil );
181 # Now get rid of gaps and meta-readings like #LACUNA#
182 my @words = grep { $_ && $_ !~ /^\#.*\#$/ } @sequence;
183 $is_layer ? $wit->layertext( \@words ) : $wit->text( \@words );
184 }
185
d9e873d0 186 # Now for the next rows, make nodes as necessary, assign their ranks, and
187 # add them to the witness paths.
d9e873d0 188 foreach my $idx ( 1 .. $#{$alignment_table} ) {
189 my $row = $alignment_table->[$idx];
027d819c 190 my $nodes = _make_nodes( $c, $row, $idx );
d9e873d0 191 foreach my $w ( 0 .. $#{$row} ) {
192 # push the appropriate node onto the appropriate witness path
193 my $word = $row->[$w];
194 if( $word ) {
195 my $reading = $nodes->{$word};
196 my $wit = $witnesses[$w];
197 push( @{$wit->path}, $reading );
198 } # else skip it for empty readings.
199 }
200 }
201
eca16057 202 # Collapse our lacunae into a single node and
203 # push the end node onto all paths.
d9e873d0 204 $c->end->rank( scalar @$alignment_table );
205 foreach my $wit ( @witnesses ) {
eca16057 206 my $p = $wit->path;
207 my $last_rdg = shift @$p;
208 my $new_p = [ $last_rdg ];
209 foreach my $rdg ( @$p ) {
83d5ac3a 210 # Omit the reading if we are in a lacuna already.
211 next if $rdg->is_lacuna && $last_rdg->is_lacuna;
212 # Save the reading otherwise.
213 push( @$new_p, $rdg );
214 $last_rdg = $rdg;
eca16057 215 }
216 push( @$new_p, $c->end );
217 $wit->path( $new_p );
d9e873d0 218 }
219
3b853983 220 # Fold any a.c. witnesses into their main witness objects, and
221 # delete the independent a.c. versions.
222 foreach my $a ( keys %ac_wits ) {
b0b4421a 223 my $ac_wit = $tradition->witness( $a );
224 my $main_wit = $tradition->witness( $ac_wits{$a} );
3b853983 225 next unless $main_wit;
861c3e27 226 $main_wit->is_layered(1);
3b853983 227 $main_wit->uncorrected_path( $ac_wit->path );
228 $tradition->del_witness( $ac_wit );
229 }
83d5ac3a 230
d9e873d0 231 # Join up the paths.
232 $c->make_witness_paths;
83d5ac3a 233 # Delete our unused lacuna nodes.
234 foreach my $rdg ( grep { $_->is_lacuna } $c->readings ) {
235 $c->del_reading( $rdg ) unless $c->reading_witnesses( $rdg );
236 }
861c3e27 237
238 # Do a consistency check.
239 foreach my $wit ( $tradition->witnesses ) {
240 my $pathtext = $c->path_text( $wit->sigil );
241 my $origtext = join( ' ', @{$wit->text} );
242 warn "Text differs for witness " . $wit->sigil
243 unless $pathtext eq $origtext;
244 if( $wit->is_layered ) {
245 $pathtext = $c->path_text( $wit->sigil.$c->ac_label );
246 $origtext = join( ' ', @{$wit->layertext} );
247 warn "Ante-corr text differs for witness " . $wit->sigil
248 unless $pathtext eq $origtext;
249 } else {
250 warn "Text " . $wit->sigil . " has a layered text but is not marked as layered"
251 if $wit->has_layertext;
252 }
253 }
202ccb18 254
255 # Note that our ranks and common readings are set.
256 $c->_graphcalc_done(1);
cc31ebaa 257 # Remove redundant collation relationships.
258 $c->relations->filter_collations();
d9e873d0 259}
260
027d819c 261sub _make_nodes {
d9e873d0 262 my( $collation, $row, $index ) = @_;
263 my %unique;
15db7774 264 my $commonctr = 0; # Holds the number of unique readings + gaps, ex. lacunae.
d9e873d0 265 foreach my $w ( @$row ) {
266 $unique{$w} = 1 if $w;
15db7774 267 $commonctr +=1 unless ( $w && $w eq '#LACUNA#' );
d9e873d0 268 }
269 my $ctr = 1;
270 foreach my $w ( keys %unique ) {
a753cc84 271 my $rargs = {
a753cc84 272 'id' => "$index,$ctr",
273 'rank' => $index,
274 'text' => $w,
275 };
15db7774 276 if( $w eq '#LACUNA#' ) {
277 $rargs->{'is_lacuna'} = 1;
278 } elsif( $commonctr == 1 ) {
279 $rargs->{'is_common'} = 1;
280 }
a753cc84 281 my $r = $collation->add_reading( $rargs );
d9e873d0 282 $unique{$w} = $r;
a753cc84 283 $ctr++;
d9e873d0 284 }
bf6e338d 285 # Collate this sequence of readings via a single 'collation' relationship.
286 my @rankrdgs = values %unique;
287 my $collation_rel;
288 while( @rankrdgs ) {
289 my $r = shift @rankrdgs;
290 next if $r->is_meta;
291 foreach my $nr ( @rankrdgs ) {
292 if( $collation_rel ) {
293 $collation->add_relationship( $r, $nr, $collation_rel );
294 } else {
295 $collation->add_relationship( $r, $nr,
296 { 'type' => 'collated',
297 'annotation' => "Parsed together for rank $index" } );
298 $collation_rel = $collation->get_relationship( $r, $nr );
299 }
300 }
301 }
302
d9e873d0 303 return \%unique;
304}
305
3b853983 3061;
307
308=head1 LICENSE
309
310This package is free software and is provided "as is" without express
311or implied warranty. You can redistribute it and/or modify it under
312the same terms as Perl itself.
313
314=head1 AUTHOR
315
316Tara L Andrews E<lt>aurum@cpan.orgE<gt>