1 package Text::Tradition::Parser::Tabular;
9 Text::Tradition::Parser::Tabular
15 my $t_from_file = Text::Tradition->new(
18 'file' => '/path/to/collation.csv',
22 my $t_from_string = Text::Tradition->new(
25 'string' => $tab_separated_collation,
31 Parser module for Text::Tradition to read an alignment table format, such as CSV.
35 =head2 B<parse>( $tradition, $option_hash )
37 Takes an initialized tradition and a set of options; creates the
38 appropriate nodes and edges on the graph, as well as the appropriate
39 witness objects. The $option_hash must contain either a 'file' or a
40 'string' argument with the table to be parsed; it may also contain a
41 'sep_char' argument to specify how the fields are separated.
43 The table should have witnesses arranged in columns, with the witness sigla
44 in the first row. Empty cells are interpreted as omissions (and thus
45 stemmatologically relevant.) Longer lacunae in the text, to be disregarded
46 in cladistic analysis, may be represented by filling the appropriate cells
47 with the tag '#LACUNA#'.
49 If a witness name ends in the collation's ac_label, it will be treated as
50 an 'ante-correction' version of the 'main' witness whose sigil it shares.
55 binmode STDOUT, ":utf8";
56 binmode STDERR, ":utf8";
57 eval { no warnings; binmode $DB::OUT, ":utf8"; };
59 my $csv = 't/data/florilegium.csv';
60 my $t = Text::Tradition->new(
67 is( ref( $t ), 'Text::Tradition', "Parsed florilegium CSV file" );
69 ### TODO Check these figures
71 is( scalar $t->collation->readings, 311, "Collation has all readings" );
72 is( scalar $t->collation->paths, 361, "Collation has all paths" );
73 is( scalar $t->witnesses, 13, "Collation has all witnesses" );
76 # Check that we have the right witnesses
78 map { $seen_wits{$_} = 0 } qw/ A B C D E F G H K P Q S T /;
79 foreach my $wit ( $t->witnesses ) {
80 $seen_wits{$wit->sigil} = 1;
82 is( scalar keys %seen_wits, 13, "No extra witnesses were made" );
83 foreach my $k ( keys %seen_wits ) {
84 ok( $seen_wits{$k}, "Witness $k still exists" );
87 # Check that the witnesses have the right texts
88 foreach my $wit ( $t->witnesses ) {
89 my $origtext = join( ' ', @{$wit->text} );
90 my $graphtext = $t->collation->path_text( $wit->sigil );
91 is( $graphtext, $origtext, "Collation matches original for witness " . $wit->sigil );
94 # Check that the a.c. witnesses have the right text
95 map { $seen_wits{$_} = 0 } qw/ A B C D F G H K S /;
96 foreach my $k ( keys %seen_wits ) {
97 my $wit = $t->witness( $k );
98 if( $seen_wits{$k} ) {
99 ok( $wit->is_layered, "Witness $k got marked as layered" );
100 ok( $wit->has_layertext, "Witness $k has an a.c. version" );
101 my $origtext = join( ' ', @{$wit->layertext} );
102 my $acsig = $wit->sigil . $t->collation->ac_label;
103 my $graphtext = $t->collation->path_text( $acsig );
104 is( $graphtext, $origtext, "Collation matches original a.c. for witness $k" );
106 ok( !$wit->is_layered, "Witness $k not marked as layered" );
107 ok( !$wit->has_layertext, "Witness $k has no a.c. version" );
111 # Check that we only have collation relationships where we need them
112 is( scalar $t->collation->relationships, 3, "Redundant collations were removed" );
119 my( $tradition, $opts ) = @_;
120 my $c = $tradition->collation; # shorthand
121 my $csv_options = { 'binary' => 1 };
122 $csv_options->{'sep_char'} = $opts->{'sep_char'} || "\t";
123 if( $csv_options->{'sep_char'} eq "\t" ) {
124 # If it is really tab separated, nothing is an escape char.
125 $csv_options->{'quote_char'} = undef;
126 $csv_options->{'escape_char'} = undef;
128 my $csv = Text::CSV->new( $csv_options );
131 if( exists $opts->{'string' } ) {
132 my @lines = split( "\n", $opts->{'string'} );
133 foreach my $l ( @lines ) {
134 my $status = $csv->parse( $l );
136 push( @$alignment_table, [ $csv->fields ] );
138 warn "Could not parse line $l: " . $csv->error_input;
141 } elsif( exists $opts->{'file'} ) {
142 open( my $fh, $opts->{'file'} )
143 or warn "Could not open input file " . $opts->{'file'};
144 binmode( $fh, ':utf8' );
145 while( my $row = $csv->getline( $fh ) ) {
146 push( @$alignment_table, $row );
150 warn "Could not find string or file option to parse";
154 # Set up the witnesses we find in the first line
156 my %ac_wits; # Track layered witness -> main witness mapping
157 my $aclabel = $c->ac_label;
158 foreach my $sigil ( @{$alignment_table->[0]} ) {
159 if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) {
160 # Sanitize the sigil name to an XML name
161 $sigil = $1 . '_layered';
162 $ac_wits{$sigil} = $1;
164 my $wit = $tradition->add_witness(
165 'sigil' => $sigil, 'sourcetype' => 'collation' );
166 $wit->path( [ $c->start ] );
167 push( @witnesses, $wit );
168 my $aclabel = $c->ac_label;
171 # Save the original witness text sequences. Have to loop back through
172 # the witness columns after we have identified all the a.c. witnesses.
173 foreach my $idx ( 0 .. $#{$alignment_table->[0]} ) {
174 my @sequence = map { $_->[$idx] } @{$alignment_table};
175 my $sigil = shift @sequence;
176 my $is_layer = exists( $ac_wits{$sigil} );
177 my $wit = $tradition->witness( $is_layer ? $ac_wits{$sigil} : $sigil );
178 # Now get rid of gaps and meta-readings like #LACUNA#
179 my @words = grep { $_ && $_ !~ /^\#.*\#$/ } @sequence;
180 $is_layer ? $wit->layertext( \@words ) : $wit->text( \@words );
183 my $nocollate = ( scalar( @witnesses ) * scalar @$alignment_table ) > 150000;
184 print STDERR "Tradition too big for row collation\n" if $nocollate;
186 # Now for the next rows, make nodes as necessary, assign their ranks, and
187 # add them to the witness paths.
188 foreach my $idx ( 1 .. $#{$alignment_table} ) {
189 my $row = $alignment_table->[$idx];
190 my $nodes = _make_nodes( $c, $row, $idx, $nocollate );
191 foreach my $w ( 0 .. $#{$row} ) {
192 # push the appropriate node onto the appropriate witness path
193 my $word = $row->[$w];
195 my $reading = $nodes->{$word};
196 my $wit = $witnesses[$w];
197 push( @{$wit->path}, $reading );
198 } # else skip it for empty readings.
202 # Collapse our lacunae into a single node and
203 # push the end node onto all paths.
204 $c->end->rank( scalar @$alignment_table );
205 foreach my $wit ( @witnesses ) {
207 my $last_rdg = shift @$p;
208 my $new_p = [ $last_rdg ];
209 foreach my $rdg ( @$p ) {
210 # Omit the reading if we are in a lacuna already.
211 next if $rdg->is_lacuna && $last_rdg->is_lacuna;
212 # Save the reading otherwise.
213 push( @$new_p, $rdg );
216 push( @$new_p, $c->end );
217 $wit->path( $new_p );
220 # Fold any a.c. witnesses into their main witness objects, and
221 # delete the independent a.c. versions.
222 foreach my $a ( keys %ac_wits ) {
223 my $ac_wit = $tradition->witness( $a );
224 my $main_wit = $tradition->witness( $ac_wits{$a} );
225 next unless $main_wit;
226 $main_wit->is_layered(1);
227 $main_wit->uncorrected_path( $ac_wit->path );
228 $tradition->del_witness( $ac_wit );
232 $c->make_witness_paths;
233 # Delete our unused lacuna nodes.
234 foreach my $rdg ( grep { $_->is_lacuna } $c->readings ) {
235 $c->del_reading( $rdg ) unless $c->reading_witnesses( $rdg );
238 # Do a consistency check.
239 foreach my $wit ( $tradition->witnesses ) {
240 my $pathtext = $c->path_text( $wit->sigil );
241 my $origtext = join( ' ', @{$wit->text} );
242 warn "Text differs for witness " . $wit->sigil
243 unless $pathtext eq $origtext;
244 if( $wit->is_layered ) {
245 $pathtext = $c->path_text( $wit->sigil.$c->ac_label );
246 $origtext = join( ' ', @{$wit->layertext} );
247 warn "Ante-corr text differs for witness " . $wit->sigil
248 unless $pathtext eq $origtext;
250 warn "Text " . $wit->sigil . " has a layered text but is not marked as layered"
251 if $wit->has_layertext;
255 # Note that our ranks and common readings are set.
256 $c->_graphcalc_done(1);
257 # Remove redundant collation relationships.
258 $c->relations->filter_collations() unless $nocollate;
262 my( $collation, $row, $index, $nocollate ) = @_;
264 my $commonctr = 0; # Holds the number of unique readings + gaps, ex. lacunae.
265 foreach my $w ( @$row ) {
266 $unique{$w} = 1 if $w;
267 $commonctr +=1 unless ( $w && $w eq '#LACUNA#' );
270 foreach my $w ( keys %unique ) {
272 'id' => "r$index.$ctr",
276 if( $w eq '#LACUNA#' ) {
277 $rargs->{'is_lacuna'} = 1;
278 } elsif( $commonctr == 1 ) {
279 $rargs->{'is_common'} = 1;
281 my $r = $collation->add_reading( $rargs );
285 # Collate this sequence of readings via a single 'collation' relationship.
286 unless( $nocollate ) {
287 my @rankrdgs = values %unique;
290 my $r = shift @rankrdgs;
292 foreach my $nr ( @rankrdgs ) {
293 next if $nr->is_meta;
294 if( $collation_rel ) {
295 $collation->add_relationship( $r, $nr, $collation_rel );
297 $collation->add_relationship( $r, $nr,
298 { 'type' => 'collated',
299 'annotation' => "Parsed together for rank $index" } );
300 $collation_rel = $collation->get_relationship( $r, $nr );
312 This package is free software and is provided "as is" without express
313 or implied warranty. You can redistribute it and/or modify it under
314 the same terms as Perl itself.
318 Tara L Andrews E<lt>aurum@cpan.orgE<gt>