1 package Text::Tradition::Parser::Tabular;
9 Text::Tradition::Parser::Tabular
15 my $t_from_file = Text::Tradition->new(
18 'file' => '/path/to/collation.csv',
22 my $t_from_string = Text::Tradition->new(
25 'string' => $tab_separated_collation,
31 Parser module for Text::Tradition to read an alignment table format, such as CSV.
35 =head2 B<parse>( $tradition, $option_hash )
37 Takes an initialized tradition and a set of options; creates the
38 appropriate nodes and edges on the graph, as well as the appropriate
39 witness objects. The $option_hash must contain either a 'file' or a
40 'string' argument with the table to be parsed; it may also contain a
41 'sep_char' argument to specify how the fields are separated.
43 The table should have witnesses arranged in columns, with the witness sigla
44 in the first row. Empty cells are interpreted as omissions (and thus
45 stemmatologically relevant.) Longer lacunae in the text, to be disregarded
46 in cladistic analysis, may be represented by filling the appropriate cells
47 with the tag '#LACUNA#'.
49 If a witness name ends in the collation's ac_label, it will be treated as
50 an 'ante-correction' version of the 'main' witness whose sigil it shares.
55 binmode STDOUT, ":utf8";
56 binmode STDERR, ":utf8";
57 eval { no warnings; binmode $DB::OUT, ":utf8"; };
59 my $csv = 't/data/florilegium.csv';
60 my $t = Text::Tradition->new(
67 is( ref( $t ), 'Text::Tradition', "Parsed florilegium CSV file" );
69 ### TODO Check these figures
71 is( scalar $t->collation->readings, 311, "Collation has all readings" );
72 is( scalar $t->collation->paths, 361, "Collation has all paths" );
73 is( scalar $t->witnesses, 13, "Collation has all witnesses" );
76 # Check that we have the right witnesses
78 map { $seen_wits{$_} = 0 } qw/ A B C D E F G H K P Q S T /;
79 foreach my $wit ( $t->witnesses ) {
80 $seen_wits{$wit->sigil} = 1;
82 is( scalar keys %seen_wits, 13, "No extra witnesses were made" );
83 foreach my $k ( keys %seen_wits ) {
84 ok( $seen_wits{$k}, "Witness $k still exists" );
87 # Check that the witnesses have the right texts
88 foreach my $wit ( $t->witnesses ) {
89 my $origtext = join( ' ', @{$wit->text} );
90 my $graphtext = $t->collation->path_text( $wit->sigil );
91 is( $graphtext, $origtext, "Collation matches original for witness " . $wit->sigil );
94 # Check that the a.c. witnesses have the right text
95 map { $seen_wits{$_} = 0 } qw/ A B C D F G H K S /;
96 foreach my $k ( keys %seen_wits ) {
97 my $wit = $t->witness( $k );
98 if( $seen_wits{$k} ) {
99 ok( $wit->is_layered, "Witness $k got marked as layered" );
100 ok( $wit->has_layertext, "Witness $k has an a.c. version" );
101 my $origtext = join( ' ', @{$wit->layertext} );
102 my $acsig = $wit->sigil . $t->collation->ac_label;
103 my $graphtext = $t->collation->path_text( $acsig );
104 is( $graphtext, $origtext, "Collation matches original a.c. for witness $k" );
106 ok( !$wit->is_layered, "Witness $k not marked as layered" );
107 ok( !$wit->has_layertext, "Witness $k has no a.c. version" );
116 my( $tradition, $opts ) = @_;
117 my $c = $tradition->collation; # shorthand
118 my $csv_options = { 'binary' => 1 };
119 $csv_options->{'sep_char'} = $opts->{'sep_char'} || "\t";
120 if( $csv_options->{'sep_char'} eq "\t" ) {
121 # If it is really tab separated, nothing is an escape char.
122 $csv_options->{'quote_char'} = undef;
123 $csv_options->{'escape_char'} = undef;
125 my $csv = Text::CSV->new( $csv_options );
128 if( exists $opts->{'string' } ) {
129 my @lines = split( "\n", $opts->{'string'} );
130 foreach my $l ( @lines ) {
131 my $status = $csv->parse( $l );
133 push( @$alignment_table, [ $csv->fields ] );
135 warn "Could not parse line $l: " . $csv->error_input;
138 } elsif( exists $opts->{'file'} ) {
139 open( my $fh, $opts->{'file'} )
140 or warn "Could not open input file " . $opts->{'file'};
141 binmode( $fh, ':utf8' );
142 while( my $row = $csv->getline( $fh ) ) {
143 push( @$alignment_table, $row );
147 warn "Could not find string or file option to parse";
151 # Set up the witnesses we find in the first line
153 my %ac_wits; # Track layered witness -> main witness mapping
154 my $aclabel = $c->ac_label;
155 foreach my $sigil ( @{$alignment_table->[0]} ) {
156 if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) {
157 # Sanitize the sigil name to an XML name
158 $sigil = $1 . '_layered';
159 $ac_wits{$sigil} = $1;
161 my $wit = $tradition->add_witness(
162 'sigil' => $sigil, 'sourcetype' => 'collation' );
163 $wit->path( [ $c->start ] );
164 push( @witnesses, $wit );
165 my $aclabel = $c->ac_label;
168 # Save the original witness text sequences. Have to loop back through
169 # the witness columns after we have identified all the a.c. witnesses.
170 foreach my $idx ( 0 .. $#{$alignment_table->[0]} ) {
171 my @sequence = map { $_->[$idx] } @{$alignment_table};
172 my $sigil = shift @sequence;
173 my $is_layer = exists( $ac_wits{$sigil} );
174 my $wit = $tradition->witness( $is_layer ? $ac_wits{$sigil} : $sigil );
175 # Now get rid of gaps and meta-readings like #LACUNA#
176 my @words = grep { $_ && $_ !~ /^\#.*\#$/ } @sequence;
177 $is_layer ? $wit->layertext( \@words ) : $wit->text( \@words );
180 # Now for the next rows, make nodes as necessary, assign their ranks, and
181 # add them to the witness paths.
182 foreach my $idx ( 1 .. $#{$alignment_table} ) {
183 my $row = $alignment_table->[$idx];
184 my $nodes = _make_nodes( $c, $row, $idx );
185 foreach my $w ( 0 .. $#{$row} ) {
186 # push the appropriate node onto the appropriate witness path
187 my $word = $row->[$w];
189 my $reading = $nodes->{$word};
190 my $wit = $witnesses[$w];
191 push( @{$wit->path}, $reading );
192 } # else skip it for empty readings.
196 # Collapse our lacunae into a single node and
197 # push the end node onto all paths.
198 $c->end->rank( scalar @$alignment_table );
199 foreach my $wit ( @witnesses ) {
201 my $last_rdg = shift @$p;
202 my $new_p = [ $last_rdg ];
203 foreach my $rdg ( @$p ) {
204 # Omit the reading if we are in a lacuna already.
205 next if $rdg->is_lacuna && $last_rdg->is_lacuna;
206 # Save the reading otherwise.
207 push( @$new_p, $rdg );
210 push( @$new_p, $c->end );
211 $wit->path( $new_p );
214 # Fold any a.c. witnesses into their main witness objects, and
215 # delete the independent a.c. versions.
216 foreach my $a ( keys %ac_wits ) {
217 my $ac_wit = $tradition->witness( $a );
218 my $main_wit = $tradition->witness( $ac_wits{$a} );
219 next unless $main_wit;
220 $main_wit->is_layered(1);
221 $main_wit->uncorrected_path( $ac_wit->path );
222 $tradition->del_witness( $ac_wit );
226 $c->make_witness_paths;
227 # Delete our unused lacuna nodes.
228 foreach my $rdg ( grep { $_->is_lacuna } $c->readings ) {
229 $c->del_reading( $rdg ) unless $c->reading_witnesses( $rdg );
232 # Do a consistency check.
233 foreach my $wit ( $tradition->witnesses ) {
234 my $pathtext = $c->path_text( $wit->sigil );
235 my $origtext = join( ' ', @{$wit->text} );
236 warn "Text differs for witness " . $wit->sigil
237 unless $pathtext eq $origtext;
238 if( $wit->is_layered ) {
239 $pathtext = $c->path_text( $wit->sigil.$c->ac_label );
240 $origtext = join( ' ', @{$wit->layertext} );
241 warn "Ante-corr text differs for witness " . $wit->sigil
242 unless $pathtext eq $origtext;
244 warn "Text " . $wit->sigil . " has a layered text but is not marked as layered"
245 if $wit->has_layertext;
249 # Note that our ranks and common readings are set.
250 $c->_graphcalc_done(1);
254 my( $collation, $row, $index ) = @_;
256 my $commonctr = 0; # Holds the number of unique readings + gaps, ex. lacunae.
257 foreach my $w ( @$row ) {
258 $unique{$w} = 1 if $w;
259 $commonctr +=1 unless ( $w && $w eq '#LACUNA#' );
262 foreach my $w ( keys %unique ) {
264 'id' => "$index,$ctr",
268 if( $w eq '#LACUNA#' ) {
269 $rargs->{'is_lacuna'} = 1;
270 } elsif( $commonctr == 1 ) {
271 $rargs->{'is_common'} = 1;
273 my $r = $collation->add_reading( $rargs );
277 # Collate this sequence of readings via a single 'collation' relationship.
278 my @rankrdgs = values %unique;
281 my $r = shift @rankrdgs;
283 foreach my $nr ( @rankrdgs ) {
284 if( $collation_rel ) {
285 $collation->add_relationship( $r, $nr, $collation_rel );
287 $collation->add_relationship( $r, $nr,
288 { 'type' => 'collated',
289 'annotation' => "Parsed together for rank $index" } );
290 $collation_rel = $collation->get_relationship( $r, $nr );
302 This package is free software and is provided "as is" without express
303 or implied warranty. You can redistribute it and/or modify it under
304 the same terms as Perl itself.
308 Tara L Andrews E<lt>aurum@cpan.orgE<gt>