1 package Text::Tradition::Parser::Tabular;
9 Text::Tradition::Parser::Tabular
15 my $t_from_file = Text::Tradition->new(
18 'file' => '/path/to/collation.csv',
22 my $t_from_string = Text::Tradition->new(
25 'string' => $tab_separated_collation,
31 Parser module for Text::Tradition to read an alignment table format, such as CSV.
35 =head2 B<parse>( $tradition, $option_hash )
37 Takes an initialized tradition and a set of options; creates the
38 appropriate nodes and edges on the graph, as well as the appropriate
39 witness objects. The $option_hash must contain either a 'file' or a
40 'string' argument with the table to be parsed; it may also contain a
41 'sep_char' argument to specify how the fields are separated.
43 The table should have witnesses arranged in columns, with the witness sigla
44 in the first row. Empty cells are interpreted as omissions (and thus
45 stemmatologically relevant.) Longer lacunae in the text, to be disregarded
46 in cladistic analysis, may be represented by filling the appropriate cells
47 with the tag '#LACUNA#'.
49 If a witness name ends in the collation's ac_label, it will be treated as
50 an 'ante-correction' version of the 'main' witness whose sigil it shares.
55 binmode STDOUT, ":utf8";
56 binmode STDERR, ":utf8";
57 eval { no warnings; binmode $DB::OUT, ":utf8"; };
59 my $csv = 't/data/florilegium.csv';
60 my $t = Text::Tradition->new(
67 is( ref( $t ), 'Text::Tradition', "Parsed florilegium CSV file" );
69 ### TODO Check these figures
71 is( scalar $t->collation->readings, 311, "Collation has all readings" );
72 is( scalar $t->collation->paths, 361, "Collation has all paths" );
73 is( scalar $t->witnesses, 13, "Collation has all witnesses" );
76 # Check that we have the right witnesses
78 map { $seen_wits{$_} = 0 } qw/ A B C D E F G H K P Q S T /;
79 foreach my $wit ( $t->witnesses ) {
80 $seen_wits{$wit->sigil} = 1;
82 is( scalar keys %seen_wits, 13, "No extra witnesses were made" );
83 foreach my $k ( keys %seen_wits ) {
84 ok( $seen_wits{$k}, "Witness $k still exists" );
87 # Check that the witnesses have the right texts
88 foreach my $wit ( $t->witnesses ) {
89 my $origtext = join( ' ', @{$wit->text} );
90 my $graphtext = $t->collation->path_text( $wit->sigil );
91 is( $graphtext, $origtext, "Collation matches original for witness " . $wit->sigil );
94 # Check that the a.c. witnesses have the right text
95 map { $seen_wits{$_} = 0 } qw/ A B C D F G H K S /;
96 foreach my $k ( keys %seen_wits ) {
97 my $wit = $t->witness( $k );
98 if( $seen_wits{$k} ) {
99 ok( $wit->is_layered, "Witness $k got marked as layered" );
100 ok( $wit->has_layertext, "Witness $k has an a.c. version" );
101 my $origtext = join( ' ', @{$wit->layertext} );
102 my $acsig = $wit->sigil . $t->collation->ac_label;
103 my $graphtext = $t->collation->path_text( $acsig, $wit->sigil );
104 is( $graphtext, $origtext, "Collation matches original a.c. for witness $k" );
106 ok( !$wit->is_layered, "Witness $k not marked as layered" );
107 ok( !$wit->has_layertext, "Witness $k has no a.c. version" );
116 my( $tradition, $opts ) = @_;
117 my $c = $tradition->collation; # shorthand
118 my $csv = Text::CSV_XS->new( {
119 binary => 1, # binary for UTF-8
120 sep_char => exists $opts->{'sep_char'} ? $opts->{'sep_char'} : "\t" }
124 if( exists $opts->{'string' } ) {
125 my @lines = split( "\n", $opts->{'string'} );
126 foreach my $l ( @lines ) {
127 my $status = $csv->parse( $l );
129 push( @$alignment_table, [ $csv->fields ] );
131 warn "Could not parse line $l: " . $csv->error_input;
134 } elsif( exists $opts->{'file'} ) {
135 open( my $fh, $opts->{'file'} )
136 or warn "Could not open input file " . $opts->{'file'};
137 binmode( $fh, ':utf8' );
138 while( my $row = $csv->getline( $fh ) ) {
139 push( @$alignment_table, $row );
143 warn "Could not find string or file option to parse";
147 # Set up the witnesses we find in the first line
149 my %ac_wits; # Track layered witness -> main witness mapping
150 foreach my $sigil ( @{$alignment_table->[0]} ) {
151 my $wit = $tradition->add_witness( 'sigil' => $sigil );
152 $wit->path( [ $c->start ] );
153 push( @witnesses, $wit );
154 my $aclabel = $c->ac_label;
155 if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) {
156 $ac_wits{$sigil} = $1;
160 # Save the original witness text sequences. Have to loop back through
161 # the witness columns after we have identified all the a.c. witnesses.
162 foreach my $idx ( 0 .. $#{$alignment_table->[0]} ) {
163 my @sequence = map { $_->[$idx] } @{$alignment_table};
164 my $sigil = shift @sequence;
165 my $is_layer = exists( $ac_wits{$sigil} );
166 my $wit = $tradition->witness( $is_layer ? $ac_wits{$sigil} : $sigil );
167 # Now get rid of gaps and meta-readings like #LACUNA#
168 my @words = grep { $_ && $_ !~ /^\#.*\#$/ } @sequence;
169 $is_layer ? $wit->layertext( \@words ) : $wit->text( \@words );
172 # Now for the next rows, make nodes as necessary, assign their ranks, and
173 # add them to the witness paths.
174 foreach my $idx ( 1 .. $#{$alignment_table} ) {
175 my $row = $alignment_table->[$idx];
176 my $nodes = make_nodes( $c, $row, $idx );
177 foreach my $w ( 0 .. $#{$row} ) {
178 # push the appropriate node onto the appropriate witness path
179 my $word = $row->[$w];
181 my $reading = $nodes->{$word};
182 my $wit = $witnesses[$w];
183 push( @{$wit->path}, $reading );
184 } # else skip it for empty readings.
188 # Collapse our lacunae into a single node and
189 # push the end node onto all paths.
190 $c->end->rank( scalar @$alignment_table );
191 foreach my $wit ( @witnesses ) {
193 my $last_rdg = shift @$p;
194 my $new_p = [ $last_rdg ];
195 foreach my $rdg ( @$p ) {
196 # Omit the reading if we are in a lacuna already.
197 next if $rdg->is_lacuna && $last_rdg->is_lacuna;
198 # Save the reading otherwise.
199 push( @$new_p, $rdg );
202 push( @$new_p, $c->end );
203 $wit->path( $new_p );
206 # Fold any a.c. witnesses into their main witness objects, and
207 # delete the independent a.c. versions.
208 foreach my $a ( keys %ac_wits ) {
209 my $ac_wit = $tradition->witness( $a );
210 my $main_wit = $tradition->witness( $ac_wits{$a} );
211 next unless $main_wit;
212 $main_wit->uncorrected_path( $ac_wit->path );
213 $tradition->del_witness( $ac_wit );
217 $c->make_witness_paths;
218 # Delete our unused lacuna nodes.
219 foreach my $rdg ( grep { $_->is_lacuna } $c->readings ) {
220 $c->del_reading( $rdg ) unless $c->reading_witnesses( $rdg );
225 my( $collation, $row, $index ) = @_;
227 foreach my $w ( @$row ) {
228 $unique{$w} = 1 if $w;
231 foreach my $w ( keys %unique ) {
233 'id' => "$index,$ctr",
237 $rargs->{'is_lacuna'} = 1 if $w eq '#LACUNA#';
238 my $r = $collation->add_reading( $rargs );
249 This package is free software and is provided "as is" without express
250 or implied warranty. You can redistribute it and/or modify it under
251 the same terms as Perl itself.
255 Tara L Andrews E<lt>aurum@cpan.orgE<gt>