move around some doc/testing logic
[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / Tabular.pm
CommitLineData
d9e873d0 1package Text::Tradition::Parser::Tabular;
2
3use strict;
4use warnings;
5use Text::CSV_XS;
6
7=head1 NAME
8
9Text::Tradition::Parser::Tabular
10
3b853983 11=head1 SYNOPSIS
12
13 use Text::Tradition;
14
15 my $t_from_file = Text::Tradition->new(
16 'name' => 'my text',
17 'input' => 'Tabular',
18 'file' => '/path/to/collation.csv',
19 'sep_char' => ','
20 );
21
22 my $t_from_string = Text::Tradition->new(
23 'name' => 'my text',
24 'input' => 'Tabular',
25 'string' => $tab_separated_collation,
26 'sep_char' => "\t",
27 );
28
d9e873d0 29=head1 DESCRIPTION
30
31Parser module for Text::Tradition to read an alignment table format, such as CSV.
32
33=head1 METHODS
34
35=over
36
3b853983 37=item B<parse>( $tradition, $option_hash )
38
39Takes an initialized tradition and a set of options; creates the
40appropriate nodes and edges on the graph, as well as the appropriate
41witness objects. The $option_hash must contain either a 'file' or a
42'string' argument with the table to be parsed; it may also contain a
43'sep_char' argument to specify how the fields are separated.
44
45The table should have witnesses arranged in columns, with the witness sigla
46in the first row. Empty cells are interpreted as omissions (and thus
47stemmatologically relevant.) Longer lacunae in the text, to be disregarded
48in cladistic analysis, may be represented by filling the appropriate cells
49with the tag '#LACUNA#'.
50
51If a witness name ends in the collation's ac_label, it will be treated as
52an 'ante-correction' version of the 'main' witness whose sigil it shares.
53
54=begin testing
55
56use Text::Tradition;
57binmode STDOUT, ":utf8";
58binmode STDERR, ":utf8";
59eval { no warnings; binmode $DB::OUT, ":utf8"; };
60
61my $csv = 't/data/florilegium.csv';
62my $t = Text::Tradition->new(
63 'name' => 'inline',
64 'input' => 'Tabular',
65 'file' => $csv,
66 'sep_char' => ',',
67 );
d9e873d0 68
3b853983 69is( ref( $t ), 'Text::Tradition', "Parsed florilegium CSV file" );
d9e873d0 70
3b853983 71### TODO Check these figures
72if( $t ) {
73 is( scalar $t->collation->readings, 313, "Collation has all readings" );
74 is( scalar $t->collation->paths, 2877, "Collation has all paths" );
75 is( scalar $t->witnesses, 13, "Collation has all witnesses" );
76}
77
78=end testing
d9e873d0 79
80=cut
81
82sub parse {
dfc37e38 83 my( $tradition, $opts ) = @_;
d9e873d0 84 my $c = $tradition->collation; # shorthand
dfc37e38 85 my $csv = Text::CSV_XS->new( {
86 binary => 1, # binary for UTF-8
87 sep_char => exists $opts->{'sep_char'} ? $opts->{'sep_char'} : "\t" }
88 );
dfc37e38 89
d9e873d0 90 my $alignment_table;
dfc37e38 91 if( exists $opts->{'string' } ) {
92 my @lines = split( "\n", $opts->{'string'} );
93 foreach my $l ( @lines ) {
94 my $status = $csv->parse( $l );
95 if( $status ) {
96 push( @$alignment_table, [ $csv->fields ] );
97 } else {
98 warn "Could not parse line $l: " . $csv->error_input;
99 }
100 }
101 } elsif( exists $opts->{'file'} ) {
102 open( my $fh, $opts->{'file'} ) or die "Could not open input file " . $opts->{'file'};
103 while( my $row = $csv->getline( $fh ) ) {
104 push( @$alignment_table, $row );
d9e873d0 105 }
dfc37e38 106 close $fh;
107 } else {
108 warn "Could not find string or file option to parse";
109 return;
d9e873d0 110 }
dfc37e38 111
d9e873d0 112 # Set up the witnesses we find in the first line
113 my @witnesses;
3b853983 114 my %ac_wits; # Track these for later removal
d9e873d0 115 foreach my $sigil ( @{$alignment_table->[0]} ) {
116 my $wit = $tradition->add_witness( 'sigil' => $sigil );
117 $wit->path( [ $c->start ] );
118 push( @witnesses, $wit );
3b853983 119 my $aclabel = $c->ac_label;
120 if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) {
121 $ac_wits{$1} = $wit;
122 }
d9e873d0 123 }
124
125 # Now for the next rows, make nodes as necessary, assign their ranks, and
126 # add them to the witness paths.
d9e873d0 127 foreach my $idx ( 1 .. $#{$alignment_table} ) {
128 my $row = $alignment_table->[$idx];
129 my $nodes = make_nodes( $c, $row, $idx );
130 foreach my $w ( 0 .. $#{$row} ) {
131 # push the appropriate node onto the appropriate witness path
132 my $word = $row->[$w];
133 if( $word ) {
134 my $reading = $nodes->{$word};
135 my $wit = $witnesses[$w];
136 push( @{$wit->path}, $reading );
137 } # else skip it for empty readings.
138 }
139 }
140
eca16057 141
142 # Collapse our lacunae into a single node and
143 # push the end node onto all paths.
d9e873d0 144 $c->end->rank( scalar @$alignment_table );
145 foreach my $wit ( @witnesses ) {
eca16057 146 my $p = $wit->path;
147 my $last_rdg = shift @$p;
148 my $new_p = [ $last_rdg ];
149 foreach my $rdg ( @$p ) {
150 if( $rdg->text eq '#LACUNA#' ) {
151 # If we are in a lacuna already, drop this node.
152 # Otherwise make a lacuna node and drop this node.
153 unless( $last_rdg->is_lacuna ) {
154 my $l = $c->add_lacuna( $rdg->name );
155 $l->rank( $rdg->rank );
156 push( @$new_p, $l );
157 $last_rdg = $l;
158 }
159 $c->del_reading( $rdg );
160 } else {
161 # No lacuna, save the reading.
162 push( @$new_p, $rdg );
163 }
164 }
165 push( @$new_p, $c->end );
166 $wit->path( $new_p );
d9e873d0 167 }
168
3b853983 169 # Fold any a.c. witnesses into their main witness objects, and
170 # delete the independent a.c. versions.
171 foreach my $a ( keys %ac_wits ) {
172 my $main_wit = $tradition->witness( $a );
173 next unless $main_wit;
174 my $ac_wit = $ac_wits{$a};
175 $main_wit->uncorrected_path( $ac_wit->path );
176 $tradition->del_witness( $ac_wit );
177 }
178
d9e873d0 179 # Join up the paths.
180 $c->make_witness_paths;
d9e873d0 181}
182
183sub make_nodes {
184 my( $collation, $row, $index ) = @_;
185 my %unique;
186 foreach my $w ( @$row ) {
187 $unique{$w} = 1 if $w;
188 }
189 my $ctr = 1;
190 foreach my $w ( keys %unique ) {
191 my $r = $collation->add_reading( "$index,$ctr" );
192 $ctr++;
193 $r->rank( $index );
194 $r->text( $w );
195 $unique{$w} = $r;
196 }
197 return \%unique;
198}
199
3b853983 2001;
201
202=head1 LICENSE
203
204This package is free software and is provided "as is" without express
205or implied warranty. You can redistribute it and/or modify it under
206the same terms as Perl itself.
207
208=head1 AUTHOR
209
210Tara L Andrews E<lt>aurum@cpan.orgE<gt>