continued doc and testing drive; rationalize GraphML a little
[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / Tabular.pm
CommitLineData
d9e873d0 1package Text::Tradition::Parser::Tabular;
2
3use strict;
4use warnings;
5use Text::CSV_XS;
6
7=head1 NAME
8
9Text::Tradition::Parser::Tabular
10
3b853983 11=head1 SYNOPSIS
12
13 use Text::Tradition;
14
15 my $t_from_file = Text::Tradition->new(
16 'name' => 'my text',
17 'input' => 'Tabular',
18 'file' => '/path/to/collation.csv',
19 'sep_char' => ','
20 );
21
22 my $t_from_string = Text::Tradition->new(
23 'name' => 'my text',
24 'input' => 'Tabular',
25 'string' => $tab_separated_collation,
26 'sep_char' => "\t",
27 );
28
d9e873d0 29=head1 DESCRIPTION
30
31Parser module for Text::Tradition to read an alignment table format, such as CSV.
32
33=head1 METHODS
34
e867486f 35=head2 B<parse>( $tradition, $option_hash )
3b853983 36
37Takes an initialized tradition and a set of options; creates the
38appropriate nodes and edges on the graph, as well as the appropriate
39witness objects. The $option_hash must contain either a 'file' or a
40'string' argument with the table to be parsed; it may also contain a
41'sep_char' argument to specify how the fields are separated.
42
43The table should have witnesses arranged in columns, with the witness sigla
44in the first row. Empty cells are interpreted as omissions (and thus
45stemmatologically relevant.) Longer lacunae in the text, to be disregarded
46in cladistic analysis, may be represented by filling the appropriate cells
47with the tag '#LACUNA#'.
48
49If a witness name ends in the collation's ac_label, it will be treated as
50an 'ante-correction' version of the 'main' witness whose sigil it shares.
51
52=begin testing
53
54use Text::Tradition;
55binmode STDOUT, ":utf8";
56binmode STDERR, ":utf8";
57eval { no warnings; binmode $DB::OUT, ":utf8"; };
58
59my $csv = 't/data/florilegium.csv';
60my $t = Text::Tradition->new(
61 'name' => 'inline',
62 'input' => 'Tabular',
63 'file' => $csv,
64 'sep_char' => ',',
65 );
d9e873d0 66
3b853983 67is( ref( $t ), 'Text::Tradition', "Parsed florilegium CSV file" );
d9e873d0 68
3b853983 69### TODO Check these figures
70if( $t ) {
71 is( scalar $t->collation->readings, 313, "Collation has all readings" );
72 is( scalar $t->collation->paths, 2877, "Collation has all paths" );
73 is( scalar $t->witnesses, 13, "Collation has all witnesses" );
74}
75
76=end testing
d9e873d0 77
78=cut
79
80sub parse {
dfc37e38 81 my( $tradition, $opts ) = @_;
d9e873d0 82 my $c = $tradition->collation; # shorthand
dfc37e38 83 my $csv = Text::CSV_XS->new( {
84 binary => 1, # binary for UTF-8
85 sep_char => exists $opts->{'sep_char'} ? $opts->{'sep_char'} : "\t" }
86 );
dfc37e38 87
d9e873d0 88 my $alignment_table;
dfc37e38 89 if( exists $opts->{'string' } ) {
90 my @lines = split( "\n", $opts->{'string'} );
91 foreach my $l ( @lines ) {
92 my $status = $csv->parse( $l );
93 if( $status ) {
94 push( @$alignment_table, [ $csv->fields ] );
95 } else {
96 warn "Could not parse line $l: " . $csv->error_input;
97 }
98 }
99 } elsif( exists $opts->{'file'} ) {
100 open( my $fh, $opts->{'file'} ) or die "Could not open input file " . $opts->{'file'};
101 while( my $row = $csv->getline( $fh ) ) {
102 push( @$alignment_table, $row );
d9e873d0 103 }
dfc37e38 104 close $fh;
105 } else {
106 warn "Could not find string or file option to parse";
107 return;
d9e873d0 108 }
dfc37e38 109
d9e873d0 110 # Set up the witnesses we find in the first line
111 my @witnesses;
3b853983 112 my %ac_wits; # Track these for later removal
d9e873d0 113 foreach my $sigil ( @{$alignment_table->[0]} ) {
114 my $wit = $tradition->add_witness( 'sigil' => $sigil );
115 $wit->path( [ $c->start ] );
116 push( @witnesses, $wit );
3b853983 117 my $aclabel = $c->ac_label;
118 if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) {
119 $ac_wits{$1} = $wit;
120 }
d9e873d0 121 }
122
123 # Now for the next rows, make nodes as necessary, assign their ranks, and
124 # add them to the witness paths.
d9e873d0 125 foreach my $idx ( 1 .. $#{$alignment_table} ) {
126 my $row = $alignment_table->[$idx];
127 my $nodes = make_nodes( $c, $row, $idx );
128 foreach my $w ( 0 .. $#{$row} ) {
129 # push the appropriate node onto the appropriate witness path
130 my $word = $row->[$w];
131 if( $word ) {
132 my $reading = $nodes->{$word};
133 my $wit = $witnesses[$w];
134 push( @{$wit->path}, $reading );
135 } # else skip it for empty readings.
136 }
137 }
138
eca16057 139
140 # Collapse our lacunae into a single node and
141 # push the end node onto all paths.
d9e873d0 142 $c->end->rank( scalar @$alignment_table );
143 foreach my $wit ( @witnesses ) {
eca16057 144 my $p = $wit->path;
145 my $last_rdg = shift @$p;
146 my $new_p = [ $last_rdg ];
147 foreach my $rdg ( @$p ) {
148 if( $rdg->text eq '#LACUNA#' ) {
149 # If we are in a lacuna already, drop this node.
150 # Otherwise make a lacuna node and drop this node.
151 unless( $last_rdg->is_lacuna ) {
152 my $l = $c->add_lacuna( $rdg->name );
153 $l->rank( $rdg->rank );
154 push( @$new_p, $l );
155 $last_rdg = $l;
156 }
157 $c->del_reading( $rdg );
158 } else {
159 # No lacuna, save the reading.
160 push( @$new_p, $rdg );
161 }
162 }
163 push( @$new_p, $c->end );
164 $wit->path( $new_p );
d9e873d0 165 }
166
3b853983 167 # Fold any a.c. witnesses into their main witness objects, and
168 # delete the independent a.c. versions.
169 foreach my $a ( keys %ac_wits ) {
170 my $main_wit = $tradition->witness( $a );
171 next unless $main_wit;
172 my $ac_wit = $ac_wits{$a};
173 $main_wit->uncorrected_path( $ac_wit->path );
174 $tradition->del_witness( $ac_wit );
175 }
176
d9e873d0 177 # Join up the paths.
178 $c->make_witness_paths;
d9e873d0 179}
180
181sub make_nodes {
182 my( $collation, $row, $index ) = @_;
183 my %unique;
184 foreach my $w ( @$row ) {
185 $unique{$w} = 1 if $w;
186 }
187 my $ctr = 1;
188 foreach my $w ( keys %unique ) {
189 my $r = $collation->add_reading( "$index,$ctr" );
190 $ctr++;
191 $r->rank( $index );
192 $r->text( $w );
193 $unique{$w} = $r;
194 }
195 return \%unique;
196}
197
3b853983 1981;
199
200=head1 LICENSE
201
202This package is free software and is provided "as is" without express
203or implied warranty. You can redistribute it and/or modify it under
204the same terms as Perl itself.
205
206=head1 AUTHOR
207
208Tara L Andrews E<lt>aurum@cpan.orgE<gt>